// SPDX-License-Identifier: GPL-2.0-only
/*
 *  kernel/sched/core.c
 *
 *  Core kernel scheduler code and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 */

/* [<linux/...> system header #includes: names missing from this dump] */

#ifdef CONFIG_PREEMPT_DYNAMIC
# ifdef CONFIG_GENERIC_ENTRY
#  include <linux/entry-common.h>
# endif
#endif

/* [<uapi/...> and <asm/...> header #includes: names missing from this dump] */

#define CREATE_TRACE_POINTS
#include <linux/sched/rseq_api.h>
#include <trace/events/sched.h>
#include <trace/events/ipi.h>
#undef CREATE_TRACE_POINTS

#include "sched.h"
#include "stats.h"
#include "autogroup.h"

#include "autogroup.h"
#include "pelt.h"
#include "smp.h"
#include "stats.h"

#include "../workqueue_internal.h"
#include "../../io_uring/io-wq.h"

#include "../smpboot.h"

EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu);
EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask);

/*
 * Export tracepoints that act as a bare tracehook (ie: have no trace event
 * associated with them) to allow external modules to probe them.
 */
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);

DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

#ifdef CONFIG_SCHED_DEBUG
/*
 * Debugging: various feature bits
 *
 * If SCHED_DEBUG is disabled, each compilation unit has its own copy of
 * sysctl_sched_features, defined in sched.h, to allow constants propagation
 * at compile time and compiler optimization based on features default.
 */
#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |
const_debug unsigned int sysctl_sched_features =
#include "features.h"
	0;
#undef SCHED_FEAT
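/*
 * Illustration (not part of the original source): with features.h containing
 * e.g. SCHED_FEAT(PLACE_LAG, true) and SCHED_FEAT(HZ_BW, true), the construct
 * above expands roughly to:
 *
 *	const_debug unsigned int sysctl_sched_features =
 *		(1UL << __SCHED_FEAT_PLACE_LAG) * true |
 *		(1UL << __SCHED_FEAT_HZ_BW) * true |
 *		0;
 *
 * i.e. each SCHED_FEAT() line contributes one bit to the mask, and the
 * trailing 0 terminates the '|' chain left open by the last expansion.
 */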
/*
 * Print a warning if need_resched is set for the given duration (if
 * LATENCY_WARN is enabled).
 *
 * If sysctl_resched_latency_warn_once is set, only one warning will be shown
 * per boot.
 */
__read_mostly int sysctl_resched_latency_warn_ms = 100;
__read_mostly int sysctl_resched_latency_warn_once = 1;
#endif /* CONFIG_SCHED_DEBUG */

/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;

__read_mostly int scheduler_running;

#ifdef CONFIG_SCHED_CORE

DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);

/* kernel prio, less is more */
static inline int __task_prio(const struct task_struct *p)
{
	if (p->sched_class == &stop_sched_class) /* trumps deadline */
		return -2;

	if (rt_prio(p->prio)) /* includes deadline */
		return p->prio; /* [-1, 99] */

	if (p->sched_class == &idle_sched_class)
		return MAX_RT_PRIO + NICE_WIDTH; /* 140 */

	return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */
}

/*
 * l(a,b)
 * le(a,b) := !l(b,a)
 * g(a,b)  := l(b,a)
 * ge(a,b) := !l(a,b)
 */
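/*
 * Illustration (not part of the original source): the resulting kernel-prio
 * scale, where a smaller number means a more important task:
 *
 *	stop_sched_class		-> -2
 *	deadline tasks			-> -1		(p->prio, via rt_prio())
 *	RT tasks			-> 0..99	(p->prio)
 *	fair (SCHED_OTHER/BATCH)	-> MAX_RT_PRIO + MAX_NICE
 *	idle_sched_class		-> MAX_RT_PRIO + NICE_WIDTH (140)
 *
 * prio_less() below compares the *negated* values, so this "less is more"
 * scale turns back into a conventional "higher priority wins" comparison.
 */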
/* real prio, less is less */
static inline bool prio_less(const struct task_struct *a,
			     const struct task_struct *b, bool in_fi)
{
	int pa = __task_prio(a), pb = __task_prio(b);

	if (-pa < -pb)
		return true;

	if (-pb < -pa)
		return false;

	if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
		return !dl_time_before(a->dl.deadline, b->dl.deadline);

	if (pa == MAX_RT_PRIO + MAX_NICE)	/* fair */
		return cfs_prio_less(a, b, in_fi);

	return false;
}

static inline bool __sched_core_less(const struct task_struct *a,
				     const struct task_struct *b)
{
	if (a->core_cookie < b->core_cookie)
		return true;

	if (a->core_cookie > b->core_cookie)
		return false;

	/* flip prio, so high prio is leftmost */
	if (prio_less(b, a, !!task_rq(a)->core->core_forceidle_count))
		return true;

	return false;
}

#define __node_2_sc(node) rb_entry((node), struct task_struct, core_node)

static inline bool rb_sched_core_less(struct rb_node *a, const struct rb_node *b)
{
	return __sched_core_less(__node_2_sc(a), __node_2_sc(b));
}

static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node)
{
	const struct task_struct *p = __node_2_sc(node);
	unsigned long cookie = (unsigned long)key;

	if (cookie < p->core_cookie)
		return -1;

	if (cookie > p->core_cookie)
		return 1;

	return 0;
}

void sched_core_enqueue(struct rq *rq, struct task_struct *p)
{
	rq->core->core_task_seq++;

	if (!p->core_cookie)
		return;

	rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
}

void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
{
	rq->core->core_task_seq++;

	if (sched_core_enqueued(p)) {
		rb_erase(&p->core_node, &rq->core_tree);
		RB_CLEAR_NODE(&p->core_node);
	}

	/*
	 * Migrating the last task off the cpu, with the cpu in forced idle
	 * state. Reschedule to create an accounting edge for forced idle,
	 * and re-examine whether the core is still in forced idle state.
	 */
	if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
	    rq->core->core_forceidle_count && rq->curr == rq->idle)
		resched_curr(rq);
}
static int sched_task_is_throttled(struct task_struct *p, int cpu)
{
	if (p->sched_class->task_is_throttled)
		return p->sched_class->task_is_throttled(p, cpu);

	return 0;
}

static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie)
{
	struct rb_node *node = &p->core_node;
	int cpu = task_cpu(p);

	do {
		node = rb_next(node);
		if (!node)
			return NULL;

		p = __node_2_sc(node);
		if (p->core_cookie != cookie)
			return NULL;

	} while (sched_task_is_throttled(p, cpu));

	return p;
}

/*
 * Find left-most (aka, highest priority) and unthrottled task matching @cookie.
 * If no suitable task is found, NULL will be returned.
 */
static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
{
	struct task_struct *p;
	struct rb_node *node;

	node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
	if (!node)
		return NULL;

	p = __node_2_sc(node);
	if (!sched_task_is_throttled(p, rq->cpu))
		return p;

	return sched_core_next(p, cookie);
}
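/*
 * Usage sketch (illustrative only, not part of the original source): during
 * core-wide task selection each SMT sibling's runqueue is asked for the best
 * runnable task carrying the cookie chosen for the core, e.g.:
 *
 *	struct task_struct *p = sched_core_find(rq, cookie);
 *
 * which yields the left-most (highest priority, per __sched_core_less())
 * unthrottled task with ->core_cookie == cookie, or NULL if none exists.
 */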
/*
 * Magic required such that:
 *
 *	raw_spin_rq_lock(rq);
 *	...
 *	raw_spin_rq_unlock(rq);
 *
 * ends up locking and unlocking the _same_ lock, and all CPUs
 * always agree on what rq has what lock.
 *
 * XXX entirely possible to selectively enable cores, don't bother for now.
 */

static DEFINE_MUTEX(sched_core_mutex);
static atomic_t sched_core_count;
static struct cpumask sched_core_mask;

static void sched_core_lock(int cpu, unsigned long *flags)
{
	const struct cpumask *smt_mask = cpu_smt_mask(cpu);
	int t, i = 0;

	local_irq_save(*flags);
	for_each_cpu(t, smt_mask)
		raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
}

static void sched_core_unlock(int cpu, unsigned long *flags)
{
	const struct cpumask *smt_mask = cpu_smt_mask(cpu);
	int t;

	for_each_cpu(t, smt_mask)
		raw_spin_unlock(&cpu_rq(t)->__lock);
	local_irq_restore(*flags);
}

static void __sched_core_flip(bool enabled)
{
	unsigned long flags;
	int cpu, t;

	cpus_read_lock();

	/*
	 * Toggle the online cores, one by one.
	 */
	cpumask_copy(&sched_core_mask, cpu_online_mask);
	for_each_cpu(cpu, &sched_core_mask) {
		const struct cpumask *smt_mask = cpu_smt_mask(cpu);

		sched_core_lock(cpu, &flags);

		for_each_cpu(t, smt_mask)
			cpu_rq(t)->core_enabled = enabled;

		cpu_rq(cpu)->core->core_forceidle_start = 0;

		sched_core_unlock(cpu, &flags);

		cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
	}

	/*
	 * Toggle the offline CPUs.
	 */
	for_each_cpu_andnot(cpu, cpu_possible_mask, cpu_online_mask)
		cpu_rq(cpu)->core_enabled = enabled;

	cpus_read_unlock();
}

static void sched_core_assert_empty(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		WARN_ON_ONCE(!RB_EMPTY_ROOT(&cpu_rq(cpu)->core_tree));
}

static void __sched_core_enable(void)
{
	static_branch_enable(&__sched_core_enabled);
	/*
	 * Ensure all previous instances of raw_spin_rq_*lock() have finished
	 * and future ones will observe !sched_core_disabled().
	 */
	synchronize_rcu();
	__sched_core_flip(true);
	sched_core_assert_empty();
}

static void __sched_core_disable(void)
{
	sched_core_assert_empty();
	__sched_core_flip(false);
	static_branch_disable(&__sched_core_enabled);
}

void sched_core_get(void)
{
	if (atomic_inc_not_zero(&sched_core_count))
		return;

	mutex_lock(&sched_core_mutex);
	if (!atomic_read(&sched_core_count))
		__sched_core_enable();

	smp_mb__before_atomic();
	atomic_inc(&sched_core_count);
	mutex_unlock(&sched_core_mutex);
}

static void __sched_core_put(struct work_struct *work)
{
	if (atomic_dec_and_mutex_lock(&sched_core_count, &sched_core_mutex)) {
		__sched_core_disable();
		mutex_unlock(&sched_core_mutex);
	}
}

void sched_core_put(void)
{
	static DECLARE_WORK(_work, __sched_core_put);

	/*
	 * "There can be only one"
	 *
	 * Either this is the last one, or we don't actually need to do any
	 * 'work'. If it is the last *again*, we rely on
	 * WORK_STRUCT_PENDING_BIT.
	 */
	if (!atomic_add_unless(&sched_core_count, -1, 1))
		schedule_work(&_work);
}
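/*
 * Usage sketch (illustrative only, not part of the original source): users of
 * core scheduling, e.g. the prctl(PR_SCHED_CORE) cookie code, pair these as a
 * reference count on the core-scheduling machinery:
 *
 *	sched_core_get();	// first user flips the static key on
 *	...			// tasks now carry non-zero core cookies
 *	sched_core_put();	// last user queues the disable via workqueue
 */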
#else /* !CONFIG_SCHED_CORE */

static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
static inline void
sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }

#endif /* CONFIG_SCHED_CORE */

/*
 * Serialization rules:
 *
 * Lock order:
 *
 *   p->pi_lock
 *     rq->lock
 *       hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
 *
 *  rq1->lock
 *    rq2->lock  where: rq1 < rq2
 *
 * Regular state:
 *
 * Normal scheduling state is serialized by rq->lock. __schedule() takes the
 * local CPU's rq->lock, it optionally removes the task from the runqueue and
 * always looks at the local rq data structures to find the most eligible task
 * to run next.
 *
 * Task enqueue is also under rq->lock, possibly taken from another CPU.
 * Wakeups from another LLC domain might use an IPI to transfer the enqueue to
 * the local CPU to avoid bouncing the runqueue state around [ see
 * ttwu_queue_wakelist() ]
 *
 * Task wakeup, specifically wakeups that involve migration, are horribly
 * complicated to avoid having to take two rq->locks.
 *
 * Special state:
 *
 * System-calls and anything external will use task_rq_lock() which acquires
 * both p->pi_lock and rq->lock. As a consequence the state they change is
 * stable while holding either lock:
 *
 *  - sched_setaffinity()/
 *    set_cpus_allowed_ptr():	p->cpus_ptr, p->nr_cpus_allowed
 *  - set_user_nice():		p->se.load, p->*prio
 *  - __sched_setscheduler():	p->sched_class, p->policy, p->*prio,
 *				p->se.load, p->rt_priority,
 *				p->dl.dl_{runtime, deadline, period, flags, bw, density}
 *  - sched_setnuma():		p->numa_preferred_nid
 *  - sched_move_task():	p->sched_task_group
 *  - uclamp_update_active()	p->uclamp*
 *
 * p->state <- TASK_*:
 *
 *   is changed locklessly using set_current_state(), __set_current_state() or
 *   set_special_state(), see their respective comments, or by
 *   try_to_wake_up(). This latter uses p->pi_lock to serialize against
 *   concurrent self.
 *
 * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
 *
 *   is set by activate_task() and cleared by deactivate_task(), under
 *   rq->lock. Non-zero indicates the task is runnable, the special
 *   ON_RQ_MIGRATING state is used for migration without holding both
 *   rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
 *
 * p->on_cpu <- { 0, 1 }:
 *
 *   is set by prepare_task() and cleared by finish_task() such that it will be
 *   set before p is scheduled-in and cleared after p is scheduled-out, both
 *   under rq->lock. Non-zero indicates the task is running on its CPU.
 *
 *   [ The astute reader will observe that it is possible for two tasks on one
 *     CPU to have ->on_cpu = 1 at the same time. ]
 *
 * task_cpu(p): is changed by set_task_cpu(), the rules are:
 *
 *  - Don't call set_task_cpu() on a blocked task:
 *
 *    We don't care what CPU we're not running on, this simplifies hotplug,
 *    the CPU assignment of blocked tasks isn't required to be valid.
 *
 *  - for try_to_wake_up(), called under p->pi_lock:
 *
 *    This allows try_to_wake_up() to only take one rq->lock, see its comment.
 *
 *  - for migration called under rq->lock:
 *    [ see task_on_rq_migrating() in task_rq_lock() ]
 *
 *    o move_queued_task()
 *    o detach_task()
 *
 *  - for migration called under double_rq_lock():
 *
 *    o __migrate_swap_task()
 *    o push_rt_task() / pull_rt_task()
 *    o push_dl_task() / pull_dl_task()
 *    o dl_task_offline_migration()
 *
 */
void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
{
	raw_spinlock_t *lock;

	/* Matches synchronize_rcu() in __sched_core_enable() */
	preempt_disable();
	if (sched_core_disabled()) {
		raw_spin_lock_nested(&rq->__lock, subclass);
		/* preempt_count *MUST* be > 1 */
		preempt_enable_no_resched();
		return;
	}

	for (;;) {
		lock = __rq_lockp(rq);
		raw_spin_lock_nested(lock, subclass);
		if (likely(lock == __rq_lockp(rq))) {
			/* preempt_count *MUST* be > 1 */
			preempt_enable_no_resched();
			return;
		}
		raw_spin_unlock(lock);
	}
}

bool raw_spin_rq_trylock(struct rq *rq)
{
	raw_spinlock_t *lock;
	bool ret;

	/* Matches synchronize_rcu() in __sched_core_enable() */
	preempt_disable();
	if (sched_core_disabled()) {
		ret = raw_spin_trylock(&rq->__lock);
		preempt_enable();
		return ret;
	}

	for (;;) {
		lock = __rq_lockp(rq);
		ret = raw_spin_trylock(lock);
		if (!ret || (likely(lock == __rq_lockp(rq)))) {
			preempt_enable();
			return ret;
		}
		raw_spin_unlock(lock);
	}
}

void raw_spin_rq_unlock(struct rq *rq)
{
	raw_spin_unlock(rq_lockp(rq));
}

#ifdef CONFIG_SMP
/*
 * double_rq_lock - safely lock two runqueues
 */
void double_rq_lock(struct rq *rq1, struct rq *rq2)
{
	lockdep_assert_irqs_disabled();

	if (rq_order_less(rq2, rq1))
		swap(rq1, rq2);

	raw_spin_rq_lock(rq1);
	if (__rq_lockp(rq1) != __rq_lockp(rq2))
		raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);

	double_rq_clock_clear_update(rq1, rq2);
}
#endif

/*
 * __task_rq_lock - lock the rq @p resides on.
 */
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
	__acquires(rq->lock)
{
	struct rq *rq;

	lockdep_assert_held(&p->pi_lock);

	for (;;) {
		rq = task_rq(p);
		raw_spin_rq_lock(rq);
		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
			rq_pin_lock(rq, rf);
			return rq;
		}
		raw_spin_rq_unlock(rq);

		while (unlikely(task_on_rq_migrating(p)))
			cpu_relax();
	}
}

/*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 */
struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
	__acquires(p->pi_lock)
	__acquires(rq->lock)
{
	struct rq *rq;

	for (;;) {
		raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
		rq = task_rq(p);
		raw_spin_rq_lock(rq);
		/*
		 *	move_queued_task()		task_rq_lock()
		 *
		 *	ACQUIRE (rq->lock)
		 *	[S] ->on_rq = MIGRATING		[L] rq = task_rq()
		 *	WMB (__set_task_cpu())		ACQUIRE (rq->lock);
		 *	[S] ->cpu = new_cpu		[L] task_rq()
		 *					[L] ->on_rq
		 *	RELEASE (rq->lock)
		 *
		 * If we observe the old CPU in task_rq_lock(), the acquire of
		 * the old rq->lock will fully serialize against the stores.
		 *
		 * If we observe the new CPU in task_rq_lock(), the address
		 * dependency headed by '[L] rq = task_rq()' and the acquire
		 * will pair with the WMB to ensure we then also see migrating.
		 */
		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
			rq_pin_lock(rq, rf);
			return rq;
		}
		raw_spin_rq_unlock(rq);
		raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);

		while (unlikely(task_on_rq_migrating(p)))
			cpu_relax();
	}
}
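/*
 * Usage sketch (illustrative only, not part of the original source): external
 * callers typically bracket state changes with task_rq_lock()/task_rq_unlock()
 * so that both p->pi_lock and the (possibly changing) rq->lock are held:
 *
 *	struct rq_flags rf;
 *	struct rq *rq;
 *
 *	rq = task_rq_lock(p, &rf);
 *	update_rq_clock(rq);
 *	...			// p's scheduling state is stable here
 *	task_rq_unlock(rq, p, &rf);
 */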
/*
 * RQ-clock updating methods:
 */

static void update_rq_clock_task(struct rq *rq, s64 delta)
{
/*
 * In theory, the compile should just see 0 here, and optimize out the call
 * to sched_rt_avg_update. But I don't trust it...
 */
	s64 __maybe_unused steal = 0, irq_delta = 0;

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

	/*
	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
	 * this case when a previous update_rq_clock() happened inside a
	 * {soft,}irq region.
	 *
	 * When this happens, we stop ->clock_task and only update the
	 * prev_irq_time stamp to account for the part that fit, so that a next
	 * update will consume the rest. This ensures ->clock_task is
	 * monotonic.
	 *
	 * It does however cause some slight miss-attribution of {soft,}irq
	 * time, a more accurate solution would be to update the irq_time using
	 * the current rq->clock timestamp, except that would require using
	 * atomic ops.
	 */
	if (irq_delta > delta)
		irq_delta = delta;

	rq->prev_irq_time += irq_delta;
	delta -= irq_delta;
	psi_account_irqtime(rq->curr, irq_delta);
	delayacct_irq(rq->curr, irq_delta);
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
		steal = paravirt_steal_clock(cpu_of(rq));
		steal -= rq->prev_steal_time_rq;

		if (unlikely(steal > delta))
			steal = delta;

		rq->prev_steal_time_rq += steal;
		delta -= steal;
	}
#endif

	rq->clock_task += delta;

#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
		update_irq_load_avg(rq, irq_delta + steal);
#endif
	update_rq_clock_pelt(rq, delta);
}

void update_rq_clock(struct rq *rq)
{
	s64 delta;

	lockdep_assert_rq_held(rq);

	if (rq->clock_update_flags & RQCF_ACT_SKIP)
		return;

#ifdef CONFIG_SCHED_DEBUG
	if (sched_feat(WARN_DOUBLE_CLOCK))
		SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
	rq->clock_update_flags |= RQCF_UPDATED;
#endif

	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
	if (delta < 0)
		return;
	rq->clock += delta;
	update_rq_clock_task(rq, delta);
}

#ifdef CONFIG_SCHED_HRTICK
/*
 * Use HR-timers to deliver accurate preemption points.
 */

static void hrtick_clear(struct rq *rq)
{
	if (hrtimer_active(&rq->hrtick_timer))
		hrtimer_cancel(&rq->hrtick_timer);
}

/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 */
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
	struct rq *rq = container_of(timer, struct rq, hrtick_timer);
	struct rq_flags rf;

	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

	rq_lock(rq, &rf);
	update_rq_clock(rq);
	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
	rq_unlock(rq, &rf);

	return HRTIMER_NORESTART;
}

#ifdef CONFIG_SMP

static void __hrtick_restart(struct rq *rq)
{
	struct hrtimer *timer = &rq->hrtick_timer;
	ktime_t time = rq->hrtick_time;

	hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
}

/*
 * called from hardirq (IPI) context
 */
static void __hrtick_start(void *arg)
{
	struct rq *rq = arg;
	struct rq_flags rf;

	rq_lock(rq, &rf);
	__hrtick_restart(rq);
	rq_unlock(rq, &rf);
}

/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	struct hrtimer *timer = &rq->hrtick_timer;
	s64 delta;

	/*
	 * Don't schedule slices shorter than 10000ns, that just
	 * doesn't make sense and can cause timer DoS.
	 */
	delta = max_t(s64, delay, 10000LL);
	rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);

	if (rq == this_rq())
		__hrtick_restart(rq);
	else
		smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
}

#else
/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	/*
	 * Don't schedule slices shorter than 10000ns, that just
	 * doesn't make sense. Rely on vruntime for fairness.
	 */
	delay = max_t(u64, delay, 10000LL);
	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
		      HRTIMER_MODE_REL_PINNED_HARD);
}

#endif /* CONFIG_SMP */

static void hrtick_rq_init(struct rq *rq)
{
#ifdef CONFIG_SMP
	INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
#endif
	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
	rq->hrtick_timer.function = hrtick;
}
#else	/* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
}

static inline void hrtick_rq_init(struct rq *rq)
{
}
#endif	/* CONFIG_SCHED_HRTICK */

/*
 * cmpxchg based fetch_or, macro so it works for different integer types
 */
#define fetch_or(ptr, mask)						\
	({								\
		typeof(ptr) _ptr = (ptr);				\
		typeof(mask) _mask = (mask);				\
		typeof(*_ptr) _val = *_ptr;				\
									\
		do {							\
		} while (!try_cmpxchg(_ptr, &_val, _val | _mask));	\
	_val;								\
})

#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
/*
 * Atomically set TIF_NEED_RESCHED[_LAZY] and test for TIF_POLLING_NRFLAG,
 * this avoids any races wrt polling state changes and thereby avoids
 * spurious IPIs.
 */
static inline bool set_nr_and_not_polling(struct task_struct *p, int tif_bit)
{
	struct thread_info *ti = task_thread_info(p);

	return !(fetch_or(&ti->flags, 1 << tif_bit) & _TIF_POLLING_NRFLAG);
}
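/*
 * Illustration (not part of the original source): fetch_or() behaves like an
 * atomic "return the old value, then OR in the mask" on the thread flags, so
 * the helper above both sets the resched bit and learns whether the remote
 * CPU was polling in a single atomic operation:
 *
 *	old = fetch_or(&ti->flags, 1 << tif_bit);
 *	if (old & _TIF_POLLING_NRFLAG)
 *		// the polling idle loop will notice the new bit, no IPI needed
 */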
/*
 * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
 *
 * If this returns true, then the idle task promises to call
 * sched_ttwu_pending() and reschedule soon.
 */
static bool set_nr_if_polling(struct task_struct *p)
{
	struct thread_info *ti = task_thread_info(p);
	typeof(ti->flags) val = READ_ONCE(ti->flags);

	for (;;) {
		if (!(val & _TIF_POLLING_NRFLAG))
			return false;
		if (val & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
			return true;
		if (try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED))
			break;
	}
	return true;
}

#else
static inline bool set_nr_and_not_polling(struct task_struct *p, int tif_bit)
{
	set_tsk_thread_flag(p, tif_bit);
	return true;
}

#ifdef CONFIG_SMP
static inline bool set_nr_if_polling(struct task_struct *p)
{
	return false;
}
#endif
#endif

static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
	struct wake_q_node *node = &task->wake_q;

	/*
	 * Atomically grab the task, if ->wake_q is !nil already it means
	 * it's already queued (either by us or someone else) and will get the
	 * wakeup due to that.
	 *
	 * In order to ensure that a pending wakeup will observe our pending
	 * state, even in the failed case, an explicit smp_mb() must be used.
	 */
	smp_mb__before_atomic();
	if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
		return false;

	/*
	 * The head is context local, there can be no concurrency.
	 */
	*head->lastp = node;
	head->lastp = &node->next;
	return true;
}

/**
 * wake_q_add() - queue a wakeup for 'later' waking.
 * @head: the wake_q_head to add @task to
 * @task: the task to queue for 'later' wakeup
 *
 * Queue a task for later wakeup, most likely by the wake_up_q() call in the
 * same context, _HOWEVER_ this is not guaranteed, the wakeup may come
 * instantly.
 *
 * This function must be used as-if it were wake_up_process(); IOW the task
 * must be ready to be woken at this location.
 */
void wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
	if (__wake_q_add(head, task))
		get_task_struct(task);
}

/**
 * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
 * @head: the wake_q_head to add @task to
 * @task: the task to queue for 'later' wakeup
 *
 * Queue a task for later wakeup, most likely by the wake_up_q() call in the
 * same context, _HOWEVER_ this is not guaranteed, the wakeup may come
 * instantly.
 *
 * This function must be used as-if it were wake_up_process(); IOW the task
 * must be ready to be woken at this location.
 *
 * This function is essentially a task-safe equivalent to wake_q_add(). Callers
 * that already hold reference to @task can call the 'safe' version and trust
 * wake_q to do the right thing depending whether or not the @task is already
 * queued for wakeup.
 */
void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
{
	if (!__wake_q_add(head, task))
		put_task_struct(task);
}

void wake_up_q(struct wake_q_head *head)
{
	struct wake_q_node *node = head->first;

	while (node != WAKE_Q_TAIL) {
		struct task_struct *task;

		task = container_of(node, struct task_struct, wake_q);
		/* Task can safely be re-inserted now: */
		node = node->next;
		task->wake_q.next = NULL;

		/*
		 * wake_up_process() executes a full barrier, which pairs with
		 * the queueing in wake_q_add() so as not to miss wakeups.
		 */
		wake_up_process(task);
		put_task_struct(task);
	}
}
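/*
 * Usage sketch (illustrative only, not part of the original source): callers
 * batch wakeups while holding a lock and issue them after dropping it:
 *
 *	DEFINE_WAKE_Q(wake_q);
 *
 *	raw_spin_lock(&some_lock);
 *	list_for_each_entry(waiter, &waiters, list)
 *		wake_q_add(&wake_q, waiter->task);
 *	raw_spin_unlock(&some_lock);
 *
 *	wake_up_q(&wake_q);
 *
 * 'some_lock', 'waiters' and 'waiter->task' are made-up names; the pattern is
 * what matters: wake_q_add() takes a task reference, wake_up_q() drops it.
 */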
/*
 * resched_curr - mark rq's current task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */
static void __resched_curr(struct rq *rq, int lazy)
{
	int cpu, tif_bit = TIF_NEED_RESCHED + lazy;
	struct task_struct *curr = rq->curr;

	lockdep_assert_rq_held(rq);

	if (unlikely(test_tsk_thread_flag(curr, tif_bit)))
		return;

	cpu = cpu_of(rq);

	if (cpu == smp_processor_id()) {
		set_tsk_thread_flag(curr, tif_bit);
		if (!lazy)
			set_preempt_need_resched();
		return;
	}

	if (set_nr_and_not_polling(curr, tif_bit)) {
		if (!lazy)
			smp_send_reschedule(cpu);
	} else {
		trace_sched_wake_idle_without_ipi(cpu);
	}
}

void resched_curr(struct rq *rq)
{
	__resched_curr(rq, 0);
}

void resched_curr_lazy(struct rq *rq)
{
	int lazy = IS_ENABLED(CONFIG_PREEMPT_BUILD_AUTO) && !sched_feat(FORCE_NEED_RESCHED) ?
		TIF_NEED_RESCHED_LAZY_OFFSET : 0;

	if (lazy && unlikely(test_tsk_thread_flag(rq->curr, TIF_NEED_RESCHED)))
		return;

	__resched_curr(rq, lazy);
}
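/*
 * Note on the 'lazy' arithmetic above (an assumption based on the lazy-preempt
 * thread-flag layout used here): TIF_NEED_RESCHED_LAZY is defined as
 * TIF_NEED_RESCHED + TIF_NEED_RESCHED_LAZY_OFFSET, so lazy == 0 selects the
 * immediate TIF_NEED_RESCHED bit while lazy == TIF_NEED_RESCHED_LAZY_OFFSET
 * selects the lazy bit. Only the non-lazy case sets the preempt-need-resched
 * folding or sends an IPI; the lazy bit is expected to be acted upon at the
 * next natural scheduling point instead of forcing an immediate preemption.
 */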
void resched_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	raw_spin_rq_lock_irqsave(rq, flags);
	if (cpu_online(cpu) || cpu == smp_processor_id())
		resched_curr(rq);
	raw_spin_rq_unlock_irqrestore(rq, flags);
}

#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ_COMMON
/*
 * In the semi idle case, use the nearest busy CPU for migrating timers
 * from an idle CPU. This is good for power-savings.
 *
 * We don't do similar optimization for completely idle system, as
 * selecting an idle CPU will add more delays to the timers than intended
 * (as that CPU's timer base may not be uptodate wrt jiffies etc).
 */
int get_nohz_timer_target(void)
{
	int i, cpu = smp_processor_id(), default_cpu = -1;
	struct sched_domain *sd;
	const struct cpumask *hk_mask;

	if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) {
		if (!idle_cpu(cpu))
			return cpu;
		default_cpu = cpu;
	}

	hk_mask = housekeeping_cpumask(HK_TYPE_TIMER);

	guard(rcu)();

	for_each_domain(cpu, sd) {
		for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {
			if (cpu == i)
				continue;

			if (!idle_cpu(i))
				return i;
		}
	}

	if (default_cpu == -1)
		default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER);

	return default_cpu;
}

/*
 * When add_timer_on() enqueues a timer into the timer wheel of an
 * idle CPU then this timer might expire before the next timer event
 * which is scheduled to wake up that CPU. In case of a completely
 * idle system the next event might even be infinite time into the
 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
 * leaves the inner idle loop so the newly added timer is taken into
 * account when the CPU goes back to idle and evaluates the timer
 * wheel for the next timer event.
 */
static void wake_up_idle_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	if (cpu == smp_processor_id())
		return;

	if (set_nr_and_not_polling(rq->idle, TIF_NEED_RESCHED))
		smp_send_reschedule(cpu);
	else
		trace_sched_wake_idle_without_ipi(cpu);
}

static bool wake_up_full_nohz_cpu(int cpu)
{
	/*
	 * We just need the target to call irq_exit() and re-evaluate
	 * the next tick. The nohz full kick at least implies that.
	 * If needed we can still optimize that later with an
	 * empty IRQ.
	 */
	if (cpu_is_offline(cpu))
		return true;  /* Don't try to wake offline CPUs. */
	if (tick_nohz_full_cpu(cpu)) {
		if (cpu != smp_processor_id() ||
		    tick_nohz_tick_stopped())
			tick_nohz_full_kick_cpu(cpu);
		return true;
	}

	return false;
}

/*
 * Wake up the specified CPU.  If the CPU is going offline, it is the
 * caller's responsibility to deal with the lost wakeup, for example,
 * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
 */
void wake_up_nohz_cpu(int cpu)
{
	if (!wake_up_full_nohz_cpu(cpu))
		wake_up_idle_cpu(cpu);
}

static void nohz_csd_func(void *info)
{
	struct rq *rq = info;
	int cpu = cpu_of(rq);
	unsigned int flags;

	/*
	 * Release the rq::nohz_csd.
	 */
	flags = atomic_fetch_andnot(NOHZ_KICK_MASK | NOHZ_NEWILB_KICK, nohz_flags(cpu));
	WARN_ON(!(flags & NOHZ_KICK_MASK));

	rq->idle_balance = idle_cpu(cpu);
	if (rq->idle_balance && !need_resched()) {
		rq->nohz_idle_balance = flags;
		raise_softirq_irqoff(SCHED_SOFTIRQ);
	}
}

#endif /* CONFIG_NO_HZ_COMMON */

#ifdef CONFIG_NO_HZ_FULL
static inline bool __need_bw_check(struct rq *rq, struct task_struct *p)
{
	if (rq->nr_running != 1)
		return false;

	if (p->sched_class != &fair_sched_class)
		return false;

	if (!task_on_rq_queued(p))
		return false;

	return true;
}

bool sched_can_stop_tick(struct rq *rq)
{
	int fifo_nr_running;

	/* Deadline tasks, even if single, need the tick */
	if (rq->dl.dl_nr_running)
		return false;

	/*
	 * If there are more than one RR tasks, we need the tick to affect the
	 * actual RR behaviour.
	 */
	if (rq->rt.rr_nr_running) {
		if (rq->rt.rr_nr_running == 1)
			return true;
		else
			return false;
	}

	/*
	 * If there's no RR tasks, but FIFO tasks, we can skip the tick, no
	 * forced preemption between FIFO tasks.
	 */
	fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
	if (fifo_nr_running)
		return true;

	/*
	 * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
	 * if there's more than one we need the tick for involuntary
	 * preemption.
	 */
	if (rq->nr_running > 1)
		return false;

	/*
	 * If there is one task and it has CFS runtime bandwidth constraints
	 * and it's on the cpu now we don't want to stop the tick.
	 * This check prevents clearing the bit if a newly enqueued task here is
	 * dequeued by migrating while the constrained task continues to run.
	 * E.g. going from 2->1 without going through pick_next_task().
	 */
	if (sched_feat(HZ_BW) && __need_bw_check(rq, rq->curr)) {
		if (cfs_task_bw_constrained(rq->curr))
			return false;
	}

	return true;
}
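/*
 * Worked example (illustrative, not part of the original source) of what
 * sched_can_stop_tick() returns for a CPU whose runqueue holds:
 *
 *	one SCHED_DEADLINE task		-> false (DL always needs the tick)
 *	two SCHED_RR tasks		-> false (tick drives RR time slicing)
 *	one SCHED_FIFO task		-> true  (fifo_nr_running != 0)
 *	two SCHED_OTHER tasks		-> false (nr_running > 1)
 *	one bandwidth-limited CFS task	-> false when sched_feat(HZ_BW) is set
 */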
#endif /* CONFIG_NO_HZ_FULL */
#endif /* CONFIG_SMP */

#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
/*
 * Iterate task_group tree rooted at *from, calling @down when first entering a
 * node and @up when leaving it for the final time.
 *
 * Caller must hold rcu_lock or sufficient equivalent.
 */
int walk_tg_tree_from(struct task_group *from,
		      tg_visitor down, tg_visitor up, void *data)
{
	struct task_group *parent, *child;
	int ret;

	parent = from;

down:
	ret = (*down)(parent, data);
	if (ret)
		goto out;
	list_for_each_entry_rcu(child, &parent->children, siblings) {
		parent = child;
		goto down;

up:
		continue;
	}
	ret = (*up)(parent, data);
	if (ret || parent == from)
		goto out;

	child = parent;
	parent = parent->parent;
	if (parent)
		goto up;
out:
	return ret;
}

int tg_nop(struct task_group *tg, void *data)
{
	return 0;
}
#endif

static void set_load_weight(struct task_struct *p, bool update_load)
{
	int prio = p->static_prio - MAX_RT_PRIO;
	struct load_weight *load = &p->se.load;

	/*
	 * SCHED_IDLE tasks get minimal weight:
	 */
	if (task_has_idle_policy(p)) {
		load->weight = scale_load(WEIGHT_IDLEPRIO);
		load->inv_weight = WMULT_IDLEPRIO;
		return;
	}

	/*
	 * SCHED_OTHER tasks have to update their load when changing their
	 * weight
	 */
	if (update_load && p->sched_class == &fair_sched_class) {
		reweight_task(p, prio);
	} else {
		load->weight = scale_load(sched_prio_to_weight[prio]);
		load->inv_weight = sched_prio_to_wmult[prio];
	}
}
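/*
 * Worked example (illustrative, not part of the original source): for
 * SCHED_OTHER, prio indexes sched_prio_to_weight[], where nice 0 maps to 1024
 * (NICE_0_LOAD) and each nice step scales the weight by roughly 1.25x, e.g.:
 *
 *	nice -1 -> 1277
 *	nice  0 -> 1024
 *	nice  1 ->  820
 *
 * chosen so that one nice level shifts the CPU share between two competing
 * tasks by about 10% each way.
 */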

static void set_load_weight(struct task_struct *p, bool update_load)
{
	int prio = p->static_prio - MAX_RT_PRIO;
	struct load_weight *load = &p->se.load;

	/*
	 * SCHED_IDLE tasks get minimal weight:
	 */
	if (task_has_idle_policy(p)) {
		load->weight = scale_load(WEIGHT_IDLEPRIO);
		load->inv_weight = WMULT_IDLEPRIO;
		return;
	}

	/*
	 * SCHED_OTHER tasks have to update their load when changing their
	 * weight
	 */
	if (update_load && p->sched_class == &fair_sched_class) {
		reweight_task(p, prio);
	} else {
		load->weight = scale_load(sched_prio_to_weight[prio]);
		load->inv_weight = sched_prio_to_wmult[prio];
	}
}
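
/*
 * Worked example (assuming the usual sched_prio_to_weight[] table, where a
 * nice-0 task has weight 1024 and adjacent nice levels differ by roughly a
 * 1.25x ratio): a SCHED_OTHER task at nice 0 has static_prio 120, so
 * prio = 120 - MAX_RT_PRIO = 20 and load->weight = scale_load(1024); at
 * nice 1 the index becomes 21 and the weight drops to 820, which is roughly
 * a 10% CPU share difference when competing with a nice-0 task.
 */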

#ifdef CONFIG_UCLAMP_TASK
/*
 * Serializes updates of utilization clamp values
 *
 * The (slow-path) user-space triggers utilization clamp value updates which
 * can require updates on (fast-path) scheduler's data structures used to
 * support enqueue/dequeue operations.
 * While the per-CPU rq lock protects fast-path update operations, user-space
 * requests are serialized using a mutex to reduce the risk of conflicting
 * updates or API abuses.
 */
static DEFINE_MUTEX(uclamp_mutex);

/* Max allowed minimum utilization */
static unsigned int __maybe_unused sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;

/* Max allowed maximum utilization */
static unsigned int __maybe_unused sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;

/*
 * By default RT tasks run at the maximum performance point/capacity of the
 * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
 * SCHED_CAPACITY_SCALE.
 *
 * This knob allows admins to change the default behavior when uclamp is being
 * used. In battery powered devices, particularly, running at the maximum
 * capacity and frequency will increase energy consumption and shorten the
 * battery life.
 *
 * This knob only affects RT tasks whose uclamp_se->user_defined flag is false.
 *
 * This knob will not override the system default sched_util_clamp_min defined
 * above.
 */
static unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;

/* All clamps are required to be less than or equal to these values */
static struct uclamp_se uclamp_default[UCLAMP_CNT];

/*
 * This static key is used to reduce the uclamp overhead in the fast path. It
 * primarily disables the call to uclamp_rq_{inc, dec}() in
 * enqueue/dequeue_task().
 *
 * This allows users to continue to enable uclamp in their kernel config with
 * minimum uclamp overhead in the fast path.
 *
 * As soon as userspace modifies any of the uclamp knobs, the static key is
 * enabled, since there are actual users that make use of uclamp
 * functionality.
 *
 * The knobs that would enable this static key are:
 *
 *   * A task modifying its uclamp value with sched_setattr().
 *   * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs.
 *   * An admin modifying the cgroup cpu.uclamp.{min, max}
 */
DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);

/* Integer rounded range for each bucket */
#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)

#define for_each_clamp_id(clamp_id) \
	for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)

static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
{
	return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
}
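
/*
 * Worked example (assuming SCHED_CAPACITY_SCALE == 1024 and the default
 * CONFIG_UCLAMP_BUCKETS_COUNT of 5): UCLAMP_BUCKET_DELTA is
 * DIV_ROUND_CLOSEST(1024, 5) == 205, so a clamp value of 512 maps to
 * bucket_id 512 / 205 == 2, while 1024 maps to 1024 / 205 == 4, which is
 * already the last bucket (UCLAMP_BUCKETS - 1).
 */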

static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
{
	if (clamp_id == UCLAMP_MIN)
		return 0;
	return SCHED_CAPACITY_SCALE;
}

static inline void uclamp_se_set(struct uclamp_se *uc_se,
				 unsigned int value, bool user_defined)
{
	uc_se->value = value;
	uc_se->bucket_id = uclamp_bucket_id(value);
	uc_se->user_defined = user_defined;
}

static inline unsigned int
uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
		  unsigned int clamp_value)
{
	/*
	 * Avoid blocked utilization pushing up the frequency when we go
	 * idle (which drops the max-clamp) by retaining the last known
	 * max-clamp.
	 */
	if (clamp_id == UCLAMP_MAX) {
		rq->uclamp_flags |= UCLAMP_FLAG_IDLE;
		return clamp_value;
	}

	return uclamp_none(UCLAMP_MIN);
}

static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
				     unsigned int clamp_value)
{
	/* Reset max-clamp retention only on idle exit */
	if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
		return;

	uclamp_rq_set(rq, clamp_id, clamp_value);
}

static inline
unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
				 unsigned int clamp_value)
{
	struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
	int bucket_id = UCLAMP_BUCKETS - 1;

	/*
	 * Since both min and max clamps are max aggregated, find the
	 * top most bucket with tasks in.
	 */
	for ( ; bucket_id >= 0; bucket_id--) {
		if (!bucket[bucket_id].tasks)
			continue;
		return bucket[bucket_id].value;
	}

	/* No tasks -- default clamp values */
	return uclamp_idle_value(rq, clamp_id, clamp_value);
}
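
/*
 * Illustrative example: with UCLAMP_BUCKETS == 5, if only bucket 1 has
 * RUNNABLE tasks and its tracked value is 300, the scan above returns 300;
 * once every bucket is empty it falls back to uclamp_idle_value(), which for
 * UCLAMP_MAX also sets UCLAMP_FLAG_IDLE so the last max-clamp is retained
 * until the next enqueue resets it via uclamp_idle_reset().
 */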

static void __uclamp_update_util_min_rt_default(struct task_struct *p)
{
	unsigned int default_util_min;
	struct uclamp_se *uc_se;

	lockdep_assert_held(&p->pi_lock);

	uc_se = &p->uclamp_req[UCLAMP_MIN];

	/* Only sync if user didn't override the default */
	if (uc_se->user_defined)
		return;

	default_util_min = sysctl_sched_uclamp_util_min_rt_default;
	uclamp_se_set(uc_se, default_util_min, false);
}

static void uclamp_update_util_min_rt_default(struct task_struct *p)
{
	struct rq_flags rf;
	struct rq *rq;

	if (!rt_task(p))
		return;

	/* Protect updates to p->uclamp_* */
	rq = task_rq_lock(p, &rf);
	__uclamp_update_util_min_rt_default(p);
	task_rq_unlock(rq, p, &rf);
}

static inline struct uclamp_se
uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
{
	/* Copy by value as we could modify it */
	struct uclamp_se uc_req = p->uclamp_req[clamp_id];
#ifdef CONFIG_UCLAMP_TASK_GROUP
	unsigned int tg_min, tg_max, value;

	/*
	 * Tasks in autogroups or in the root task group will be
	 * restricted by system defaults.
	 */
	if (task_group_is_autogroup(task_group(p)))
		return uc_req;
	if (task_group(p) == &root_task_group)
		return uc_req;

	tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
	tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
	value = uc_req.value;
	value = clamp(value, tg_min, tg_max);
	uclamp_se_set(&uc_req, value, false);
#endif

	return uc_req;
}

/*
 * The effective clamp bucket index of a task depends on, by increasing
 * priority:
 * - the task specific clamp value, when explicitly requested from userspace
 * - the task group effective clamp value, for tasks neither in the root
 *   group nor in an autogroup
 * - the system default clamp value, defined by the sysadmin
 */
static inline struct uclamp_se
uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
{
	struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
	struct uclamp_se uc_max = uclamp_default[clamp_id];

	/* System default restrictions always apply */
	if (unlikely(uc_req.value > uc_max.value))
		return uc_max;

	return uc_req;
}

unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
{
	struct uclamp_se uc_eff;

	/* Task currently refcounted: use back-annotated (effective) value */
	if (p->uclamp[clamp_id].active)
		return (unsigned long)p->uclamp[clamp_id].value;

	uc_eff = uclamp_eff_get(p, clamp_id);

	return (unsigned long)uc_eff.value;
}
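
/*
 * Worked example of the precedence above (hypothetical values): a task in a
 * non-root cgroup requests UCLAMP_MAX == 800 while the group's effective
 * clamp, task_group(p)->uclamp[UCLAMP_MAX].value, is 512 and the system
 * default is 1024.  uclamp_tg_restrict() clamps the request into the group
 * range, giving 512, and uclamp_eff_get() keeps it since 512 <= 1024; had the
 * sysadmin lowered the system default to 400, the effective value would be
 * capped at 400 instead.
 */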

/*
 * When a task is enqueued on a rq, the clamp bucket currently defined by the
 * task's uclamp::bucket_id is refcounted on that rq. This also immediately
 * updates the rq's clamp value if required.
 *
 * Tasks can have a task-specific value requested from user-space; we track
 * within each bucket the maximum value for the tasks refcounted in it.
 * This "local max aggregation" allows to track the exact "requested" value
 * for each bucket when all its RUNNABLE tasks require the same clamp.
 */
static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
				    enum uclamp_id clamp_id)
{
	struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
	struct uclamp_se *uc_se = &p->uclamp[clamp_id];
	struct uclamp_bucket *bucket;

	lockdep_assert_rq_held(rq);

	/* Update task effective clamp */
	p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);

	bucket = &uc_rq->bucket[uc_se->bucket_id];
	bucket->tasks++;
	uc_se->active = true;

	uclamp_idle_reset(rq, clamp_id, uc_se->value);

	/*
	 * Local max aggregation: rq buckets always track the max
	 * "requested" clamp value of its RUNNABLE tasks.
	 */
	if (bucket->tasks == 1 || uc_se->value > bucket->value)
		bucket->value = uc_se->value;

	if (uc_se->value > uclamp_rq_get(rq, clamp_id))
		uclamp_rq_set(rq, clamp_id, uc_se->value);
}
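
/*
 * Illustrative example of "local max aggregation" (hypothetical values, with
 * UCLAMP_BUCKET_DELTA == 205): two RUNNABLE tasks requesting UCLAMP_MIN of
 * 300 and 380 both land in bucket 1; after both enqueues bucket->tasks == 2
 * and bucket->value == 380, and the rq-wide clamp is raised to 380 if it was
 * previously lower.
 */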

/*
 * When a task is dequeued from a rq, the clamp bucket refcounted by the task
 * is released. If this is the last task reference counting the rq's max
 * active clamp value, then the rq's clamp value is updated.
 *
 * Both refcounted tasks and rq's cached clamp values are expected to be
 * always valid. If it's detected they are not, as defensive programming,
 * enforce the expected state and warn.
 */
static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
				    enum uclamp_id clamp_id)
{
	struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
	struct uclamp_se *uc_se = &p->uclamp[clamp_id];
	struct uclamp_bucket *bucket;
	unsigned int bkt_clamp;
	unsigned int rq_clamp;

	lockdep_assert_rq_held(rq);

	/*
	 * If sched_uclamp_used was enabled after task @p was enqueued,
	 * we could end up with unbalanced call to uclamp_rq_dec_id().
	 *
	 * In this case the uc_se->active flag should be false since no uclamp
	 * accounting was performed at enqueue time and we can just return
	 * here.
	 *
	 * Need to be careful of the following enqueue/dequeue ordering
	 * problem too
	 *
	 *	enqueue(taskA)
	 *	// sched_uclamp_used gets enabled
	 *	enqueue(taskB)
	 *	dequeue(taskA)
	 *	// Must not decrement bucket->tasks here
	 *	dequeue(taskB)
	 *
	 * where we could end up with stale data in uc_se and
	 * bucket[uc_se->bucket_id].
	 *
	 * The following check here eliminates the possibility of such race.
	 */
	if (unlikely(!uc_se->active))
		return;

	bucket = &uc_rq->bucket[uc_se->bucket_id];

	SCHED_WARN_ON(!bucket->tasks);
	if (likely(bucket->tasks))
		bucket->tasks--;

	uc_se->active = false;

	/*
	 * Keep "local max aggregation" simple and accept to (possibly)
	 * overboost some RUNNABLE tasks in the same bucket.
	 * The rq clamp bucket value is reset to its base value whenever
	 * there are no more RUNNABLE tasks refcounting it.
	 */
	if (likely(bucket->tasks))
		return;

	rq_clamp = uclamp_rq_get(rq, clamp_id);
	/*
	 * Defensive programming: this should never happen. If it happens,
	 * e.g. due to future modification, warn and fixup the expected value.
	 */
	SCHED_WARN_ON(bucket->value > rq_clamp);
	if (bucket->value >= rq_clamp) {
		bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
		uclamp_rq_set(rq, clamp_id, bkt_clamp);
	}
}
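
/*
 * Illustrative example of the accepted "overboost" (hypothetical values):
 * with tasks requesting 300 and 200 in the same bucket, bucket->value is 300;
 * if the 300 task is dequeued first the bucket still holds the 200 task, so
 * the early return above keeps bucket->value at 300 and the remaining task
 * may be boosted slightly above its request until the bucket empties.
 */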

static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
{
	enum uclamp_id clamp_id;

	/*
	 * Avoid any overhead until uclamp is actually used by the userspace.
	 *
	 * The condition is constructed such that a NOP is generated when
	 * sched_uclamp_used is disabled.
	 */
	if (!static_branch_unlikely(&sched_uclamp_used))
		return;

	if (unlikely(!p->sched_class->uclamp_enabled))
		return;

	for_each_clamp_id(clamp_id)
		uclamp_rq_inc_id(rq, p, clamp_id);

	/* Reset clamp idle holding when there is one RUNNABLE task */
	if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
		rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
}

static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
{
	enum uclamp_id clamp_id;

	/*
	 * Avoid any overhead until uclamp is actually used by the userspace.
	 *
	 * The condition is constructed such that a NOP is generated when
	 * sched_uclamp_used is disabled.
	 */
	if (!static_branch_unlikely(&sched_uclamp_used))
		return;

	if (unlikely(!p->sched_class->uclamp_enabled))
		return;

	for_each_clamp_id(clamp_id)
		uclamp_rq_dec_id(rq, p, clamp_id);
}

static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
				      enum uclamp_id clamp_id)
{
	if (!p->uclamp[clamp_id].active)
		return;

	uclamp_rq_dec_id(rq, p, clamp_id);
	uclamp_rq_inc_id(rq, p, clamp_id);

	/*
	 * Make sure to clear the idle flag if we've transiently reached 0
	 * active tasks on rq.
*/ */ if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCL if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCL rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE; rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE; } } static inline void static inline void uclamp_update_active(struct task_struct *p) uclamp_update_active(struct task_struct *p) { { enum uclamp_id clamp_id; enum uclamp_id clamp_id; struct rq_flags rf; struct rq_flags rf; struct rq *rq; struct rq *rq; /* /* * Lock the task and the rq where the task is (or was * Lock the task and the rq where the task is (or was * * * We might lock the (previous) rq of a !RUNNABLE tas * We might lock the (previous) rq of a !RUNNABLE tas * price to pay to safely serialize util_{min,max} up * price to pay to safely serialize util_{min,max} up * enqueues, dequeues and migration operations. * enqueues, dequeues and migration operations. * This is the same locking schema used by __set_cpus * This is the same locking schema used by __set_cpus */ */ rq = task_rq_lock(p, &rf); rq = task_rq_lock(p, &rf); /* /* * Setting the clamp bucket is serialized by task_rq_ * Setting the clamp bucket is serialized by task_rq_ * If the task is not yet RUNNABLE and its task_struc * If the task is not yet RUNNABLE and its task_struc * affecting a valid clamp bucket, the next time it's * affecting a valid clamp bucket, the next time it's * it will already see the updated clamp bucket value * it will already see the updated clamp bucket value */ */ for_each_clamp_id(clamp_id) for_each_clamp_id(clamp_id) uclamp_rq_reinc_id(rq, p, clamp_id); uclamp_rq_reinc_id(rq, p, clamp_id); task_rq_unlock(rq, p, &rf); task_rq_unlock(rq, p, &rf); } } #ifdef CONFIG_UCLAMP_TASK_GROUP #ifdef CONFIG_UCLAMP_TASK_GROUP static inline void static inline void uclamp_update_active_tasks(struct cgroup_subsys_state *css) uclamp_update_active_tasks(struct cgroup_subsys_state *css) { { struct css_task_iter it; struct css_task_iter it; struct task_struct *p; struct task_struct *p; css_task_iter_start(css, 0, &it); css_task_iter_start(css, 0, &it); while ((p = css_task_iter_next(&it))) while ((p = css_task_iter_next(&it))) uclamp_update_active(p); uclamp_update_active(p); css_task_iter_end(&it); css_task_iter_end(&it); } } static void cpu_util_update_eff(struct cgroup_subsys_state *c static void cpu_util_update_eff(struct cgroup_subsys_state *c #endif #endif #ifdef CONFIG_SYSCTL #ifdef CONFIG_SYSCTL #ifdef CONFIG_UCLAMP_TASK #ifdef CONFIG_UCLAMP_TASK #ifdef CONFIG_UCLAMP_TASK_GROUP #ifdef CONFIG_UCLAMP_TASK_GROUP static void uclamp_update_root_tg(void) static void uclamp_update_root_tg(void) { { struct task_group *tg = &root_task_group; struct task_group *tg = &root_task_group; uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN], uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN], sysctl_sched_uclamp_util_min, false); sysctl_sched_uclamp_util_min, false); uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX], uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX], sysctl_sched_uclamp_util_max, false); sysctl_sched_uclamp_util_max, false); rcu_read_lock(); rcu_read_lock(); cpu_util_update_eff(&root_task_group.css); cpu_util_update_eff(&root_task_group.css); rcu_read_unlock(); rcu_read_unlock(); } } #else #else static void uclamp_update_root_tg(void) { } static void uclamp_update_root_tg(void) { } #endif #endif static void uclamp_sync_util_min_rt_default(void) static void uclamp_sync_util_min_rt_default(void) { { struct task_struct *g, *p; struct task_struct *g, *p; /* /* * copy_process() sysctl_uclamp * copy_process() sysctl_uclamp * uclamp_min_ * uclamp_min_ * 
write_lock(&tasklist_lock) read_lock(& * write_lock(&tasklist_lock) read_lock(& * // link thread smp_mb__aft * // link thread smp_mb__aft * write_unlock(&tasklist_lock) read_unlock * write_unlock(&tasklist_lock) read_unlock * sched_post_fork() for_each_pr * sched_post_fork() for_each_pr * __uclamp_sync_rt() __uclamp_ * __uclamp_sync_rt() __uclamp_ * * * Ensures that either sched_post_fork() will observe * Ensures that either sched_post_fork() will observe * uclamp_min_rt or for_each_process_thread() will ob * uclamp_min_rt or for_each_process_thread() will ob * task. * task. */ */ read_lock(&tasklist_lock); read_lock(&tasklist_lock); smp_mb__after_spinlock(); smp_mb__after_spinlock(); read_unlock(&tasklist_lock); read_unlock(&tasklist_lock); rcu_read_lock(); rcu_read_lock(); for_each_process_thread(g, p) for_each_process_thread(g, p) uclamp_update_util_min_rt_default(p); uclamp_update_util_min_rt_default(p); rcu_read_unlock(); rcu_read_unlock(); } } static int sysctl_sched_uclamp_handler(struct ctl_table *tabl static int sysctl_sched_uclamp_handler(struct ctl_table *tabl void *buffer, size_t *lenp, l void *buffer, size_t *lenp, l { { bool update_root_tg = false; bool update_root_tg = false; int old_min, old_max, old_min_rt; int old_min, old_max, old_min_rt; int result; int result; guard(mutex)(&uclamp_mutex); guard(mutex)(&uclamp_mutex); old_min = sysctl_sched_uclamp_util_min; old_min = sysctl_sched_uclamp_util_min; old_max = sysctl_sched_uclamp_util_max; old_max = sysctl_sched_uclamp_util_max; old_min_rt = sysctl_sched_uclamp_util_min_rt_default; old_min_rt = sysctl_sched_uclamp_util_min_rt_default; result = proc_dointvec(table, write, buffer, lenp, pp result = proc_dointvec(table, write, buffer, lenp, pp if (result) if (result) goto undo; goto undo; if (!write) if (!write) return 0; return 0; if (sysctl_sched_uclamp_util_min > sysctl_sched_uclam if (sysctl_sched_uclamp_util_min > sysctl_sched_uclam sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCA sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCA sysctl_sched_uclamp_util_min_rt_default > SCHED_C sysctl_sched_uclamp_util_min_rt_default > SCHED_C result = -EINVAL; result = -EINVAL; goto undo; goto undo; } } if (old_min != sysctl_sched_uclamp_util_min) { if (old_min != sysctl_sched_uclamp_util_min) { uclamp_se_set(&uclamp_default[UCLAMP_MIN], uclamp_se_set(&uclamp_default[UCLAMP_MIN], sysctl_sched_uclamp_util_min, f sysctl_sched_uclamp_util_min, f update_root_tg = true; update_root_tg = true; } } if (old_max != sysctl_sched_uclamp_util_max) { if (old_max != sysctl_sched_uclamp_util_max) { uclamp_se_set(&uclamp_default[UCLAMP_MAX], uclamp_se_set(&uclamp_default[UCLAMP_MAX], sysctl_sched_uclamp_util_max, f sysctl_sched_uclamp_util_max, f update_root_tg = true; update_root_tg = true; } } if (update_root_tg) { if (update_root_tg) { static_branch_enable(&sched_uclamp_used); static_branch_enable(&sched_uclamp_used); uclamp_update_root_tg(); uclamp_update_root_tg(); } } if (old_min_rt != sysctl_sched_uclamp_util_min_rt_def if (old_min_rt != sysctl_sched_uclamp_util_min_rt_def static_branch_enable(&sched_uclamp_used); static_branch_enable(&sched_uclamp_used); uclamp_sync_util_min_rt_default(); uclamp_sync_util_min_rt_default(); } } /* /* * We update all RUNNABLE tasks only when task groups * We update all RUNNABLE tasks only when task groups * Otherwise, keep it simple and do just a lazy updat * Otherwise, keep it simple and do just a lazy updat * task enqueue time. * task enqueue time. 
*/ */ return 0; return 0; undo: undo: sysctl_sched_uclamp_util_min = old_min; sysctl_sched_uclamp_util_min = old_min; sysctl_sched_uclamp_util_max = old_max; sysctl_sched_uclamp_util_max = old_max; sysctl_sched_uclamp_util_min_rt_default = old_min_rt; sysctl_sched_uclamp_util_min_rt_default = old_min_rt; return result; return result; } } #endif #endif #endif #endif static int uclamp_validate(struct task_struct *p, static int uclamp_validate(struct task_struct *p, const struct sched_attr *attr) const struct sched_attr *attr) { { int util_min = p->uclamp_req[UCLAMP_MIN].value; int util_min = p->uclamp_req[UCLAMP_MIN].value; int util_max = p->uclamp_req[UCLAMP_MAX].value; int util_max = p->uclamp_req[UCLAMP_MAX].value; if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { util_min = attr->sched_util_min; util_min = attr->sched_util_min; if (util_min + 1 > SCHED_CAPACITY_SCALE + 1) if (util_min + 1 > SCHED_CAPACITY_SCALE + 1) return -EINVAL; return -EINVAL; } } if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { util_max = attr->sched_util_max; util_max = attr->sched_util_max; if (util_max + 1 > SCHED_CAPACITY_SCALE + 1) if (util_max + 1 > SCHED_CAPACITY_SCALE + 1) return -EINVAL; return -EINVAL; } } if (util_min != -1 && util_max != -1 && util_min > ut if (util_min != -1 && util_max != -1 && util_min > ut return -EINVAL; return -EINVAL; /* /* * We have valid uclamp attributes; make sure uclamp * We have valid uclamp attributes; make sure uclamp * * * We need to do that here, because enabling static b * We need to do that here, because enabling static b * blocking operation which obviously cannot be done * blocking operation which obviously cannot be done * scheduler locks. * scheduler locks. */ */ static_branch_enable(&sched_uclamp_used); static_branch_enable(&sched_uclamp_used); return 0; return 0; } } static bool uclamp_reset(const struct sched_attr *attr, static bool uclamp_reset(const struct sched_attr *attr, enum uclamp_id clamp_id, enum uclamp_id clamp_id, struct uclamp_se *uc_se) struct uclamp_se *uc_se) { { /* Reset on sched class change for a non user-defined /* Reset on sched class change for a non user-defined if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAM if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAM !uc_se->user_defined) !uc_se->user_defined) return true; return true; /* Reset on sched_util_{min,max} == -1. */ /* Reset on sched_util_{min,max} == -1. 
*/ if (clamp_id == UCLAMP_MIN && if (clamp_id == UCLAMP_MIN && attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && attr->sched_util_min == -1) { attr->sched_util_min == -1) { return true; return true; } } if (clamp_id == UCLAMP_MAX && if (clamp_id == UCLAMP_MAX && attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && attr->sched_util_max == -1) { attr->sched_util_max == -1) { return true; return true; } } return false; return false; } } static void __setscheduler_uclamp(struct task_struct *p, static void __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *at const struct sched_attr *at { { enum uclamp_id clamp_id; enum uclamp_id clamp_id; for_each_clamp_id(clamp_id) { for_each_clamp_id(clamp_id) { struct uclamp_se *uc_se = &p->uclamp_req[clam struct uclamp_se *uc_se = &p->uclamp_req[clam unsigned int value; unsigned int value; if (!uclamp_reset(attr, clamp_id, uc_se)) if (!uclamp_reset(attr, clamp_id, uc_se)) continue; continue; /* /* * RT by default have a 100% boost value that * RT by default have a 100% boost value that * at runtime. * at runtime. */ */ if (unlikely(rt_task(p) && clamp_id == UCLAMP if (unlikely(rt_task(p) && clamp_id == UCLAMP value = sysctl_sched_uclamp_util_min_ value = sysctl_sched_uclamp_util_min_ else else value = uclamp_none(clamp_id); value = uclamp_none(clamp_id); uclamp_se_set(uc_se, value, false); uclamp_se_set(uc_se, value, false); } } if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAM if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAM return; return; if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && attr->sched_util_min != -1) { attr->sched_util_min != -1) { uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], attr->sched_util_min, true); attr->sched_util_min, true); } } if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && attr->sched_util_max != -1) { attr->sched_util_max != -1) { uclamp_se_set(&p->uclamp_req[UCLAMP_MAX], uclamp_se_set(&p->uclamp_req[UCLAMP_MAX], attr->sched_util_max, true); attr->sched_util_max, true); } } } } static void uclamp_fork(struct task_struct *p) static void uclamp_fork(struct task_struct *p) { { enum uclamp_id clamp_id; enum uclamp_id clamp_id; /* /* * We don't need to hold task_rq_lock() when updating * We don't need to hold task_rq_lock() when updating * as the task is still at its early fork stages. * as the task is still at its early fork stages. 
*/ */ for_each_clamp_id(clamp_id) for_each_clamp_id(clamp_id) p->uclamp[clamp_id].active = false; p->uclamp[clamp_id].active = false; if (likely(!p->sched_reset_on_fork)) if (likely(!p->sched_reset_on_fork)) return; return; for_each_clamp_id(clamp_id) { for_each_clamp_id(clamp_id) { uclamp_se_set(&p->uclamp_req[clamp_id], uclamp_se_set(&p->uclamp_req[clamp_id], uclamp_none(clamp_id), false); uclamp_none(clamp_id), false); } } } } static void uclamp_post_fork(struct task_struct *p) static void uclamp_post_fork(struct task_struct *p) { { uclamp_update_util_min_rt_default(p); uclamp_update_util_min_rt_default(p); } } static void __init init_uclamp_rq(struct rq *rq) static void __init init_uclamp_rq(struct rq *rq) { { enum uclamp_id clamp_id; enum uclamp_id clamp_id; struct uclamp_rq *uc_rq = rq->uclamp; struct uclamp_rq *uc_rq = rq->uclamp; for_each_clamp_id(clamp_id) { for_each_clamp_id(clamp_id) { uc_rq[clamp_id] = (struct uclamp_rq) { uc_rq[clamp_id] = (struct uclamp_rq) { .value = uclamp_none(clamp_id) .value = uclamp_none(clamp_id) }; }; } } rq->uclamp_flags = UCLAMP_FLAG_IDLE; rq->uclamp_flags = UCLAMP_FLAG_IDLE; } } static void __init init_uclamp(void) static void __init init_uclamp(void) { { struct uclamp_se uc_max = {}; struct uclamp_se uc_max = {}; enum uclamp_id clamp_id; enum uclamp_id clamp_id; int cpu; int cpu; for_each_possible_cpu(cpu) for_each_possible_cpu(cpu) init_uclamp_rq(cpu_rq(cpu)); init_uclamp_rq(cpu_rq(cpu)); for_each_clamp_id(clamp_id) { for_each_clamp_id(clamp_id) { uclamp_se_set(&init_task.uclamp_req[clamp_id] uclamp_se_set(&init_task.uclamp_req[clamp_id] uclamp_none(clamp_id), false); uclamp_none(clamp_id), false); } } /* System defaults allow max clamp values for both in /* System defaults allow max clamp values for both in uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false for_each_clamp_id(clamp_id) { for_each_clamp_id(clamp_id) { uclamp_default[clamp_id] = uc_max; uclamp_default[clamp_id] = uc_max; #ifdef CONFIG_UCLAMP_TASK_GROUP #ifdef CONFIG_UCLAMP_TASK_GROUP root_task_group.uclamp_req[clamp_id] = uc_max root_task_group.uclamp_req[clamp_id] = uc_max root_task_group.uclamp[clamp_id] = uc_max; root_task_group.uclamp[clamp_id] = uc_max; #endif #endif } } } } #else /* CONFIG_UCLAMP_TASK */ #else /* CONFIG_UCLAMP_TASK */ static inline void uclamp_rq_inc(struct rq *rq, struct task_s static inline void uclamp_rq_inc(struct rq *rq, struct task_s static inline void uclamp_rq_dec(struct rq *rq, struct task_s static inline void uclamp_rq_dec(struct rq *rq, struct task_s static inline int uclamp_validate(struct task_struct *p, static inline int uclamp_validate(struct task_struct *p, const struct sched_attr *at const struct sched_attr *at { { return -EOPNOTSUPP; return -EOPNOTSUPP; } } static void __setscheduler_uclamp(struct task_struct *p, static void __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *at const struct sched_attr *at static inline void uclamp_fork(struct task_struct *p) { } static inline void uclamp_fork(struct task_struct *p) { } static inline void uclamp_post_fork(struct task_struct *p) { static inline void uclamp_post_fork(struct task_struct *p) { static inline void init_uclamp(void) { } static inline void init_uclamp(void) { } #endif /* CONFIG_UCLAMP_TASK */ #endif /* CONFIG_UCLAMP_TASK */ bool sched_task_on_rq(struct task_struct *p) bool sched_task_on_rq(struct task_struct *p) { { return task_on_rq_queued(p); return task_on_rq_queued(p); } } unsigned long get_wchan(struct 

unsigned long get_wchan(struct task_struct *p)
{
	unsigned long ip = 0;
	unsigned int state;

	if (!p || p == current)
		return 0;

	/* Only get wchan if task is blocked and we can keep it that way. */
	raw_spin_lock_irq(&p->pi_lock);
	state = READ_ONCE(p->__state);
	smp_rmb(); /* see try_to_wake_up() */
	if (state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq)
		ip = __get_wchan(p);
	raw_spin_unlock_irq(&p->pi_lock);

	return ip;
}

static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (!(flags & ENQUEUE_NOCLOCK))
		update_rq_clock(rq);

	if (!(flags & ENQUEUE_RESTORE)) {
		sched_info_enqueue(rq, p);
		psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
	}

	uclamp_rq_inc(rq, p);
	p->sched_class->enqueue_task(rq, p, flags);

	if (sched_core_enabled(rq))
		sched_core_enqueue(rq, p);
}

static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (sched_core_enabled(rq))
		sched_core_dequeue(rq, p, flags);

	if (!(flags & DEQUEUE_NOCLOCK))
		update_rq_clock(rq);

	if (!(flags & DEQUEUE_SAVE)) {
		sched_info_dequeue(rq, p);
		psi_dequeue(p, flags & DEQUEUE_SLEEP);
	}

	uclamp_rq_dec(rq, p);
	p->sched_class->dequeue_task(rq, p, flags);
}

void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_on_rq_migrating(p))
		flags |= ENQUEUE_MIGRATED;
	if (flags & ENQUEUE_MIGRATED)
		sched_mm_cid_migrate_to(rq, p);

	enqueue_task(rq, p, flags);

	p->on_rq = TASK_ON_RQ_QUEUED;
}

void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
	p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;

	dequeue_task(rq, p, flags);
}

static inline int __normal_prio(int policy, int rt_prio, int nice)
{
	int prio;

	if (dl_policy(policy))
		prio = MAX_DL_PRIO - 1;
	else if (rt_policy(policy))
		prio = MAX_RT_PRIO - 1 - rt_prio;
	else
		prio = NICE_TO_PRIO(nice);

	return prio;
}
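
/*
 * Worked example of the mapping above: MAX_DL_PRIO is 0 and MAX_RT_PRIO is
 * 100, so a SCHED_DEADLINE task gets prio -1, a SCHED_FIFO/SCHED_RR task with
 * rt_priority 10 gets 100 - 1 - 10 == 89, and a SCHED_NORMAL task at nice 0
 * gets NICE_TO_PRIO(0) == 120 (the nice range -20..19 mapping to 100..139).
 */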

/*
 * Calculate the expected normal priority: i.e. priority
 * without taking RT-inheritance into account. Might be
 * boosted by interactivity modifiers. Changes upon fork,
 * setprio syscalls, and whenever the interactivity
 * estimator recalculates.
 */
static inline int normal_prio(struct task_struct *p)
{
	return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
}

/*
 * Calculate the current priority, i.e. the priority
 * taken into account by the scheduler. This value might
 * be boosted by RT tasks, or might be boosted by
 * interactivity modifiers. Will be RT if the task got
 * RT-boosted. If not then it returns p->normal_prio.
 */
static int effective_prio(struct task_struct *p)
{
	p->normal_prio = normal_prio(p);
	/*
	 * If we are RT tasks or we were boosted to RT priority,
	 * keep the priority unchanged. Otherwise, update priority
	 * to the normal priority:
	 */
	if (!rt_prio(p->prio))
		return p->normal_prio;
	return p->prio;
}

/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 *
 * Return: 1 if the task is currently executing. 0 otherwise.
 */
inline int task_curr(const struct task_struct *p)
{
	return cpu_curr(task_cpu(p)) == p;
}

/*
 * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
 * use the balance_callback list if you want balancing.
 *
 * this means any call to check_class_changed() must be followed by a call to
 * balance_callback().
*/ */ static inline void check_class_changed(struct rq *rq, struct static inline void check_class_changed(struct rq *rq, struct const struct sched_cla const struct sched_cla int oldprio) int oldprio) { { if (prev_class != p->sched_class) { if (prev_class != p->sched_class) { if (prev_class->switched_from) if (prev_class->switched_from) prev_class->switched_from(rq, p); prev_class->switched_from(rq, p); p->sched_class->switched_to(rq, p); p->sched_class->switched_to(rq, p); } else if (oldprio != p->prio || dl_task(p)) } else if (oldprio != p->prio || dl_task(p)) p->sched_class->prio_changed(rq, p, oldprio); p->sched_class->prio_changed(rq, p, oldprio); } } void check_preempt_curr(struct rq *rq, struct task_struct *p, void check_preempt_curr(struct rq *rq, struct task_struct *p, { { if (p->sched_class == rq->curr->sched_class) if (p->sched_class == rq->curr->sched_class) rq->curr->sched_class->check_preempt_curr(rq, rq->curr->sched_class->check_preempt_curr(rq, else if (sched_class_above(p->sched_class, rq->curr-> else if (sched_class_above(p->sched_class, rq->curr-> resched_curr(rq); resched_curr(rq); /* /* * A queue event has occurred, and we're going to sch * A queue event has occurred, and we're going to sch * this case, we can save a useless back to back cloc * this case, we can save a useless back to back cloc */ */ if (task_on_rq_queued(rq->curr) && test_tsk_need_resc if (task_on_rq_queued(rq->curr) && test_tsk_need_resc rq_clock_skip_update(rq); rq_clock_skip_update(rq); } } static __always_inline static __always_inline int __task_state_match(struct task_struct *p, unsigned int st int __task_state_match(struct task_struct *p, unsigned int st { { if (READ_ONCE(p->__state) & state) if (READ_ONCE(p->__state) & state) return 1; return 1; #ifdef CONFIG_PREEMPT_RT #ifdef CONFIG_PREEMPT_RT if (READ_ONCE(p->saved_state) & state) if (READ_ONCE(p->saved_state) & state) return -1; return -1; #endif #endif return 0; return 0; } } static __always_inline static __always_inline int task_state_match(struct task_struct *p, unsigned int stat int task_state_match(struct task_struct *p, unsigned int stat { { #ifdef CONFIG_PREEMPT_RT #ifdef CONFIG_PREEMPT_RT int match; int match; /* /* * Serialize against current_save_and_set_rtlock_wait * Serialize against current_save_and_set_rtlock_wait * current_restore_rtlock_saved_state(). * current_restore_rtlock_saved_state(). */ */ raw_spin_lock_irq(&p->pi_lock); raw_spin_lock_irq(&p->pi_lock); match = __task_state_match(p, state); match = __task_state_match(p, state); raw_spin_unlock_irq(&p->pi_lock); raw_spin_unlock_irq(&p->pi_lock); return match; return match; #else #else return __task_state_match(p, state); return __task_state_match(p, state); #endif #endif } } /* /* * wait_task_inactive - wait for a thread to unschedule. * wait_task_inactive - wait for a thread to unschedule. * * * Wait for the thread to block in any of the states set in @ * Wait for the thread to block in any of the states set in @ * If it changes, i.e. @p might have woken up, then return ze * If it changes, i.e. @p might have woken up, then return ze * succeed in waiting for @p to be off its CPU, we return a p * succeed in waiting for @p to be off its CPU, we return a p * (its total switch count). If a second call a short while * (its total switch count). If a second call a short while * same number, the caller can be sure that @p has remained u * same number, the caller can be sure that @p has remained u * whole time. * whole time. 
* * * The caller must ensure that the task *will* unschedule som * The caller must ensure that the task *will* unschedule som * else this function might spin for a *long* time. This func * else this function might spin for a *long* time. This func * be called with interrupts off, or it may introduce deadloc * be called with interrupts off, or it may introduce deadloc * smp_call_function() if an IPI is sent by the same process * smp_call_function() if an IPI is sent by the same process * waiting to become inactive. * waiting to become inactive. */ */ unsigned long wait_task_inactive(struct task_struct *p, unsig unsigned long wait_task_inactive(struct task_struct *p, unsig { { int running, queued, match; int running, queued, match; struct rq_flags rf; struct rq_flags rf; unsigned long ncsw; unsigned long ncsw; struct rq *rq; struct rq *rq; for (;;) { for (;;) { /* /* * We do the initial early heuristics without * We do the initial early heuristics without * any task-queue locks at all. We'll only tr * any task-queue locks at all. We'll only tr * the runqueue lock when things look like th * the runqueue lock when things look like th * work out! * work out! */ */ rq = task_rq(p); rq = task_rq(p); /* /* * If the task is actively running on another * If the task is actively running on another * still, just relax and busy-wait without ho * still, just relax and busy-wait without ho * any locks. * any locks. * * * NOTE! Since we don't hold any locks, it's * NOTE! Since we don't hold any locks, it's * even sure that "rq" stays as the right run * even sure that "rq" stays as the right run * But we don't care, since "task_on_cpu()" w * But we don't care, since "task_on_cpu()" w * return false if the runqueue has changed a * return false if the runqueue has changed a * is actually now running somewhere else! * is actually now running somewhere else! */ */ while (task_on_cpu(rq, p)) { while (task_on_cpu(rq, p)) { if (!task_state_match(p, match_state) if (!task_state_match(p, match_state) return 0; return 0; cpu_relax(); cpu_relax(); } } /* /* * Ok, time to look more closely! We need the * Ok, time to look more closely! We need the * lock now, to be *sure*. If we're wrong, we * lock now, to be *sure*. If we're wrong, we * just go back and repeat. * just go back and repeat. */ */ rq = task_rq_lock(p, &rf); rq = task_rq_lock(p, &rf); trace_sched_wait_task(p); trace_sched_wait_task(p); running = task_on_cpu(rq, p); running = task_on_cpu(rq, p); queued = task_on_rq_queued(p); queued = task_on_rq_queued(p); ncsw = 0; ncsw = 0; if ((match = __task_state_match(p, match_stat if ((match = __task_state_match(p, match_stat /* /* * When matching on p->saved_state, c * When matching on p->saved_state, c * still queued so it will wait. * still queued so it will wait. */ */ if (match < 0) if (match < 0) queued = 1; queued = 1; ncsw = p->nvcsw | LONG_MIN; /* sets M ncsw = p->nvcsw | LONG_MIN; /* sets M } } task_rq_unlock(rq, p, &rf); task_rq_unlock(rq, p, &rf); /* /* * If it changed from the expected state, bai * If it changed from the expected state, bai */ */ if (unlikely(!ncsw)) if (unlikely(!ncsw)) break; break; /* /* * Was it really running after all now that w * Was it really running after all now that w * checked with the proper locks actually hel * checked with the proper locks actually hel * * * Oops. Go back and try again.. * Oops. Go back and try again.. 
*/ */ if (unlikely(running)) { if (unlikely(running)) { cpu_relax(); cpu_relax(); continue; continue; } } /* /* * It's not enough that it's not actively run * It's not enough that it's not actively run * it must be off the runqueue _entirely_, an * it must be off the runqueue _entirely_, an * preempted! * preempted! * * * So if it was still runnable (but just not * So if it was still runnable (but just not * running right now), it's preempted, and we * running right now), it's preempted, and we * yield - it could be a while. * yield - it could be a while. */ */ if (unlikely(queued)) { if (unlikely(queued)) { ktime_t to = NSEC_PER_SEC / HZ; ktime_t to = NSEC_PER_SEC / HZ; set_current_state(TASK_UNINTERRUPTIBL set_current_state(TASK_UNINTERRUPTIBL schedule_hrtimeout(&to, HRTIMER_MODE_ schedule_hrtimeout(&to, HRTIMER_MODE_ continue; continue; } } /* /* * Ahh, all good. It wasn't running, and it w * Ahh, all good. It wasn't running, and it w * runnable, which means that it will never b * runnable, which means that it will never b * running in the future either. We're all do * running in the future either. We're all do */ */ break; break; } } return ncsw; return ncsw; } } #ifdef CONFIG_SMP #ifdef CONFIG_SMP static void static void __do_set_cpus_allowed(struct task_struct *p, struct affinity_ __do_set_cpus_allowed(struct task_struct *p, struct affinity_ static int __set_cpus_allowed_ptr(struct task_struct *p, static int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ct struct affinity_context *ct static void migrate_disable_switch(struct rq *rq, struct task static void migrate_disable_switch(struct rq *rq, struct task { { struct affinity_context ac = { struct affinity_context ac = { .new_mask = cpumask_of(rq->cpu), .new_mask = cpumask_of(rq->cpu), .flags = SCA_MIGRATE_DISABLE, .flags = SCA_MIGRATE_DISABLE, }; }; if (likely(!p->migration_disabled)) if (likely(!p->migration_disabled)) return; return; if (p->cpus_ptr != &p->cpus_mask) if (p->cpus_ptr != &p->cpus_mask) return; return; /* /* * Violates locking rules! see comment in __do_set_cp * Violates locking rules! 
see comment in __do_set_cp */ */ __do_set_cpus_allowed(p, &ac); __do_set_cpus_allowed(p, &ac); } } void migrate_disable(void) void migrate_disable(void) { { struct task_struct *p = current; struct task_struct *p = current; if (p->migration_disabled) { if (p->migration_disabled) { p->migration_disabled++; p->migration_disabled++; return; return; } } preempt_disable(); preempt_disable(); this_rq()->nr_pinned++; this_rq()->nr_pinned++; p->migration_disabled = 1; p->migration_disabled = 1; preempt_enable(); preempt_enable(); } } EXPORT_SYMBOL_GPL(migrate_disable); EXPORT_SYMBOL_GPL(migrate_disable); void migrate_enable(void) void migrate_enable(void) { { struct task_struct *p = current; struct task_struct *p = current; struct affinity_context ac = { struct affinity_context ac = { .new_mask = &p->cpus_mask, .new_mask = &p->cpus_mask, .flags = SCA_MIGRATE_ENABLE, .flags = SCA_MIGRATE_ENABLE, }; }; if (p->migration_disabled > 1) { if (p->migration_disabled > 1) { p->migration_disabled--; p->migration_disabled--; return; return; } } if (WARN_ON_ONCE(!p->migration_disabled)) if (WARN_ON_ONCE(!p->migration_disabled)) return; return; /* /* * Ensure stop_task runs either before or after this, * Ensure stop_task runs either before or after this, * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't */ */ preempt_disable(); preempt_disable(); if (p->cpus_ptr != &p->cpus_mask) if (p->cpus_ptr != &p->cpus_mask) __set_cpus_allowed_ptr(p, &ac); __set_cpus_allowed_ptr(p, &ac); /* /* * Mustn't clear migration_disabled() until cpus_ptr * Mustn't clear migration_disabled() until cpus_ptr * regular cpus_mask, otherwise things that race (eg. * regular cpus_mask, otherwise things that race (eg. * select_fallback_rq) get confused. * select_fallback_rq) get confused. */ */ barrier(); barrier(); p->migration_disabled = 0; p->migration_disabled = 0; this_rq()->nr_pinned--; this_rq()->nr_pinned--; preempt_enable(); preempt_enable(); } } EXPORT_SYMBOL_GPL(migrate_enable); EXPORT_SYMBOL_GPL(migrate_enable); static inline bool rq_has_pinned_tasks(struct rq *rq) static inline bool rq_has_pinned_tasks(struct rq *rq) { { return rq->nr_pinned; return rq->nr_pinned; } } /* /* * Per-CPU kthreads are allowed to run on !active && online C * Per-CPU kthreads are allowed to run on !active && online C * __set_cpus_allowed_ptr() and select_fallback_rq(). * __set_cpus_allowed_ptr() and select_fallback_rq(). */ */ static inline bool is_cpu_allowed(struct task_struct *p, int static inline bool is_cpu_allowed(struct task_struct *p, int { { /* When not in the task's cpumask, no point in lookin /* When not in the task's cpumask, no point in lookin if (!cpumask_test_cpu(cpu, p->cpus_ptr)) if (!cpumask_test_cpu(cpu, p->cpus_ptr)) return false; return false; /* migrate_disabled() must be allowed to finish. */ /* migrate_disabled() must be allowed to finish. */ if (is_migration_disabled(p)) if (is_migration_disabled(p)) return cpu_online(cpu); return cpu_online(cpu); /* Non kernel threads are not allowed during either o /* Non kernel threads are not allowed during either o if (!(p->flags & PF_KTHREAD)) if (!(p->flags & PF_KTHREAD)) return cpu_active(cpu) && task_cpu_possible(c return cpu_active(cpu) && task_cpu_possible(c /* KTHREAD_IS_PER_CPU is always allowed. */ /* KTHREAD_IS_PER_CPU is always allowed. 
*/ if (kthread_is_per_cpu(p)) if (kthread_is_per_cpu(p)) return cpu_online(cpu); return cpu_online(cpu); /* Regular kernel threads don't get to stay during of /* Regular kernel threads don't get to stay during of if (cpu_dying(cpu)) if (cpu_dying(cpu)) return false; return false; /* But are allowed during online. */ /* But are allowed during online. */ return cpu_online(cpu); return cpu_online(cpu); } } /* /* * This is how migration works: * This is how migration works: * * * 1) we invoke migration_cpu_stop() on the target CPU using * 1) we invoke migration_cpu_stop() on the target CPU using * stop_one_cpu(). * stop_one_cpu(). * 2) stopper starts to run (implicitly forcing the migrated * 2) stopper starts to run (implicitly forcing the migrated * off the CPU) * off the CPU) * 3) it checks whether the migrated task is still in the wro * 3) it checks whether the migrated task is still in the wro * 4) if it's in the wrong runqueue then the migration thread * 4) if it's in the wrong runqueue then the migration thread * it and puts it into the right queue. * it and puts it into the right queue. * 5) stopper completes and stop_one_cpu() returns and the mi * 5) stopper completes and stop_one_cpu() returns and the mi * is done. * is done. */ */ /* /* * move_queued_task - move a queued task to new rq. * move_queued_task - move a queued task to new rq. * * * Returns (locked) new rq. Old rq's lock is released. * Returns (locked) new rq. Old rq's lock is released. */ */ static struct rq *move_queued_task(struct rq *rq, struct rq_f static struct rq *move_queued_task(struct rq *rq, struct rq_f struct task_struct *p, int struct task_struct *p, int { { lockdep_assert_rq_held(rq); lockdep_assert_rq_held(rq); deactivate_task(rq, p, DEQUEUE_NOCLOCK); deactivate_task(rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, new_cpu); set_task_cpu(p, new_cpu); rq_unlock(rq, rf); rq_unlock(rq, rf); rq = cpu_rq(new_cpu); rq = cpu_rq(new_cpu); rq_lock(rq, rf); rq_lock(rq, rf); WARN_ON_ONCE(task_cpu(p) != new_cpu); WARN_ON_ONCE(task_cpu(p) != new_cpu); activate_task(rq, p, 0); activate_task(rq, p, 0); check_preempt_curr(rq, p, 0); check_preempt_curr(rq, p, 0); return rq; return rq; } } struct migration_arg { struct migration_arg { struct task_struct *task; struct task_struct *task; int dest_cpu; int dest_cpu; struct set_affinity_pending *pending; struct set_affinity_pending *pending; }; }; /* /* * @refs: number of wait_for_completion() * @refs: number of wait_for_completion() * @stop_pending: is @stop_work in use * @stop_pending: is @stop_work in use */ */ struct set_affinity_pending { struct set_affinity_pending { refcount_t refs; refcount_t refs; unsigned int stop_pending; unsigned int stop_pending; struct completion done; struct completion done; struct cpu_stop_work stop_work; struct cpu_stop_work stop_work; struct migration_arg arg; struct migration_arg arg; }; }; /* /* * Move (not current) task off this CPU, onto the destination * Move (not current) task off this CPU, onto the destination * this because either it can't run here any more (set_cpus_a * this because either it can't run here any more (set_cpus_a * away from this CPU, or CPU going down), or because we're * away from this CPU, or CPU going down), or because we're * attempting to rebalance this task on exec (sched_exec). * attempting to rebalance this task on exec (sched_exec). * * * So we race with normal scheduler movements, but that's OK, * So we race with normal scheduler movements, but that's OK, * as the task is no longer on this CPU. 
/*
 * migration_cpu_stop - this will be executed by a highprio stopper thread
 * and performs thread migration by bumping thread off CPU then
 * 'pushing' onto another runqueue.
 */
static int migration_cpu_stop(void *data)
{
	struct migration_arg *arg = data;
	struct set_affinity_pending *pending = arg->pending;
	struct task_struct *p = arg->task;
	struct rq *rq = this_rq();
	bool complete = false;
	struct rq_flags rf;

	/*
	 * The original target CPU might have gone down and we might
	 * be on another CPU but it doesn't matter.
	 */
	local_irq_save(rf.flags);
	/*
	 * We need to explicitly wake pending tasks before running
	 * __migrate_task() such that we will not miss enforcing cpus_ptr
	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
	 */
	flush_smp_call_function_queue();

	raw_spin_lock(&p->pi_lock);
	rq_lock(rq, &rf);

	/*
	 * If we were passed a pending, then ->stop_pending was set, thus
	 * p->migration_pending must have remained stable.
	 */
	WARN_ON_ONCE(pending && pending != p->migration_pending);

	/*
	 * If task_rq(p) != rq, it cannot be migrated here, because we're
	 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
	 * we're holding p->pi_lock.
	 */
	if (task_rq(p) == rq) {
		if (is_migration_disabled(p))
			goto out;

		if (pending) {
			p->migration_pending = NULL;
			complete = true;

			if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
				goto out;
		}

		if (task_on_rq_queued(p)) {
			update_rq_clock(rq);
			rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
		} else {
			p->wake_cpu = arg->dest_cpu;
		}

		/*
		 * XXX __migrate_task() can fail, at which point we might end
		 * up running on a dodgy CPU, AFAICT this can only happen
		 * during CPU hotplug, at which point we'll get pushed out
		 * anyway, so it's probably not a big deal.
		 */

	} else if (pending) {
		/*
		 * This happens when we get migrated between migrate_enable()'s
		 * preempt_enable() and scheduling the stopper task. At that
		 * point we're a regular task again and not current anymore.
		 *
		 * A !PREEMPT kernel has a giant hole here, which makes it far
		 * more likely.
		 */

		/*
		 * The task moved before the stopper got to run. We're holding
		 * ->pi_lock, so the allowed mask is stable - if it got
		 * somewhere allowed, we're done.
		 */
		if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
			p->migration_pending = NULL;
			complete = true;
			goto out;
		}

		/*
		 * When migrate_enable() hits a rq mis-match we can't reliably
		 * determine is_migration_disabled() and so have to chase after
		 * it.
		 */
		WARN_ON_ONCE(!pending->stop_pending);
		preempt_disable();
		task_rq_unlock(rq, p, &rf);
		stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
				    &pending->arg, &pending->stop_work);
		preempt_enable();
		return 0;
	}
out:
	if (pending)
		pending->stop_pending = false;
	task_rq_unlock(rq, p, &rf);

	if (complete)
		complete_all(&pending->done);

	return 0;
}
int push_cpu_stop(void *arg)
{
	struct rq *lowest_rq = NULL, *rq = this_rq();
	struct task_struct *p = arg;

	raw_spin_lock_irq(&p->pi_lock);
	raw_spin_rq_lock(rq);

	if (task_rq(p) != rq)
		goto out_unlock;

	if (is_migration_disabled(p)) {
		p->migration_flags |= MDF_PUSH;
		goto out_unlock;
	}

	p->migration_flags &= ~MDF_PUSH;

	if (p->sched_class->find_lock_rq)
		lowest_rq = p->sched_class->find_lock_rq(p, rq);

	if (!lowest_rq)
		goto out_unlock;

	// XXX validate p is still the highest prio task
	if (task_rq(p) == rq) {
		deactivate_task(rq, p, 0);
		set_task_cpu(p, lowest_rq->cpu);
		activate_task(lowest_rq, p, 0);
		resched_curr(lowest_rq);
	}

	double_unlock_balance(rq, lowest_rq);

out_unlock:
	rq->push_busy = false;
	raw_spin_rq_unlock(rq);
	raw_spin_unlock_irq(&p->pi_lock);

	put_task_struct(p);
	return 0;
}
/*
 * sched_class::set_cpus_allowed must do the below, but is not required to
 * actually call this function.
 */
void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx)
{
	if (ctx->flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
		p->cpus_ptr = ctx->new_mask;
		return;
	}

	cpumask_copy(&p->cpus_mask, ctx->new_mask);
	p->nr_cpus_allowed = cpumask_weight(ctx->new_mask);

	/*
	 * Swap in a new user_cpus_ptr if SCA_USER flag set
	 */
	if (ctx->flags & SCA_USER)
		swap(p->user_cpus_ptr, ctx->user_mask);
}
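/*
 * Illustrative sketch (not part of core.c): a scheduling class with no
 * special affinity bookkeeping can simply delegate its ->set_cpus_allowed()
 * hook to set_cpus_allowed_common(), as the comment above describes. The
 * hook name below is hypothetical.
 */
#if 0
static void example_set_cpus_allowed(struct task_struct *p,
				     struct affinity_context *ctx)
{
	/* Keep p->cpus_mask / p->cpus_ptr / p->nr_cpus_allowed in sync. */
	set_cpus_allowed_common(p, ctx);
}
/* ...and in the class definition: .set_cpus_allowed = example_set_cpus_allowed, */
#endif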
static void
__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
{
	struct rq *rq = task_rq(p);
	bool queued, running;

	/*
	 * This here violates the locking rules for affinity, since we're only
	 * supposed to change these variables while holding both rq->lock and
	 * p->pi_lock.
	 *
	 * HOWEVER, it magically works, because ttwu() is the only code that
	 * accesses these variables under p->pi_lock and only does so after
	 * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
	 * before finish_task().
	 *
	 * XXX do further audits, this smells like something putrid.
	 */
	if (ctx->flags & SCA_MIGRATE_DISABLE)
		SCHED_WARN_ON(!p->on_cpu);
	else
		lockdep_assert_held(&p->pi_lock);

	queued = task_on_rq_queued(p);
	running = task_current(rq, p);

	if (queued) {
		/*
		 * Because __kthread_bind() calls this on blocked tasks without
		 * holding rq->lock.
		 */
		lockdep_assert_rq_held(rq);
		dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
	}
	if (running)
		put_prev_task(rq, p);

	p->sched_class->set_cpus_allowed(p, ctx);

	if (queued)
		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
	if (running)
		set_next_task(rq, p);
}
/*
 * Used for kthread_bind() and select_fallback_rq(), in both cases the user
 * affinity (if any) should be destroyed too.
 */
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
	struct affinity_context ac = {
		.new_mask  = new_mask,
		.user_mask = NULL,
		.flags     = SCA_USER,	/* clear the user requested mask */
	};
	union cpumask_rcuhead {
		cpumask_t cpumask;
		struct rcu_head rcu;
	};

	__do_set_cpus_allowed(p, &ac);

	/*
	 * Because this is called with p->pi_lock held, it is not possible
	 * to use kfree() here (when PREEMPT_RT=y), therefore punt to using
	 * kfree_rcu().
	 */
	kfree_rcu((union cpumask_rcuhead *)ac.user_mask, rcu);
}

static cpumask_t *alloc_user_cpus_ptr(int node)
{
	/*
	 * See do_set_cpus_allowed() above for the rcu_head usage.
	 */
	int size = max_t(int, cpumask_size(), sizeof(struct rcu_head));

	return kmalloc_node(size, GFP_KERNEL, node);
}

int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
		      int node)
{
	cpumask_t *user_mask;
	unsigned long flags;

	/*
	 * Always clear dst->user_cpus_ptr first as their user_cpus_ptr's
	 * may differ by now due to racing.
	 */
	dst->user_cpus_ptr = NULL;

	/*
	 * This check is racy and losing the race is a valid situation.
	 * It is not worth the extra overhead of taking the pi_lock on
	 * every fork/clone.
	 */
	if (data_race(!src->user_cpus_ptr))
		return 0;

	user_mask = alloc_user_cpus_ptr(node);
	if (!user_mask)
		return -ENOMEM;

	/*
	 * Use pi_lock to protect content of user_cpus_ptr
	 *
	 * Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent
	 * do_set_cpus_allowed().
	 */
	raw_spin_lock_irqsave(&src->pi_lock, flags);
	if (src->user_cpus_ptr) {
		swap(dst->user_cpus_ptr, user_mask);
		cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
	}
	raw_spin_unlock_irqrestore(&src->pi_lock, flags);

	if (unlikely(user_mask))
		kfree(user_mask);

	return 0;
}

static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
{
	struct cpumask *user_mask = NULL;

	swap(p->user_cpus_ptr, user_mask);

	return user_mask;
}

void release_user_cpus_ptr(struct task_struct *p)
{
	kfree(clear_user_cpus_ptr(p));
}
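/*
 * Illustrative sketch (not part of core.c): kthread_bind() is the usual way
 * the "destroy the user affinity" semantics above are reached, by binding a
 * freshly created kthread to one CPU before its first wakeup. Thread
 * function and names are hypothetical.
 */
#if 0
#include <linux/kthread.h>
#include <linux/err.h>
#include <linux/sched.h>

static int example_thread_fn(void *data)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}

static struct task_struct *example_start_pinned(int cpu)
{
	struct task_struct *tsk;

	tsk = kthread_create(example_thread_fn, NULL, "example/%d", cpu);
	if (IS_ERR(tsk))
		return tsk;

	kthread_bind(tsk, cpu);		/* ends up in __do_set_cpus_allowed() */
	wake_up_process(tsk);
	return tsk;
}
#endif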
/*
 * This function is wildly self concurrent; here be dragons.
 *
 *
 * When given a valid mask, __set_cpus_allowed_ptr() must block until the
 * designated task is enqueued on an allowed CPU. If that task is currently
 * running, we have to kick it out using the CPU stopper.
 *
 * Migrate-Disable comes along and tramples all over our nice sandcastle.
 * Consider:
 *
 *     Initial conditions: P0->cpus_mask = [0, 1]
 *
 *     P0@CPU0            P1
 *
 *     migrate_disable();
 *     <preempted>
 *                        set_cpus_allowed_ptr(P0, [1]);
 *
 * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes
 * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region).
 * This means we need the following scheme:
 *
 *     P0@CPU0            P1
 *
 *     migrate_disable();
 *     <preempted>
 *                        set_cpus_allowed_ptr(P0, [1]);
 *                          <blocks>
 *     <resumes>
 *     migrate_enable();
 *       __set_cpus_allowed_ptr();
 *         <wakes local stopper>
 *                        `--> <woken on migration completion>
 *
 * Concurrent set_cpus_allowed_ptr() calls targeting a given
 * task p are serialized by p->pi_lock, which we can leverage: the mask that
 * should come into effect at the end of the Migrate-Disable region is the last
 * one. This means we only need to track a single cpumask (i.e. p->cpus_mask),
 * but we still need to properly signal those waiting tasks at the right
 * moment.
 *
 * This is implemented using struct set_affinity_pending. The first
 * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will
 * setup an instance of that struct and install it on the targeted task_struct.
 * Any and all further callers will reuse that instance. Those then wait for
 * a completion signaled at the tail of the CPU stopper callback (1), triggered
 * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()).
 *
 *
 * (1) In the cases covered above. There is one more where the completion is
 * signaled within affine_move_task() itself: when a subsequent affinity request
 * occurs after the stopper bailed out due to the targeted task still being
 * Migrate-Disable. Consider:
 *
 *     Initial conditions: P0->cpus_mask = [0, 1]
 *
 *     CPU0               P1                 P2
 *     <P0>
 *       migrate_disable();
 *       <preempted>
 *                        set_cpus_allowed_ptr(P0, [1]);
 *                          <blocks>
 *     <migration/0>
 *       migration_cpu_stop()
 *         is_migration_disabled()
 *           <bails>
 *                                           set_cpus_allowed_ptr(P0, [0, 1]);
 *                                             <signal completion>
 *                          <awakes>
 *
 * Note that the above is safe vs a concurrent migrate_enable(), as any
 * pending affinity completion is preceded by an uninstallation of
 * p->migration_pending done with p->pi_lock held.
 */
static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
			    int dest_cpu, unsigned int flags)
	__releases(rq->lock)
	__releases(p->pi_lock)
{
	struct set_affinity_pending my_pending = { }, *pending = NULL;
	bool stop_pending, complete = false;

	/* Can the task run on the task's current CPU? If so, we're done */
	if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
		struct task_struct *push_task = NULL;

		if ((flags & SCA_MIGRATE_ENABLE) &&
		    (p->migration_flags & MDF_PUSH) && !rq->push_busy) {
			rq->push_busy = true;
			push_task = get_task_struct(p);
		}

		/*
		 * If there are pending waiters, but no pending stop_work,
		 * then complete now.
		 */
		pending = p->migration_pending;
		if (pending && !pending->stop_pending) {
			p->migration_pending = NULL;
			complete = true;
		}

		preempt_disable();
		task_rq_unlock(rq, p, rf);
		if (push_task) {
			stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
					    p, &rq->push_work);
		}
		preempt_enable();

		if (complete)
			complete_all(&pending->done);

		return 0;
	}

	if (!(flags & SCA_MIGRATE_ENABLE)) {
		/* serialized by p->pi_lock */
		if (!p->migration_pending) {
			/* Install the request */
			refcount_set(&my_pending.refs, 1);
			init_completion(&my_pending.done);
			my_pending.arg = (struct migration_arg) {
				.task = p,
				.dest_cpu = dest_cpu,
				.pending = &my_pending,
			};

			p->migration_pending = &my_pending;
		} else {
			pending = p->migration_pending;
			refcount_inc(&pending->refs);
			/*
			 * Affinity has changed, but we've already installed a
			 * pending. migration_cpu_stop() *must* see this, else
			 * we risk a completion of the pending despite having a
			 * task on a disallowed CPU.
			 *
			 * Serialized by p->pi_lock, so this is safe.
			 */
			pending->arg.dest_cpu = dest_cpu;
		}
	}
	pending = p->migration_pending;
	/*
	 * - !MIGRATE_ENABLE:
	 *   we'll have installed a pending if there wasn't one already.
	 *
	 * - MIGRATE_ENABLE:
	 *   we're here because the current CPU isn't matching anymore,
	 *   the only way that can happen is because of a concurrent
	 *   set_cpus_allowed_ptr() call, which should then still be
	 *   pending completion.
	 *
	 * Either way, we really should have a @pending here.
	 */
	if (WARN_ON_ONCE(!pending)) {
		task_rq_unlock(rq, p, rf);
		return -EINVAL;
	}

	if (task_on_cpu(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
		/*
		 * MIGRATE_ENABLE gets here because 'p == current', but for
		 * anything else we cannot do is_migration_disabled(), punt
		 * and have the stopper function handle it all race-free.
		 */
		stop_pending = pending->stop_pending;
		if (!stop_pending)
			pending->stop_pending = true;

		if (flags & SCA_MIGRATE_ENABLE)
			p->migration_flags &= ~MDF_PUSH;

		preempt_disable();
		task_rq_unlock(rq, p, rf);
		if (!stop_pending) {
			stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
					    &pending->arg, &pending->stop_work);
		}
		preempt_enable();

		if (flags & SCA_MIGRATE_ENABLE)
			return 0;
	} else {

		if (!is_migration_disabled(p)) {
			if (task_on_rq_queued(p))
				rq = move_queued_task(rq, rf, p, dest_cpu);

			if (!pending->stop_pending) {
				p->migration_pending = NULL;
				complete = true;
			}
		}
		task_rq_unlock(rq, p, rf);

		if (complete)
			complete_all(&pending->done);
	}

	wait_for_completion(&pending->done);

	if (refcount_dec_and_test(&pending->refs))
		wake_up_var(&pending->refs); /* No UaF, just an address */

	/*
	 * Block the original owner of &pending until all subsequent callers
	 * have seen the completion and decremented the refcount
	 */
	wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));

	/* ARGH */
	WARN_ON_ONCE(my_pending.stop_pending);

	return 0;
}
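/*
 * Illustrative sketch (not part of core.c): the blocking behaviour the
 * comment above guarantees. If @target currently sits in a migrate-disabled
 * region, the affinity change only returns once that region ends. The
 * helper below is hypothetical.
 */
#if 0
#include <linux/cpumask.h>

static int example_move_to_cpu1(struct task_struct *target)
{
	/*
	 * Returns only after @target is enqueued on an allowed CPU, i.e. not
	 * before its outermost migrate_enable() if it is migration-disabled.
	 */
	return set_cpus_allowed_ptr(target, cpumask_of(1));
}
#endif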
/*
 * Called with both p->pi_lock and rq->lock held; drops both before returning.
 */
static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
					 struct affinity_context *ctx,
					 struct rq *rq,
					 struct rq_flags *rf)
	__releases(rq->lock)
	__releases(p->pi_lock)
{
	const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
	const struct cpumask *cpu_valid_mask = cpu_active_mask;
	bool kthread = p->flags & PF_KTHREAD;
	unsigned int dest_cpu;
	int ret = 0;

	update_rq_clock(rq);

	if (kthread || is_migration_disabled(p)) {
		/*
		 * Kernel threads are allowed on online && !active CPUs,
		 * however, during cpu-hot-unplug, even these might get pushed
		 * away if not KTHREAD_IS_PER_CPU.
		 *
		 * Specifically, migration_disabled() tasks must not fail the
		 * cpumask_any_and_distribute() pick below, esp. so on
		 * SCA_MIGRATE_ENABLE, otherwise we'll not call
		 * set_cpus_allowed_common() and actually reset p->cpus_ptr.
		 */
		cpu_valid_mask = cpu_online_mask;
	}

	if (!kthread && !cpumask_subset(ctx->new_mask, cpu_allowed_mask)) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * Must re-check here, to close a race against __kthread_bind(),
	 * sched_setaffinity() is not guaranteed to observe the flag.
	 */
	if ((ctx->flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
		ret = -EINVAL;
		goto out;
	}

	if (!(ctx->flags & SCA_MIGRATE_ENABLE)) {
		if (cpumask_equal(&p->cpus_mask, ctx->new_mask)) {
			if (ctx->flags & SCA_USER)
				swap(p->user_cpus_ptr, ctx->user_mask);
			goto out;
		}

		if (WARN_ON_ONCE(p == current &&
				 is_migration_disabled(p) &&
				 !cpumask_test_cpu(task_cpu(p), ctx->new_mask))) {
			ret = -EBUSY;
			goto out;
		}
	}

	/*
	 * Picking a ~random cpu helps in cases where we are changing affinity
	 * for groups of tasks (ie. cpuset), so that load balancing is not
	 * immediately required to distribute the tasks within their new mask.
	 */
	dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, ctx->new_mask);
	if (dest_cpu >= nr_cpu_ids) {
		ret = -EINVAL;
		goto out;
	}

	__do_set_cpus_allowed(p, ctx);

	return affine_move_task(rq, p, rf, dest_cpu, ctx->flags);

out:
	task_rq_unlock(rq, p, rf);

	return ret;
}
/*
 * Change a given task's CPU affinity. Migrate the thread to a
 * proper CPU and schedule it away if the CPU it's executing on
 * is removed from the allowed bitmask.
 *
 * NOTE: the caller must have a valid reference to the task, the
 * task must not exit() & deallocate itself prematurely. The
 * call is not atomic; no spinlocks may be held.
 */
static int __set_cpus_allowed_ptr(struct task_struct *p,
				  struct affinity_context *ctx)
{
	struct rq_flags rf;
	struct rq *rq;

	rq = task_rq_lock(p, &rf);
	/*
	 * Masking should be skipped if SCA_USER or any of the SCA_MIGRATE_*
	 * flags are set.
	 */
	if (p->user_cpus_ptr &&
	    !(ctx->flags & (SCA_USER | SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) &&
	    cpumask_and(rq->scratch_mask, ctx->new_mask, p->user_cpus_ptr))
		ctx->new_mask = rq->scratch_mask;

	return __set_cpus_allowed_ptr_locked(p, ctx, rq, &rf);
}

int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
	struct affinity_context ac = {
		.new_mask  = new_mask,
		.flags     = 0,
	};

	return __set_cpus_allowed_ptr(p, &ac);
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
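/*
 * Illustrative sketch (not part of core.c): typical in-kernel use of
 * set_cpus_allowed_ptr() to confine an existing kthread to the CPUs of one
 * NUMA node. The helper below is hypothetical.
 */
#if 0
#include <linux/cpumask.h>
#include <linux/topology.h>

static int example_confine_to_node(struct task_struct *tsk, int node)
{
	/* May block until @tsk actually runs inside the new mask. */
	return set_cpus_allowed_ptr(tsk, cpumask_of_node(node));
}
#endif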
/*
 * Change a given task's CPU affinity to the intersection of its current
 * affinity mask and @subset_mask, writing the resulting mask to @new_mask.
 * If user_cpus_ptr is defined, use it as the basis for restricting CPU
 * affinity or use cpu_online_mask instead.
 *
 * If the resulting mask is empty, leave the affinity unchanged and return
 * -EINVAL.
 */
static int restrict_cpus_allowed_ptr(struct task_struct *p,
				     struct cpumask *new_mask,
				     const struct cpumask *subset_mask)
{
	struct affinity_context ac = {
		.new_mask  = new_mask,
		.flags     = 0,
	};
	struct rq_flags rf;
	struct rq *rq;
	int err;

	rq = task_rq_lock(p, &rf);

	/*
	 * Forcefully restricting the affinity of a deadline task is
	 * likely to cause problems, so fail and noisily override the
	 * mask entirely.
	 */
	if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
		err = -EPERM;
		goto err_unlock;
	}

	if (!cpumask_and(new_mask, task_user_cpus(p), subset_mask)) {
		err = -EINVAL;
		goto err_unlock;
	}

	return __set_cpus_allowed_ptr_locked(p, &ac, rq, &rf);

err_unlock:
	task_rq_unlock(rq, p, &rf);
	return err;
}
/*
 * Restrict the CPU affinity of task @p so that it is a subset of
 * task_cpu_possible_mask() and point @p->user_cpus_ptr to a copy of the
 * old affinity mask. If the resulting mask is empty, we warn and walk
 * up the cpuset hierarchy until we find a suitable mask.
 */
void force_compatible_cpus_allowed_ptr(struct task_struct *p)
{
	cpumask_var_t new_mask;
	const struct cpumask *override_mask = task_cpu_possible_mask(p);

	alloc_cpumask_var(&new_mask, GFP_KERNEL);

	/*
	 * __migrate_task() can fail silently in the face of concurrent
	 * offlining of the chosen destination CPU, so take the hotplug
	 * lock to ensure that the migration succeeds.
	 */
	cpus_read_lock();
	if (!cpumask_available(new_mask))
		goto out_set_mask;

	if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
		goto out_free_mask;

	/*
	 * We failed to find a valid subset of the affinity mask for the
	 * task, so override it based on its cpuset hierarchy.
	 */
	cpuset_cpus_allowed(p, new_mask);
	override_mask = new_mask;

out_set_mask:
	if (printk_ratelimit()) {
		printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
				task_pid_nr(p), p->comm,
				cpumask_pr_args(override_mask));
	}

	WARN_ON(set_cpus_allowed_ptr(p, override_mask));
out_free_mask:
	cpus_read_unlock();
	free_cpumask_var(new_mask);
}

static int
__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);

/*
 * Restore the affinity of a task @p which was previously restricted by a
 * call to force_compatible_cpus_allowed_ptr().
 *
 * It is the caller's responsibility to serialise this with any calls to
 * force_compatible_cpus_allowed_ptr(@p).
 */
void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
{
	struct affinity_context ac = {
		.new_mask  = task_user_cpus(p),
		.flags     = 0,
	};
	int ret;

	/*
	 * Try to restore the old affinity mask with __sched_setaffinity().
	 * Cpuset masking will be done there too.
	 */
	ret = __sched_setaffinity(p, &ac);
	WARN_ON_ONCE(ret);
}
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
	unsigned int state = READ_ONCE(p->__state);

	/*
	 * We should never call set_task_cpu() on a blocked task,
	 * ttwu() will sort out the placement.
	 */
	WARN_ON_ONCE(state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq);

	/*
	 * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
	 * because schedstat_wait_{start,end} rebase migrating task's wait_start
	 * time relying on p->on_rq.
	 */
	WARN_ON_ONCE(state == TASK_RUNNING &&
		     p->sched_class == &fair_sched_class &&
		     (p->on_rq && !task_on_rq_migrating(p)));

#ifdef CONFIG_LOCKDEP
	/*
	 * The caller should hold either p->pi_lock or rq->lock, when changing
	 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
	 *
	 * sched_move_task() holds both and thus holding either pins the cgroup,
	 * see task_group().
	 *
	 * Furthermore, all task_rq users should acquire both locks, see
	 * task_rq_lock().
	 */
	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
				      lockdep_is_held(__rq_lockp(task_rq(p)))));
#endif
	/*
	 * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
	 */
	WARN_ON_ONCE(!cpu_online(new_cpu));

	WARN_ON_ONCE(is_migration_disabled(p));
#endif

	trace_sched_migrate_task(p, new_cpu);

	if (task_cpu(p) != new_cpu) {
		if (p->sched_class->migrate_task_rq)
			p->sched_class->migrate_task_rq(p, new_cpu);
		p->se.nr_migrations++;
		rseq_migrate(p);
		sched_mm_cid_migrate_from(p);
		perf_event_task_migrate(p);
	}

	__set_task_cpu(p, new_cpu);
}
#ifdef CONFIG_NUMA_BALANCING
static void __migrate_swap_task(struct task_struct *p, int cpu)
{
	if (task_on_rq_queued(p)) {
		struct rq *src_rq, *dst_rq;
		struct rq_flags srf, drf;

		src_rq = task_rq(p);
		dst_rq = cpu_rq(cpu);

		rq_pin_lock(src_rq, &srf);
		rq_pin_lock(dst_rq, &drf);

		deactivate_task(src_rq, p, 0);
		set_task_cpu(p, cpu);
		activate_task(dst_rq, p, 0);
		check_preempt_curr(dst_rq, p, 0);

		rq_unpin_lock(dst_rq, &drf);
		rq_unpin_lock(src_rq, &srf);

	} else {
		/*
		 * Task isn't running anymore; make it appear like we migrated
		 * it before it went to sleep. This means on wakeup we make the
		 * previous CPU our target instead of where it really is.
		 */
		p->wake_cpu = cpu;
	}
}

struct migration_swap_arg {
	struct task_struct *src_task, *dst_task;
	int src_cpu, dst_cpu;
};

static int migrate_swap_stop(void *data)
{
	struct migration_swap_arg *arg = data;
	struct rq *src_rq, *dst_rq;

	if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
		return -EAGAIN;

	src_rq = cpu_rq(arg->src_cpu);
	dst_rq = cpu_rq(arg->dst_cpu);

	guard(double_raw_spinlock)(&arg->src_task->pi_lock, &arg->dst_task->pi_lock);
	guard(double_rq_lock)(src_rq, dst_rq);

	if (task_cpu(arg->dst_task) != arg->dst_cpu)
		return -EAGAIN;

	if (task_cpu(arg->src_task) != arg->src_cpu)
		return -EAGAIN;

	if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
		return -EAGAIN;

	if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
		return -EAGAIN;

	__migrate_swap_task(arg->src_task, arg->dst_cpu);
	__migrate_swap_task(arg->dst_task, arg->src_cpu);

	return 0;
}

/*
 * Cross migrate two tasks
 */
int migrate_swap(struct task_struct *cur, struct task_struct *p,
		 int target_cpu, int curr_cpu)
{
	struct migration_swap_arg arg;
	int ret = -EINVAL;

	arg = (struct migration_swap_arg){
		.src_task = cur,
		.src_cpu = curr_cpu,
		.dst_task = p,
		.dst_cpu = target_cpu,
	};

	if (arg.src_cpu == arg.dst_cpu)
		goto out;

	/*
	 * These three tests are all lockless; this is OK since all of them
	 * will be re-checked with proper locks held further down the line.
	 */
	if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
		goto out;

	if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
		goto out;

	if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
		goto out;

	trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
	ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);

out:
	return ret;
}
#endif /* CONFIG_NUMA_BALANCING */
/***
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
 * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel. If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
 * achieved as well.
 */
void kick_process(struct task_struct *p)
{
	int cpu;

	preempt_disable();
	cpu = task_cpu(p);
	if ((cpu != smp_processor_id()) && task_curr(p))
		smp_send_reschedule(cpu);
	preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);
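/*
 * Illustrative sketch (not part of core.c): kick_process() after poking a
 * remote task's state, mirroring what set_notify_resume() / signal delivery
 * do. The helper below is hypothetical; TIF_NOTIFY_RESUME is a real flag the
 * task checks on its way back to user space.
 */
#if 0
#include <linux/sched.h>
#include <linux/thread_info.h>

static void example_request_and_kick(struct task_struct *tsk)
{
	set_tsk_thread_flag(tsk, TIF_NOTIFY_RESUME);	/* leave a request for the task */
	kick_process(tsk);				/* IPI so a running task re-enters the kernel soon */
}
#endif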
/*
 * ->cpus_ptr is protected by both rq->lock and p->pi_lock
 *
 * A few notes on cpu_active vs cpu_online:
 *
 *  - cpu_active must be a subset of cpu_online
 *
 *  - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
 *    see __set_cpus_allowed_ptr(). At this point the newly online
 *    CPU isn't yet part of the sched domains, and balancing will not
 *    see it.
 *
 *  - on CPU-down we clear cpu_active() to mask the sched domains and
 *    avoid the load balancer to place new tasks on the to be removed
 *    CPU. Existing tasks will remain running there and will be taken
 *    off.
 *
 * This means that fallback selection must not select !active CPUs.
 * And can assume that any active CPU must be online. Conversely
 * select_task_rq() below may allow selection of !active CPUs in order
 * to satisfy the above rules.
 */
static int select_fallback_rq(int cpu, struct task_struct *p)
{
	int nid = cpu_to_node(cpu);
	const struct cpumask *nodemask = NULL;
	enum { cpuset, possible, fail } state = cpuset;
	int dest_cpu;

	/*
	 * If the node that the CPU is on has been offlined, cpu_to_node()
	 * will return -1. There is no CPU on the node, and we should
	 * select the CPU on the other node.
	 */
	if (nid != -1) {
		nodemask = cpumask_of_node(nid);

		/* Look for allowed, online CPU in same node. */
		for_each_cpu(dest_cpu, nodemask) {
			if (is_cpu_allowed(p, dest_cpu))
				return dest_cpu;
		}
	}

	for (;;) {
		/* Any allowed, online CPU? */
		for_each_cpu(dest_cpu, p->cpus_ptr) {
			if (!is_cpu_allowed(p, dest_cpu))
				continue;

			goto out;
		}

		/* No more Mr. Nice Guy. */
		switch (state) {
		case cpuset:
			if (cpuset_cpus_allowed_fallback(p)) {
				state = possible;
				break;
			}
			fallthrough;
		case possible:
			/*
			 * XXX When called from select_task_rq() we only
			 * hold p->pi_lock and again violate locking order.
			 *
			 * More yuck to audit.
			 */
			do_set_cpus_allowed(p, task_cpu_possible_mask(p));
			state = fail;
			break;
		case fail:
			BUG();
			break;
		}
	}

out:
	if (state != cpuset) {
		/*
		 * Don't tell them about moving exiting tasks or
		 * kernel threads (both mm NULL), since they never
		 * leave kernel.
		 */
		if (p->mm && printk_ratelimit()) {
			printk_deferred("process %d (%s) no longer affine to cpu%d\n",
					task_pid_nr(p), p->comm, cpu);
		}
	}

	return dest_cpu;
}

/*
 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
 */
static inline
int select_task_rq(struct task_struct *p, int cpu, int wake_flags)
{
	lockdep_assert_held(&p->pi_lock);

	if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
		cpu = p->sched_class->select_task_rq(p, cpu, wake_flags);
	else
		cpu = cpumask_any(p->cpus_ptr);

	/*
	 * In order not to call set_task_cpu() on a blocking task we need
	 * to rely on ttwu() to place the task on a valid ->cpus_ptr
	 * CPU.
	 *
	 * Since this is common to all placement strategies, this lives here.
	 *
	 * [ this allows ->select_task() to simply return task_cpu(p) and
	 *   not worry about this generic constraint ]
	 */
	if (unlikely(!is_cpu_allowed(p, cpu)))
		cpu = select_fallback_rq(task_cpu(p), p);

	return cpu;
}
void sched_set_stop_task(int cpu, struct task_struct *stop)
{
	static struct lock_class_key stop_pi_lock;
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
	struct task_struct *old_stop = cpu_rq(cpu)->stop;

	if (stop) {
		/*
		 * Make it appear like a SCHED_FIFO task, its something
		 * userspace knows about and won't get confused about.
		 *
		 * Also, it will make PI more or less work without too
		 * much confusion -- but then, stop work should not
		 * rely on PI working anyway.
		 */
		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);

		stop->sched_class = &stop_sched_class;

		/*
		 * The PI code calls rt_mutex_setprio() with ->pi_lock held to
		 * adjust the effective priority of a task. As a result,
		 * rt_mutex_setprio() can trigger (RT) balance callbacks,
		 * which can then trigger wakeups of the stop thread to push
		 * around the current task.
		 *
		 * The stop task itself will never be part of the PI-chain, it
		 * never blocks, therefore that ->pi_lock recursion is safe.
		 * Tell lockdep about this by placing the stop->pi_lock in its
		 * own class.
		 */
		lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
	}

	cpu_rq(cpu)->stop = stop;

	if (old_stop) {
		/*
		 * Reset it back to a normal scheduling class so that
		 * it can die in pieces.
		 */
		old_stop->sched_class = &rt_sched_class;
	}
}
#else /* CONFIG_SMP */

static inline int __set_cpus_allowed_ptr(struct task_struct *p,
					 struct affinity_context *ctx)
{
	return set_cpus_allowed_ptr(p, ctx->new_mask);
}

static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }

static inline bool rq_has_pinned_tasks(struct rq *rq)
{
	return false;
}

static inline cpumask_t *alloc_user_cpus_ptr(int node)
{
	return NULL;
}

#endif /* !CONFIG_SMP */
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
	struct rq *rq;

	if (!schedstat_enabled())
		return;

	rq = this_rq();

#ifdef CONFIG_SMP
	if (cpu == rq->cpu) {
		__schedstat_inc(rq->ttwu_local);
		__schedstat_inc(p->stats.nr_wakeups_local);
	} else {
		struct sched_domain *sd;

		__schedstat_inc(p->stats.nr_wakeups_remote);

		guard(rcu)();
		for_each_domain(rq->cpu, sd) {
			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
				__schedstat_inc(sd->ttwu_wake_remote);
				break;
			}
		}
	}

	if (wake_flags & WF_MIGRATED)
		__schedstat_inc(p->stats.nr_wakeups_migrate);
#endif /* CONFIG_SMP */

	__schedstat_inc(rq->ttwu_count);
	__schedstat_inc(p->stats.nr_wakeups);

	if (wake_flags & WF_SYNC)
		__schedstat_inc(p->stats.nr_wakeups_sync);
}
/*
 * Mark the task runnable.
 */
static inline void ttwu_do_wakeup(struct task_struct *p)
{
	WRITE_ONCE(p->__state, TASK_RUNNING);
	trace_sched_wakeup(p);
}

static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
		 struct rq_flags *rf)
{
	int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;

	lockdep_assert_rq_held(rq);

	if (p->sched_contributes_to_load)
		rq->nr_uninterruptible--;

#ifdef CONFIG_SMP
	if (wake_flags & WF_MIGRATED)
		en_flags |= ENQUEUE_MIGRATED;
	else
#endif
	if (p->in_iowait) {
		delayacct_blkio_end(p);
		atomic_dec(&task_rq(p)->nr_iowait);
	}

	activate_task(rq, p, en_flags);
	check_preempt_curr(rq, p, wake_flags);

	ttwu_do_wakeup(p);

#ifdef CONFIG_SMP
	if (p->sched_class->task_woken) {
		/*
		 * Our task @p is fully woken up and running; so it's safe to
		 * drop the rq->lock, hereafter rq is only used for statistics.
		 */
		rq_unpin_lock(rq, rf);
		p->sched_class->task_woken(rq, p);
		rq_repin_lock(rq, rf);
	}

	if (rq->idle_stamp) {
		u64 delta = rq_clock(rq) - rq->idle_stamp;
		u64 max = 2*rq->max_idle_balance_cost;

		update_avg(&rq->avg_idle, delta);

		if (rq->avg_idle > max)
			rq->avg_idle = max;

		rq->wake_stamp = jiffies;
		rq->wake_avg_idle = rq->avg_idle / 2;

		rq->idle_stamp = 0;
	}
#endif
}
/*
 * Consider @p being inside a wait loop:
 *
 *   for (;;) {
 *      set_current_state(TASK_UNINTERRUPTIBLE);
 *
 *      if (CONDITION)
 *         break;
 *
 *      schedule();
 *   }
 *   __set_current_state(TASK_RUNNING);
 *
 * between set_current_state() and schedule(). In this case @p is still
 * runnable, so all that needs doing is change p->state back to TASK_RUNNING in
 * an atomic manner.
 *
 * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq
 * then schedule() must still happen and p->state can be changed to
 * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
 * need to do a full wakeup with enqueue.
 *
 * Returns: %true when the wakeup is done,
 *          %false otherwise.
 */
static int ttwu_runnable(struct task_struct *p, int wake_flags)
{
	struct rq_flags rf;
	struct rq *rq;
	int ret = 0;

	rq = __task_rq_lock(p, &rf);
	if (task_on_rq_queued(p)) {
		if (!task_on_cpu(rq, p)) {
			/*
			 * When on_rq && !on_cpu the task is preempted, see if
			 * it should preempt the task that is current now.
			 */
			update_rq_clock(rq);
			check_preempt_curr(rq, p, wake_flags);
		}
		ttwu_do_wakeup(p);
		ret = 1;
	}
	__task_rq_unlock(rq, &rf);

	return ret;
}
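/*
 * Illustrative sketch (not part of core.c): the canonical wait loop the
 * comment above reasons about, together with the matching waker side. The
 * condition variable and functions are hypothetical.
 */
#if 0
#include <linux/sched.h>

static bool example_condition;
static struct task_struct *example_waiter;

static void example_wait(void)
{
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (READ_ONCE(example_condition))
			break;
		schedule();
	}
	__set_current_state(TASK_RUNNING);
}

static void example_wake(void)
{
	WRITE_ONCE(example_condition, true);
	wake_up_process(example_waiter);	/* may hit the ttwu_runnable() fast path above */
}
#endif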
#ifdef CONFIG_SMP
void sched_ttwu_pending(void *arg)
{
	struct llist_node *llist = arg;
	struct rq *rq = this_rq();
	struct task_struct *p, *t;
	struct rq_flags rf;

	if (!llist)
		return;

	rq_lock_irqsave(rq, &rf);
	update_rq_clock(rq);

	llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
		if (WARN_ON_ONCE(p->on_cpu))
			smp_cond_load_acquire(&p->on_cpu, !VAL);

		if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
			set_task_cpu(p, cpu_of(rq));

		ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
	}

	/*
	 * Must be after enqueueing at least once task such that
	 * idle_cpu() does not observe a false-negative -- if it does,
	 * it is possible for select_idle_siblings() to stack a number
	 * of tasks on this CPU during that window.
	 *
	 * It is ok to clear ttwu_pending when another task pending.
	 * We will receive IPI after local irq enabled and then enqueue it.
	 * Since now nr_running > 0, idle_cpu() will always get correct result.
	 */
	WRITE_ONCE(rq->ttwu_pending, 0);
	rq_unlock_irqrestore(rq, &rf);
}

/*
 * Prepare the scene for sending an IPI for a remote smp_call
 *
 * Returns true if the caller can proceed with sending the IPI.
 * Returns false otherwise.
 */
bool call_function_single_prep_ipi(int cpu)
{
	if (set_nr_if_polling(cpu_rq(cpu)->idle)) {
		trace_sched_wake_idle_without_ipi(cpu);
		return false;
	}

	return true;
}
/*
 * Queue a task on the target CPUs wake_list and wake the CPU if
 * necessary. The wakee CPU on receipt of the IPI will queue the task
 * via sched_ttwu_pending() for activation so the wakee incurs the cost
 * of the wakeup instead of the waker.
 */
static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
	struct rq *rq = cpu_rq(cpu);

	p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);

	WRITE_ONCE(rq->ttwu_pending, 1);
	__smp_call_single_queue(cpu, &p->wake_entry.llist);
}

void wake_up_if_idle(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	guard(rcu)();
	if (is_idle_task(rcu_dereference(rq->curr))) {
		guard(rq_lock_irqsave)(rq);
		if (is_idle_task(rq->curr))
			resched_curr(rq);
	}
}

bool cpus_share_cache(int this_cpu, int that_cpu)
{
	if (this_cpu == that_cpu)
		return true;

	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
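/*
 * Illustrative sketch (not part of the kernel source): the first heuristic
 * ttwu_queue_cond() applies below -- only defer work to the target CPU when
 * it sits under a different last-level cache, so remote data is written
 * where it will be consumed.  my_prefer_remote_queueing() is hypothetical;
 * like ttwu_queue_cond()'s callers, it must run with preemption disabled
 * because it uses smp_processor_id().
 */
static bool my_prefer_remote_queueing(int target_cpu)
{
	/* Same LLC: the remote run-queue is cheap to touch directly. */
	return !cpus_share_cache(smp_processor_id(), target_cpu);
}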
static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
{
	/*
	 * Do not complicate things with the async wake_list while the CPU is
	 * in hotplug state.
	 */
	if (!cpu_active(cpu))
		return false;

	/* Ensure the task will still be allowed to run on the CPU. */
	if (!cpumask_test_cpu(cpu, p->cpus_ptr))
		return false;

	/*
	 * If the CPU does not share cache, then queue the task on the
	 * remote rqs wakelist to avoid accessing remote data.
	 */
	if (!cpus_share_cache(smp_processor_id(), cpu))
		return true;

	if (cpu == smp_processor_id())
		return false;

	/*
	 * If the wakee cpu is idle, or the task is descheduling and the
	 * only running task on the CPU, then use the wakelist to offload
	 * the task activation to the idle (or soon-to-be-idle) CPU as
	 * the current CPU is likely busy. nr_running is checked to
	 * avoid unnecessary task stacking.
	 *
	 * Note that we can only get here with (wakee) p->on_rq=0,
	 * p->on_cpu can be whatever, we've done the dequeue, so
	 * the wakee has been accounted out of ->nr_running.
	 */
	if (!cpu_rq(cpu)->nr_running)
		return true;

	return false;
}

static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
	if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(p, cpu)) {
		sched_clock_cpu(cpu); /* Sync clocks across CPUs */
		__ttwu_queue_wakelist(p, cpu, wake_flags);
		return true;
	}

	return false;
}

#else /* !CONFIG_SMP */

static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
	return false;
}

#endif /* CONFIG_SMP */

static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
{
	struct rq *rq = cpu_rq(cpu);
	struct rq_flags rf;

	if (ttwu_queue_wakelist(p, cpu, wake_flags))
		return;

	rq_lock(rq, &rf);
	update_rq_clock(rq);
	ttwu_do_activate(rq, p, wake_flags, &rf);
	rq_unlock(rq, &rf);
}

/*
 * Invoked from try_to_wake_up() to check whether the task can be woken up.
 *
 * The caller holds p::pi_lock if p != current or has preemption
 * disabled when p == current.
 *
 * The rules of PREEMPT_RT saved_state:
 *
 *   The related locking code always holds p::pi_lock when updating
 *   p::saved_state, which means the code is fully serialized in both cases.
 *
 *   The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other
 *   bits set. This allows to distinguish all wakeup scenarios.
 */
static __always_inline
bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
{
	int match;

	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
		WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
			     state != TASK_RTLOCK_WAIT);
	}

	*success = !!(match = __task_state_match(p, state));

#ifdef CONFIG_PREEMPT_RT
	/*
	 * Saved state preserves the task state across blocking on
	 * an RT lock.  If the state matches, set p::saved_state to
	 * TASK_RUNNING, but do not wake the task because it waits
	 * for a lock wakeup.
Also indicate success because f * the regular waker's point of view this has succeed * the regular waker's point of view this has succeed * * * After acquiring the lock the task will restore p:: * After acquiring the lock the task will restore p:: * from p::saved_state which ensures that the regular * from p::saved_state which ensures that the regular * wakeup is not lost. The restore will also set * wakeup is not lost. The restore will also set * p::saved_state to TASK_RUNNING so any further test * p::saved_state to TASK_RUNNING so any further test * not result in false positives vs. @success * not result in false positives vs. @success */ */ if (match < 0) if (match < 0) p->saved_state = TASK_RUNNING; p->saved_state = TASK_RUNNING; #endif #endif return match > 0; return match > 0; } } /* /* * Notes on Program-Order guarantees on SMP systems. * Notes on Program-Order guarantees on SMP systems. * * * MIGRATION * MIGRATION * * * The basic program-order guarantee on SMP systems is that w * The basic program-order guarantee on SMP systems is that w * migrates, all its activity on its old CPU [c0] happens-bef * migrates, all its activity on its old CPU [c0] happens-bef * execution on its new CPU [c1]. * execution on its new CPU [c1]. * * * For migration (of runnable tasks) this is provided by the * For migration (of runnable tasks) this is provided by the * * * A) UNLOCK of the rq(c0)->lock scheduling out task t * A) UNLOCK of the rq(c0)->lock scheduling out task t * B) migration for t is required to synchronize *both* rq(c * B) migration for t is required to synchronize *both* rq(c * rq(c1)->lock (if not at the same time, then in that or * rq(c1)->lock (if not at the same time, then in that or * C) LOCK of the rq(c1)->lock scheduling in task * C) LOCK of the rq(c1)->lock scheduling in task * * * Release/acquire chaining guarantees that B happens after A * Release/acquire chaining guarantees that B happens after A * Note: the CPU doing B need not be c0 or c1 * Note: the CPU doing B need not be c0 or c1 * * * Example: * Example: * * * CPU0 CPU1 CPU2 * CPU0 CPU1 CPU2 * * * LOCK rq(0)->lock * LOCK rq(0)->lock * sched-out X * sched-out X * sched-in Y * sched-in Y * UNLOCK rq(0)->lock * UNLOCK rq(0)->lock * * * LOCK rq(0)->lock // orde * LOCK rq(0)->lock // orde * dequeue X * dequeue X * UNLOCK rq(0)->lock * UNLOCK rq(0)->lock * * * LOCK rq(1)->lock * LOCK rq(1)->lock * enqueue X * enqueue X * UNLOCK rq(1)->lock * UNLOCK rq(1)->lock * * * LOCK rq(1)->lock // orders against CPU2 * LOCK rq(1)->lock // orders against CPU2 * sched-out Z * sched-out Z * sched-in X * sched-in X * UNLOCK rq(1)->lock * UNLOCK rq(1)->lock * * * * * BLOCKING -- aka. SLEEP + WAKEUP * BLOCKING -- aka. SLEEP + WAKEUP * * * For blocking we (obviously) need to provide the same guara * For blocking we (obviously) need to provide the same guara * migration. However the means are completely different as t * migration. However the means are completely different as t * chain to provide order. Instead we do: * chain to provide order. 
Instead we do: * * * 1) smp_store_release(X->on_cpu, 0) -- finish_task() * 1) smp_store_release(X->on_cpu, 0) -- finish_task() * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up() * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up() * * * Example: * Example: * * * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) * * * LOCK rq(0)->lock LOCK X->pi_lock * LOCK rq(0)->lock LOCK X->pi_lock * dequeue X * dequeue X * sched-out X * sched-out X * smp_store_release(X->on_cpu, 0); * smp_store_release(X->on_cpu, 0); * * * smp_cond_load_acquire(&X->on_cpu, !VAL) * smp_cond_load_acquire(&X->on_cpu, !VAL) * X->state = WAKING * X->state = WAKING * set_task_cpu(X,2) * set_task_cpu(X,2) * * * LOCK rq(2)->lock * LOCK rq(2)->lock * enqueue X * enqueue X * X->state = RUNNING * X->state = RUNNING * UNLOCK rq(2)->lock * UNLOCK rq(2)->lock * * * LOCK rq(2)->lock * LOCK rq(2)->lock * sched-out Z * sched-out Z * sched-in X * sched-in X * UNLOCK rq(2)->loc * UNLOCK rq(2)->loc * * * UNLOCK X->pi_lock * UNLOCK X->pi_lock * UNLOCK rq(0)->lock * UNLOCK rq(0)->lock * * * * * However, for wakeups there is a second guarantee we must p * However, for wakeups there is a second guarantee we must p * must ensure that CONDITION=1 done by the caller can not be * must ensure that CONDITION=1 done by the caller can not be * accesses to the task state; see try_to_wake_up() and set_c * accesses to the task state; see try_to_wake_up() and set_c */ */ /** /** * try_to_wake_up - wake up a thread * try_to_wake_up - wake up a thread * @p: the thread to be awakened * @p: the thread to be awakened * @state: the mask of task states that can be woken * @state: the mask of task states that can be woken * @wake_flags: wake modifier flags (WF_*) * @wake_flags: wake modifier flags (WF_*) * * * Conceptually does: * Conceptually does: * * * If (@state & @p->state) @p->state = TASK_RUNNING. * If (@state & @p->state) @p->state = TASK_RUNNING. * * * If the task was not queued/runnable, also place it back on * If the task was not queued/runnable, also place it back on * * * This function is atomic against schedule() which would deq * This function is atomic against schedule() which would deq * * * It issues a full memory barrier before accessing @p->state * It issues a full memory barrier before accessing @p->state * with set_current_state(). * with set_current_state(). * * * Uses p->pi_lock to serialize against concurrent wake-ups. * Uses p->pi_lock to serialize against concurrent wake-ups. * * * Relies on p->pi_lock stabilizing: * Relies on p->pi_lock stabilizing: * - p->sched_class * - p->sched_class * - p->cpus_ptr * - p->cpus_ptr * - p->sched_task_group * - p->sched_task_group * in order to do migration, see its use of select_task_rq()/ * in order to do migration, see its use of select_task_rq()/ * * * Tries really hard to only take one task_rq(p)->lock for pe * Tries really hard to only take one task_rq(p)->lock for pe * Takes rq->lock in: * Takes rq->lock in: * - ttwu_runnable() -- old rq, unavoidable, see comment * - ttwu_runnable() -- old rq, unavoidable, see comment * - ttwu_queue() -- new rq, for enqueue of the task; * - ttwu_queue() -- new rq, for enqueue of the task; * - psi_ttwu_dequeue() -- much sadness :-( accounting will * - psi_ttwu_dequeue() -- much sadness :-( accounting will * * * As a consequence we race really badly with just about ever * As a consequence we race really badly with just about ever * many memory barriers and their comments for details. 
* many memory barriers and their comments for details. * * * Return: %true if @p->state changes (an actual wakeup was d * Return: %true if @p->state changes (an actual wakeup was d * %false otherwise. * %false otherwise. */ */ int try_to_wake_up(struct task_struct *p, unsigned int state, int try_to_wake_up(struct task_struct *p, unsigned int state, { { guard(preempt)(); guard(preempt)(); int cpu, success = 0; int cpu, success = 0; if (p == current) { if (p == current) { /* /* * We're waking current, this means 'p->on_rq * We're waking current, this means 'p->on_rq * == smp_processor_id()'. Together this mean * == smp_processor_id()'. Together this mean * case the whole 'p->on_rq && ttwu_runnable( * case the whole 'p->on_rq && ttwu_runnable( * without taking any locks. * without taking any locks. * * * In particular: * In particular: * - we rely on Program-Order guarantees for * - we rely on Program-Order guarantees for * - we're serialized against set_special_st * - we're serialized against set_special_st * it disabling IRQs (this allows not taki * it disabling IRQs (this allows not taki */ */ if (!ttwu_state_match(p, state, &success)) if (!ttwu_state_match(p, state, &success)) goto out; goto out; trace_sched_waking(p); trace_sched_waking(p); ttwu_do_wakeup(p); ttwu_do_wakeup(p); goto out; goto out; } } /* /* * If we are going to wake up a thread waiting for CO * If we are going to wake up a thread waiting for CO * need to ensure that CONDITION=1 done by the caller * need to ensure that CONDITION=1 done by the caller * reordered with p->state check below. This pairs wi * reordered with p->state check below. This pairs wi * in set_current_state() that the waiting thread doe * in set_current_state() that the waiting thread doe */ */ scoped_guard (raw_spinlock_irqsave, &p->pi_lock) { scoped_guard (raw_spinlock_irqsave, &p->pi_lock) { smp_mb__after_spinlock(); smp_mb__after_spinlock(); if (!ttwu_state_match(p, state, &success)) if (!ttwu_state_match(p, state, &success)) break; break; trace_sched_waking(p); trace_sched_waking(p); /* /* * Ensure we load p->on_rq _after_ p->state, * Ensure we load p->on_rq _after_ p->state, * be possible to, falsely, observe p->on_rq * be possible to, falsely, observe p->on_rq * in smp_cond_load_acquire() below. * in smp_cond_load_acquire() below. * * * sched_ttwu_pending() try_t * sched_ttwu_pending() try_t * STORE p->on_rq = 1 LOA * STORE p->on_rq = 1 LOA * UNLOCK rq->lock * UNLOCK rq->lock * * * __schedule() (switch to task 'p') * __schedule() (switch to task 'p') * LOCK rq->lock smp * LOCK rq->lock smp * smp_mb__after_spinlock(); * smp_mb__after_spinlock(); * UNLOCK rq->lock * UNLOCK rq->lock * * * [task p] * [task p] * STORE p->state = UNINTERRUPTIBLE LOA * STORE p->state = UNINTERRUPTIBLE LOA * * * Pairs with the LOCK+smp_mb__after_spinlock * Pairs with the LOCK+smp_mb__after_spinlock * __schedule(). See the comment for smp_mb_ * __schedule(). See the comment for smp_mb_ * * * A similar smb_rmb() lives in try_invoke_on * A similar smb_rmb() lives in try_invoke_on */ */ smp_rmb(); smp_rmb(); if (READ_ONCE(p->on_rq) && ttwu_runnable(p, w if (READ_ONCE(p->on_rq) && ttwu_runnable(p, w break; break; #ifdef CONFIG_SMP #ifdef CONFIG_SMP /* /* * Ensure we load p->on_cpu _after_ p->on_rq, * Ensure we load p->on_cpu _after_ p->on_rq, * possible to, falsely, observe p->on_cpu == * possible to, falsely, observe p->on_cpu == * * * One must be running (->on_cpu == 1) in ord * One must be running (->on_cpu == 1) in ord * from the runqueue. * from the runqueue. 
* * * __schedule() (switch to task 'p') try_t * __schedule() (switch to task 'p') try_t * STORE p->on_cpu = 1 LOA * STORE p->on_cpu = 1 LOA * UNLOCK rq->lock * UNLOCK rq->lock * * * __schedule() (put 'p' to sleep) * __schedule() (put 'p' to sleep) * LOCK rq->lock smp * LOCK rq->lock smp * smp_mb__after_spinlock(); * smp_mb__after_spinlock(); * STORE p->on_rq = 0 LOA * STORE p->on_rq = 0 LOA * * * Pairs with the LOCK+smp_mb__after_spinlock * Pairs with the LOCK+smp_mb__after_spinlock * __schedule(). See the comment for smp_mb_ * __schedule(). See the comment for smp_mb_ * * * Form a control-dep-acquire with p->on_rq = * Form a control-dep-acquire with p->on_rq = * schedule()'s deactivate_task() has 'happen * schedule()'s deactivate_task() has 'happen * care about it's own p->state. See the comm * care about it's own p->state. See the comm */ */ smp_acquire__after_ctrl_dep(); smp_acquire__after_ctrl_dep(); /* /* * We're doing the wakeup (@success == 1), th * We're doing the wakeup (@success == 1), th * == 0), which means we need to do an enqueu * == 0), which means we need to do an enqueu * TASK_WAKING such that we can unlock p->pi_ * TASK_WAKING such that we can unlock p->pi_ * enqueue, such as ttwu_queue_wakelist(). * enqueue, such as ttwu_queue_wakelist(). */ */ WRITE_ONCE(p->__state, TASK_WAKING); WRITE_ONCE(p->__state, TASK_WAKING); /* /* * If the owning (remote) CPU is still in the * If the owning (remote) CPU is still in the * this task as prev, considering queueing p * this task as prev, considering queueing p * which potentially sends an IPI instead of * which potentially sends an IPI instead of * let the waker make forward progress. This * let the waker make forward progress. This * disabled and the IPI will deliver after on * disabled and the IPI will deliver after on * * * Ensure we load task_cpu(p) after p->on_cpu * Ensure we load task_cpu(p) after p->on_cpu * * * set_task_cpu(p, cpu); * set_task_cpu(p, cpu); * STORE p->cpu = @cpu * STORE p->cpu = @cpu * __schedule() (switch to task 'p') * __schedule() (switch to task 'p') * LOCK rq->lock * LOCK rq->lock * smp_mb__after_spin_lock() smp_c * smp_mb__after_spin_lock() smp_c * STORE p->on_cpu = 1 LOAD * STORE p->on_cpu = 1 LOAD * * * to ensure we observe the correct CPU on wh * to ensure we observe the correct CPU on wh * scheduling. * scheduling. 
*/ */ if (smp_load_acquire(&p->on_cpu) && if (smp_load_acquire(&p->on_cpu) && ttwu_queue_wakelist(p, task_cpu(p), wake_ ttwu_queue_wakelist(p, task_cpu(p), wake_ break; break; /* /* * If the owning (remote) CPU is still in the * If the owning (remote) CPU is still in the * this task as prev, wait until it's done re * this task as prev, wait until it's done re * * * Pairs with the smp_store_release() in fini * Pairs with the smp_store_release() in fini * * * This ensures that tasks getting woken will * This ensures that tasks getting woken will * their previous state and preserve Program * their previous state and preserve Program */ */ smp_cond_load_acquire(&p->on_cpu, !VAL); smp_cond_load_acquire(&p->on_cpu, !VAL); cpu = select_task_rq(p, p->wake_cpu, wake_fla cpu = select_task_rq(p, p->wake_cpu, wake_fla if (task_cpu(p) != cpu) { if (task_cpu(p) != cpu) { if (p->in_iowait) { if (p->in_iowait) { delayacct_blkio_end(p); delayacct_blkio_end(p); atomic_dec(&task_rq(p)->nr_io atomic_dec(&task_rq(p)->nr_io } } wake_flags |= WF_MIGRATED; wake_flags |= WF_MIGRATED; psi_ttwu_dequeue(p); psi_ttwu_dequeue(p); set_task_cpu(p, cpu); set_task_cpu(p, cpu); } } #else #else cpu = task_cpu(p); cpu = task_cpu(p); #endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */ ttwu_queue(p, cpu, wake_flags); ttwu_queue(p, cpu, wake_flags); } } out: out: if (success) if (success) ttwu_stat(p, task_cpu(p), wake_flags); ttwu_stat(p, task_cpu(p), wake_flags); return success; return success; } } static bool __task_needs_rq_lock(struct task_struct *p) static bool __task_needs_rq_lock(struct task_struct *p) { { unsigned int state = READ_ONCE(p->__state); unsigned int state = READ_ONCE(p->__state); /* /* * Since pi->lock blocks try_to_wake_up(), we don't n * Since pi->lock blocks try_to_wake_up(), we don't n * the task is blocked. Make sure to check @state sin * the task is blocked. Make sure to check @state sin * locks at the end, see ttwu_queue_wakelist(). * locks at the end, see ttwu_queue_wakelist(). */ */ if (state == TASK_RUNNING || state == TASK_WAKING) if (state == TASK_RUNNING || state == TASK_WAKING) return true; return true; /* /* * Ensure we load p->on_rq after p->__state, otherwis * Ensure we load p->on_rq after p->__state, otherwis * possible to, falsely, observe p->on_rq == 0. * possible to, falsely, observe p->on_rq == 0. * * * See try_to_wake_up() for a longer comment. * See try_to_wake_up() for a longer comment. */ */ smp_rmb(); smp_rmb(); if (p->on_rq) if (p->on_rq) return true; return true; #ifdef CONFIG_SMP #ifdef CONFIG_SMP /* /* * Ensure the task has finished __schedule() and will * Ensure the task has finished __schedule() and will * anymore. Again, see try_to_wake_up() for a longer * anymore. Again, see try_to_wake_up() for a longer */ */ smp_rmb(); smp_rmb(); smp_cond_load_acquire(&p->on_cpu, !VAL); smp_cond_load_acquire(&p->on_cpu, !VAL); #endif #endif return false; return false; } } /** /** * task_call_func - Invoke a function on task in fixed state * task_call_func - Invoke a function on task in fixed state * @p: Process for which the function is to be invoked, can b * @p: Process for which the function is to be invoked, can b * @func: Function to invoke. * @func: Function to invoke. * @arg: Argument to function. * @arg: Argument to function. * * * Fix the task in it's current state by avoiding wakeups and * Fix the task in it's current state by avoiding wakeups and * and call @func(@arg) on it. This function can use ->on_rq * and call @func(@arg) on it. 
This function can use ->on_rq * to work out what the state is, if required. Given that @f * to work out what the state is, if required. Given that @f * with a runqueue lock held, it had better be quite lightwei * with a runqueue lock held, it had better be quite lightwei * * * Returns: * Returns: * Whatever @func returns * Whatever @func returns */ */ int task_call_func(struct task_struct *p, task_call_f func, v int task_call_func(struct task_struct *p, task_call_f func, v { { struct rq *rq = NULL; struct rq *rq = NULL; struct rq_flags rf; struct rq_flags rf; int ret; int ret; raw_spin_lock_irqsave(&p->pi_lock, rf.flags); raw_spin_lock_irqsave(&p->pi_lock, rf.flags); if (__task_needs_rq_lock(p)) if (__task_needs_rq_lock(p)) rq = __task_rq_lock(p, &rf); rq = __task_rq_lock(p, &rf); /* /* * At this point the task is pinned; either: * At this point the task is pinned; either: * - blocked and we're holding off wakeups (pi- * - blocked and we're holding off wakeups (pi- * - woken, and we're holding off enqueue (rq- * - woken, and we're holding off enqueue (rq- * - queued, and we're holding off schedule (rq- * - queued, and we're holding off schedule (rq- * - running, and we're holding off de-schedule (rq- * - running, and we're holding off de-schedule (rq- * * * The called function (@func) can use: task_curr(), * The called function (@func) can use: task_curr(), * p->__state to differentiate between these states. * p->__state to differentiate between these states. */ */ ret = func(p, arg); ret = func(p, arg); if (rq) if (rq) rq_unlock(rq, &rf); rq_unlock(rq, &rf); raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); return ret; return ret; } } /** /** * cpu_curr_snapshot - Return a snapshot of the currently run * cpu_curr_snapshot - Return a snapshot of the currently run * @cpu: The CPU on which to snapshot the task. * @cpu: The CPU on which to snapshot the task. * * * Returns the task_struct pointer of the task "currently" ru * Returns the task_struct pointer of the task "currently" ru * the specified CPU. If the same task is running on that CP * the specified CPU. If the same task is running on that CP * the return value will be a pointer to that task's task_str * the return value will be a pointer to that task's task_str * If the CPU did any context switches even vaguely concurren * If the CPU did any context switches even vaguely concurren * execution of this function, the return value will be a poi * execution of this function, the return value will be a poi * task_struct structure of a randomly chosen task that was r * task_struct structure of a randomly chosen task that was r * that CPU somewhere around the time that this function was * that CPU somewhere around the time that this function was * * * If the specified CPU was offline, the return value is what * If the specified CPU was offline, the return value is what * is, perhaps a pointer to the task_struct structure of that * is, perhaps a pointer to the task_struct structure of that * task, but there is no guarantee. Callers wishing a useful * task, but there is no guarantee. Callers wishing a useful * value must take some action to ensure that the specified C * value must take some action to ensure that the specified C * online throughout. * online throughout. 
* * * This function executes full memory barriers before and aft * This function executes full memory barriers before and aft * the pointer, which permits the caller to confine this func * the pointer, which permits the caller to confine this func * with respect to the caller's accesses to other shared vari * with respect to the caller's accesses to other shared vari */ */ struct task_struct *cpu_curr_snapshot(int cpu) struct task_struct *cpu_curr_snapshot(int cpu) { { struct task_struct *t; struct task_struct *t; smp_mb(); /* Pairing determined by caller's synchroni smp_mb(); /* Pairing determined by caller's synchroni t = rcu_dereference(cpu_curr(cpu)); t = rcu_dereference(cpu_curr(cpu)); smp_mb(); /* Pairing determined by caller's synchroni smp_mb(); /* Pairing determined by caller's synchroni return t; return t; } } /** /** * wake_up_process - Wake up a specific process * wake_up_process - Wake up a specific process * @p: The process to be woken up. * @p: The process to be woken up. * * * Attempt to wake up the nominated process and move it to th * Attempt to wake up the nominated process and move it to th * processes. * processes. * * * Return: 1 if the process was woken up, 0 if it was already * Return: 1 if the process was woken up, 0 if it was already * * * This function executes a full memory barrier before access * This function executes a full memory barrier before access */ */ int wake_up_process(struct task_struct *p) int wake_up_process(struct task_struct *p) { { return try_to_wake_up(p, TASK_NORMAL, 0); return try_to_wake_up(p, TASK_NORMAL, 0); } } EXPORT_SYMBOL(wake_up_process); EXPORT_SYMBOL(wake_up_process); int wake_up_state(struct task_struct *p, unsigned int state) int wake_up_state(struct task_struct *p, unsigned int state) { { return try_to_wake_up(p, state, 0); return try_to_wake_up(p, state, 0); } } /* /* * Perform scheduler related setup for a newly forked process * Perform scheduler related setup for a newly forked process * p is forked by current. * p is forked by current. 
* * * __sched_fork() is basic setup used by init_idle() too: * __sched_fork() is basic setup used by init_idle() too: */ */ static void __sched_fork(unsigned long clone_flags, struct ta static void __sched_fork(unsigned long clone_flags, struct ta { { p->on_rq = 0; p->on_rq = 0; p->se.on_rq = 0; p->se.on_rq = 0; p->se.exec_start = 0; p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; p->se.vruntime = 0; p->se.vlag = 0; p->se.vlag = 0; p->se.slice = sysctl_sched_base_s p->se.slice = sysctl_sched_base_s INIT_LIST_HEAD(&p->se.group_node); INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; p->se.cfs_rq = NULL; #endif #endif #ifdef CONFIG_SCHEDSTATS #ifdef CONFIG_SCHEDSTATS /* Even if schedstat is disabled, there should not be /* Even if schedstat is disabled, there should not be memset(&p->stats, 0, sizeof(p->stats)); memset(&p->stats, 0, sizeof(p->stats)); #endif #endif RB_CLEAR_NODE(&p->dl.rb_node); RB_CLEAR_NODE(&p->dl.rb_node); init_dl_task_timer(&p->dl); init_dl_task_timer(&p->dl); init_dl_inactive_task_timer(&p->dl); init_dl_inactive_task_timer(&p->dl); __dl_clear_params(p); __dl_clear_params(p); INIT_LIST_HEAD(&p->rt.run_list); INIT_LIST_HEAD(&p->rt.run_list); p->rt.timeout = 0; p->rt.timeout = 0; p->rt.time_slice = sched_rr_timeslice; p->rt.time_slice = sched_rr_timeslice; p->rt.on_rq = 0; p->rt.on_rq = 0; p->rt.on_list = 0; p->rt.on_list = 0; #ifdef CONFIG_PREEMPT_NOTIFIERS #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); INIT_HLIST_HEAD(&p->preempt_notifiers); #endif #endif #ifdef CONFIG_COMPACTION #ifdef CONFIG_COMPACTION p->capture_control = NULL; p->capture_control = NULL; #endif #endif init_numa_balancing(clone_flags, p); init_numa_balancing(clone_flags, p); #ifdef CONFIG_SMP #ifdef CONFIG_SMP p->wake_entry.u_flags = CSD_TYPE_TTWU; p->wake_entry.u_flags = CSD_TYPE_TTWU; p->migration_pending = NULL; p->migration_pending = NULL; #endif #endif init_sched_mm_cid(p); init_sched_mm_cid(p); } } DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); #ifdef CONFIG_NUMA_BALANCING #ifdef CONFIG_NUMA_BALANCING int sysctl_numa_balancing_mode; int sysctl_numa_balancing_mode; static void __set_numabalancing_state(bool enabled) static void __set_numabalancing_state(bool enabled) { { if (enabled) if (enabled) static_branch_enable(&sched_numa_balancing); static_branch_enable(&sched_numa_balancing); else else static_branch_disable(&sched_numa_balancing); static_branch_disable(&sched_numa_balancing); } } void set_numabalancing_state(bool enabled) void set_numabalancing_state(bool enabled) { { if (enabled) if (enabled) sysctl_numa_balancing_mode = NUMA_BALANCING_N sysctl_numa_balancing_mode = NUMA_BALANCING_N else else sysctl_numa_balancing_mode = NUMA_BALANCING_D sysctl_numa_balancing_mode = NUMA_BALANCING_D __set_numabalancing_state(enabled); __set_numabalancing_state(enabled); } } #ifdef CONFIG_PROC_SYSCTL #ifdef CONFIG_PROC_SYSCTL static void reset_memory_tiering(void) static void reset_memory_tiering(void) { { struct pglist_data *pgdat; struct pglist_data *pgdat; for_each_online_pgdat(pgdat) { for_each_online_pgdat(pgdat) { pgdat->nbp_threshold = 0; pgdat->nbp_threshold = 0; pgdat->nbp_th_nr_cand = node_page_state(pgdat pgdat->nbp_th_nr_cand = node_page_state(pgdat pgdat->nbp_th_start = 
jiffies_to_msecs(jiffie pgdat->nbp_th_start = jiffies_to_msecs(jiffie } } } } static int sysctl_numa_balancing(struct ctl_table *table, int static int sysctl_numa_balancing(struct ctl_table *table, int void *buffer, size_t *lenp, loff_t void *buffer, size_t *lenp, loff_t { { struct ctl_table t; struct ctl_table t; int err; int err; int state = sysctl_numa_balancing_mode; int state = sysctl_numa_balancing_mode; if (write && !capable(CAP_SYS_ADMIN)) if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; return -EPERM; t = *table; t = *table; t.data = &state; t.data = &state; err = proc_dointvec_minmax(&t, write, buffer, lenp, p err = proc_dointvec_minmax(&t, write, buffer, lenp, p if (err < 0) if (err < 0) return err; return err; if (write) { if (write) { if (!(sysctl_numa_balancing_mode & NUMA_BALAN if (!(sysctl_numa_balancing_mode & NUMA_BALAN (state & NUMA_BALANCING_MEMORY_TIERING)) (state & NUMA_BALANCING_MEMORY_TIERING)) reset_memory_tiering(); reset_memory_tiering(); sysctl_numa_balancing_mode = state; sysctl_numa_balancing_mode = state; __set_numabalancing_state(state); __set_numabalancing_state(state); } } return err; return err; } } #endif #endif #endif #endif #ifdef CONFIG_SCHEDSTATS #ifdef CONFIG_SCHEDSTATS DEFINE_STATIC_KEY_FALSE(sched_schedstats); DEFINE_STATIC_KEY_FALSE(sched_schedstats); static void set_schedstats(bool enabled) static void set_schedstats(bool enabled) { { if (enabled) if (enabled) static_branch_enable(&sched_schedstats); static_branch_enable(&sched_schedstats); else else static_branch_disable(&sched_schedstats); static_branch_disable(&sched_schedstats); } } void force_schedstat_enabled(void) void force_schedstat_enabled(void) { { if (!schedstat_enabled()) { if (!schedstat_enabled()) { pr_info("kernel profiling enabled schedstats, pr_info("kernel profiling enabled schedstats, static_branch_enable(&sched_schedstats); static_branch_enable(&sched_schedstats); } } } } static int __init setup_schedstats(char *str) static int __init setup_schedstats(char *str) { { int ret = 0; int ret = 0; if (!str) if (!str) goto out; goto out; if (!strcmp(str, "enable")) { if (!strcmp(str, "enable")) { set_schedstats(true); set_schedstats(true); ret = 1; ret = 1; } else if (!strcmp(str, "disable")) { } else if (!strcmp(str, "disable")) { set_schedstats(false); set_schedstats(false); ret = 1; ret = 1; } } out: out: if (!ret) if (!ret) pr_warn("Unable to parse schedstats=\n"); pr_warn("Unable to parse schedstats=\n"); return ret; return ret; } } __setup("schedstats=", setup_schedstats); __setup("schedstats=", setup_schedstats); #ifdef CONFIG_PROC_SYSCTL #ifdef CONFIG_PROC_SYSCTL static int sysctl_schedstats(struct ctl_table *table, int wri static int sysctl_schedstats(struct ctl_table *table, int wri size_t *lenp, loff_t *ppos) size_t *lenp, loff_t *ppos) { { struct ctl_table t; struct ctl_table t; int err; int err; int state = static_branch_likely(&sched_schedstats); int state = static_branch_likely(&sched_schedstats); if (write && !capable(CAP_SYS_ADMIN)) if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; return -EPERM; t = *table; t = *table; t.data = &state; t.data = &state; err = proc_dointvec_minmax(&t, write, buffer, lenp, p err = proc_dointvec_minmax(&t, write, buffer, lenp, p if (err < 0) if (err < 0) return err; return err; if (write) if (write) set_schedstats(state); set_schedstats(state); return err; return err; } } #endif /* CONFIG_PROC_SYSCTL */ #endif /* CONFIG_PROC_SYSCTL */ #endif /* CONFIG_SCHEDSTATS */ #endif /* CONFIG_SCHEDSTATS */ #ifdef CONFIG_SYSCTL #ifdef 
CONFIG_SYSCTL static struct ctl_table sched_core_sysctls[] = { static struct ctl_table sched_core_sysctls[] = { #ifdef CONFIG_SCHEDSTATS #ifdef CONFIG_SCHEDSTATS { { .procname = "sched_schedstats", .procname = "sched_schedstats", .data = NULL, .data = NULL, .maxlen = sizeof(unsigned int), .maxlen = sizeof(unsigned int), .mode = 0644, .mode = 0644, .proc_handler = sysctl_schedstats, .proc_handler = sysctl_schedstats, .extra1 = SYSCTL_ZERO, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, .extra2 = SYSCTL_ONE, }, }, #endif /* CONFIG_SCHEDSTATS */ #endif /* CONFIG_SCHEDSTATS */ #ifdef CONFIG_UCLAMP_TASK #ifdef CONFIG_UCLAMP_TASK { { .procname = "sched_util_clamp_min", .procname = "sched_util_clamp_min", .data = &sysctl_sched_uclamp_util_m .data = &sysctl_sched_uclamp_util_m .maxlen = sizeof(unsigned int), .maxlen = sizeof(unsigned int), .mode = 0644, .mode = 0644, .proc_handler = sysctl_sched_uclamp_handler .proc_handler = sysctl_sched_uclamp_handler }, }, { { .procname = "sched_util_clamp_max", .procname = "sched_util_clamp_max", .data = &sysctl_sched_uclamp_util_m .data = &sysctl_sched_uclamp_util_m .maxlen = sizeof(unsigned int), .maxlen = sizeof(unsigned int), .mode = 0644, .mode = 0644, .proc_handler = sysctl_sched_uclamp_handler .proc_handler = sysctl_sched_uclamp_handler }, }, { { .procname = "sched_util_clamp_min_rt_de .procname = "sched_util_clamp_min_rt_de .data = &sysctl_sched_uclamp_util_m .data = &sysctl_sched_uclamp_util_m .maxlen = sizeof(unsigned int), .maxlen = sizeof(unsigned int), .mode = 0644, .mode = 0644, .proc_handler = sysctl_sched_uclamp_handler .proc_handler = sysctl_sched_uclamp_handler }, }, #endif /* CONFIG_UCLAMP_TASK */ #endif /* CONFIG_UCLAMP_TASK */ #ifdef CONFIG_NUMA_BALANCING #ifdef CONFIG_NUMA_BALANCING { { .procname = "numa_balancing", .procname = "numa_balancing", .data = NULL, /* filled in by handl .data = NULL, /* filled in by handl .maxlen = sizeof(unsigned int), .maxlen = sizeof(unsigned int), .mode = 0644, .mode = 0644, .proc_handler = sysctl_numa_balancing, .proc_handler = sysctl_numa_balancing, .extra1 = SYSCTL_ZERO, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_FOUR, .extra2 = SYSCTL_FOUR, }, }, #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_NUMA_BALANCING */ {} {} }; }; static int __init sched_core_sysctl_init(void) static int __init sched_core_sysctl_init(void) { { register_sysctl_init("kernel", sched_core_sysctls); register_sysctl_init("kernel", sched_core_sysctls); return 0; return 0; } } late_initcall(sched_core_sysctl_init); late_initcall(sched_core_sysctl_init); #endif /* CONFIG_SYSCTL */ #endif /* CONFIG_SYSCTL */ /* /* * fork()/clone()-time setup: * fork()/clone()-time setup: */ */ int sched_fork(unsigned long clone_flags, struct task_struct int sched_fork(unsigned long clone_flags, struct task_struct { { __sched_fork(clone_flags, p); __sched_fork(clone_flags, p); /* /* * We mark the process as NEW here. This guarantees t * We mark the process as NEW here. 
This guarantees t * nobody will actually run it, and a signal or other * nobody will actually run it, and a signal or other * event cannot wake it up and insert it on the runqu * event cannot wake it up and insert it on the runqu */ */ p->__state = TASK_NEW; p->__state = TASK_NEW; /* /* * Make sure we do not leak PI boosting priority to t * Make sure we do not leak PI boosting priority to t */ */ p->prio = current->normal_prio; p->prio = current->normal_prio; uclamp_fork(p); uclamp_fork(p); /* /* * Revert to default priority/policy on fork if reque * Revert to default priority/policy on fork if reque */ */ if (unlikely(p->sched_reset_on_fork)) { if (unlikely(p->sched_reset_on_fork)) { if (task_has_dl_policy(p) || task_has_rt_poli if (task_has_dl_policy(p) || task_has_rt_poli p->policy = SCHED_NORMAL; p->policy = SCHED_NORMAL; p->static_prio = NICE_TO_PRIO(0); p->static_prio = NICE_TO_PRIO(0); p->rt_priority = 0; p->rt_priority = 0; } else if (PRIO_TO_NICE(p->static_prio) < 0) } else if (PRIO_TO_NICE(p->static_prio) < 0) p->static_prio = NICE_TO_PRIO(0); p->static_prio = NICE_TO_PRIO(0); p->prio = p->normal_prio = p->static_prio; p->prio = p->normal_prio = p->static_prio; set_load_weight(p, false); set_load_weight(p, false); /* /* * We don't need the reset flag anymore after * We don't need the reset flag anymore after * fulfilled its duty: * fulfilled its duty: */ */ p->sched_reset_on_fork = 0; p->sched_reset_on_fork = 0; } } if (dl_prio(p->prio)) if (dl_prio(p->prio)) return -EAGAIN; return -EAGAIN; else if (rt_prio(p->prio)) else if (rt_prio(p->prio)) p->sched_class = &rt_sched_class; p->sched_class = &rt_sched_class; else else p->sched_class = &fair_sched_class; p->sched_class = &fair_sched_class; init_entity_runnable_average(&p->se); init_entity_runnable_average(&p->se); #ifdef CONFIG_SCHED_INFO #ifdef CONFIG_SCHED_INFO if (likely(sched_info_on())) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_inf memset(&p->sched_info, 0, sizeof(p->sched_inf #endif #endif #if defined(CONFIG_SMP) #if defined(CONFIG_SMP) p->on_cpu = 0; p->on_cpu = 0; #endif #endif init_task_preempt_count(p); init_task_preempt_count(p); #ifdef CONFIG_SMP #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); plist_node_init(&p->pushable_tasks, MAX_PRIO); RB_CLEAR_NODE(&p->pushable_dl_tasks); RB_CLEAR_NODE(&p->pushable_dl_tasks); #endif #endif return 0; return 0; } } void sched_cgroup_fork(struct task_struct *p, struct kernel_c void sched_cgroup_fork(struct task_struct *p, struct kernel_c { { unsigned long flags; unsigned long flags; /* /* * Because we're not yet on the pid-hash, p->pi_lock * Because we're not yet on the pid-hash, p->pi_lock * required yet, but lockdep gets upset if rules are * required yet, but lockdep gets upset if rules are */ */ raw_spin_lock_irqsave(&p->pi_lock, flags); raw_spin_lock_irqsave(&p->pi_lock, flags); #ifdef CONFIG_CGROUP_SCHED #ifdef CONFIG_CGROUP_SCHED if (1) { if (1) { struct task_group *tg; struct task_group *tg; tg = container_of(kargs->cset->subsys[cpu_cgr tg = container_of(kargs->cset->subsys[cpu_cgr struct task_group, css); struct task_group, css); tg = autogroup_task_group(p, tg); tg = autogroup_task_group(p, tg); p->sched_task_group = tg; p->sched_task_group = tg; } } #endif #endif rseq_migrate(p); rseq_migrate(p); /* /* * We're setting the CPU for the first time, we don't * We're setting the CPU for the first time, we don't * so use __set_task_cpu(). * so use __set_task_cpu(). 
*/ */ __set_task_cpu(p, smp_processor_id()); __set_task_cpu(p, smp_processor_id()); if (p->sched_class->task_fork) if (p->sched_class->task_fork) p->sched_class->task_fork(p); p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); raw_spin_unlock_irqrestore(&p->pi_lock, flags); } } void sched_post_fork(struct task_struct *p) void sched_post_fork(struct task_struct *p) { { uclamp_post_fork(p); uclamp_post_fork(p); } } unsigned long to_ratio(u64 period, u64 runtime) unsigned long to_ratio(u64 period, u64 runtime) { { if (runtime == RUNTIME_INF) if (runtime == RUNTIME_INF) return BW_UNIT; return BW_UNIT; /* /* * Doing this here saves a lot of checks in all * Doing this here saves a lot of checks in all * the calling paths, and returning zero seems * the calling paths, and returning zero seems * safe for them anyway. * safe for them anyway. */ */ if (period == 0) if (period == 0) return 0; return 0; return div64_u64(runtime << BW_SHIFT, period); return div64_u64(runtime << BW_SHIFT, period); } } /* /* * wake_up_new_task - wake up a newly created task for the fi * wake_up_new_task - wake up a newly created task for the fi * * * This function will do some initial scheduler statistics ho * This function will do some initial scheduler statistics ho * that must be done for every newly created context, then pu * that must be done for every newly created context, then pu * on the runqueue and wakes it. * on the runqueue and wakes it. */ */ void wake_up_new_task(struct task_struct *p) void wake_up_new_task(struct task_struct *p) { { struct rq_flags rf; struct rq_flags rf; struct rq *rq; struct rq *rq; raw_spin_lock_irqsave(&p->pi_lock, rf.flags); raw_spin_lock_irqsave(&p->pi_lock, rf.flags); WRITE_ONCE(p->__state, TASK_RUNNING); WRITE_ONCE(p->__state, TASK_RUNNING); #ifdef CONFIG_SMP #ifdef CONFIG_SMP /* /* * Fork balancing, do it here and not earlier because * Fork balancing, do it here and not earlier because * - cpus_ptr can change in the fork path * - cpus_ptr can change in the fork path * - any previously selected CPU might disappear thr * - any previously selected CPU might disappear thr * * * Use __set_task_cpu() to avoid calling sched_class: * Use __set_task_cpu() to avoid calling sched_class: * as we're not fully set-up yet. * as we're not fully set-up yet. */ */ p->recent_used_cpu = task_cpu(p); p->recent_used_cpu = task_cpu(p); rseq_migrate(p); rseq_migrate(p); __set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_F __set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_F #endif #endif rq = __task_rq_lock(p, &rf); rq = __task_rq_lock(p, &rf); update_rq_clock(rq); update_rq_clock(rq); post_init_entity_util_avg(p); post_init_entity_util_avg(p); activate_task(rq, p, ENQUEUE_NOCLOCK); activate_task(rq, p, ENQUEUE_NOCLOCK); trace_sched_wakeup_new(p); trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP #ifdef CONFIG_SMP if (p->sched_class->task_woken) { if (p->sched_class->task_woken) { /* /* * Nothing relies on rq->lock after this, so * Nothing relies on rq->lock after this, so * drop it. * drop it. 
*/ */ rq_unpin_lock(rq, &rf); rq_unpin_lock(rq, &rf); p->sched_class->task_woken(rq, p); p->sched_class->task_woken(rq, p); rq_repin_lock(rq, &rf); rq_repin_lock(rq, &rf); } } #endif #endif task_rq_unlock(rq, p, &rf); task_rq_unlock(rq, p, &rf); } } #ifdef CONFIG_PREEMPT_NOTIFIERS #ifdef CONFIG_PREEMPT_NOTIFIERS static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); void preempt_notifier_inc(void) void preempt_notifier_inc(void) { { static_branch_inc(&preempt_notifier_key); static_branch_inc(&preempt_notifier_key); } } EXPORT_SYMBOL_GPL(preempt_notifier_inc); EXPORT_SYMBOL_GPL(preempt_notifier_inc); void preempt_notifier_dec(void) void preempt_notifier_dec(void) { { static_branch_dec(&preempt_notifier_key); static_branch_dec(&preempt_notifier_key); } } EXPORT_SYMBOL_GPL(preempt_notifier_dec); EXPORT_SYMBOL_GPL(preempt_notifier_dec); /** /** * preempt_notifier_register - tell me when current is being * preempt_notifier_register - tell me when current is being * @notifier: notifier struct to register * @notifier: notifier struct to register */ */ void preempt_notifier_register(struct preempt_notifier *notif void preempt_notifier_register(struct preempt_notifier *notif { { if (!static_branch_unlikely(&preempt_notifier_key)) if (!static_branch_unlikely(&preempt_notifier_key)) WARN(1, "registering preempt_notifier while n WARN(1, "registering preempt_notifier while n hlist_add_head(¬ifier->link, ¤t->preempt_not hlist_add_head(¬ifier->link, ¤t->preempt_not } } EXPORT_SYMBOL_GPL(preempt_notifier_register); EXPORT_SYMBOL_GPL(preempt_notifier_register); /** /** * preempt_notifier_unregister - no longer interested in pree * preempt_notifier_unregister - no longer interested in pree * @notifier: notifier struct to unregister * @notifier: notifier struct to unregister * * * This is *not* safe to call from within a preemption notifi * This is *not* safe to call from within a preemption notifi */ */ void preempt_notifier_unregister(struct preempt_notifier *not void preempt_notifier_unregister(struct preempt_notifier *not { { hlist_del(¬ifier->link); hlist_del(¬ifier->link); } } EXPORT_SYMBOL_GPL(preempt_notifier_unregister); EXPORT_SYMBOL_GPL(preempt_notifier_unregister); static void __fire_sched_in_preempt_notifiers(struct task_str static void __fire_sched_in_preempt_notifiers(struct task_str { { struct preempt_notifier *notifier; struct preempt_notifier *notifier; hlist_for_each_entry(notifier, &curr->preempt_notifie hlist_for_each_entry(notifier, &curr->preempt_notifie notifier->ops->sched_in(notifier, raw_smp_pro notifier->ops->sched_in(notifier, raw_smp_pro } } static __always_inline void fire_sched_in_preempt_notifiers(s static __always_inline void fire_sched_in_preempt_notifiers(s { { if (static_branch_unlikely(&preempt_notifier_key)) if (static_branch_unlikely(&preempt_notifier_key)) __fire_sched_in_preempt_notifiers(curr); __fire_sched_in_preempt_notifiers(curr); } } static void static void __fire_sched_out_preempt_notifiers(struct task_struct *curr, __fire_sched_out_preempt_notifiers(struct task_struct *curr, struct task_struct *next) struct task_struct *next) { { struct preempt_notifier *notifier; struct preempt_notifier *notifier; hlist_for_each_entry(notifier, &curr->preempt_notifie hlist_for_each_entry(notifier, &curr->preempt_notifie notifier->ops->sched_out(notifier, next); notifier->ops->sched_out(notifier, next); } } static __always_inline void static __always_inline void fire_sched_out_preempt_notifiers(struct task_struct *curr, 
fire_sched_out_preempt_notifiers(struct task_struct *curr, struct task_struct *next) struct task_struct *next) { { if (static_branch_unlikely(&preempt_notifier_key)) if (static_branch_unlikely(&preempt_notifier_key)) __fire_sched_out_preempt_notifiers(curr, next __fire_sched_out_preempt_notifiers(curr, next } } #else /* !CONFIG_PREEMPT_NOTIFIERS */ #else /* !CONFIG_PREEMPT_NOTIFIERS */ static inline void fire_sched_in_preempt_notifiers(struct tas static inline void fire_sched_in_preempt_notifiers(struct tas { { } } static inline void static inline void fire_sched_out_preempt_notifiers(struct task_struct *curr, fire_sched_out_preempt_notifiers(struct task_struct *curr, struct task_struct *next) struct task_struct *next) { { } } #endif /* CONFIG_PREEMPT_NOTIFIERS */ #endif /* CONFIG_PREEMPT_NOTIFIERS */ static inline void prepare_task(struct task_struct *next) static inline void prepare_task(struct task_struct *next) { { #ifdef CONFIG_SMP #ifdef CONFIG_SMP /* /* * Claim the task as running, we do this before switc * Claim the task as running, we do this before switc * such that any running task will have this set. * such that any running task will have this set. * * * See the smp_load_acquire(&p->on_cpu) case in ttwu( * See the smp_load_acquire(&p->on_cpu) case in ttwu( * its ordering comment. * its ordering comment. */ */ WRITE_ONCE(next->on_cpu, 1); WRITE_ONCE(next->on_cpu, 1); #endif #endif } } static inline void finish_task(struct task_struct *prev) static inline void finish_task(struct task_struct *prev) { { #ifdef CONFIG_SMP #ifdef CONFIG_SMP /* /* * This must be the very last reference to @prev from * This must be the very last reference to @prev from * p->on_cpu is cleared, the task can be moved to a d * p->on_cpu is cleared, the task can be moved to a d * must ensure this doesn't happen until the switch i * must ensure this doesn't happen until the switch i * finished. * finished. * * * In particular, the load of prev->state in finish_t * In particular, the load of prev->state in finish_t * happen before this. * happen before this. * * * Pairs with the smp_cond_load_acquire() in try_to_w * Pairs with the smp_cond_load_acquire() in try_to_w */ */ smp_store_release(&prev->on_cpu, 0); smp_store_release(&prev->on_cpu, 0); #endif #endif } } #ifdef CONFIG_SMP #ifdef CONFIG_SMP static void do_balance_callbacks(struct rq *rq, struct balanc static void do_balance_callbacks(struct rq *rq, struct balanc { { void (*func)(struct rq *rq); void (*func)(struct rq *rq); struct balance_callback *next; struct balance_callback *next; lockdep_assert_rq_held(rq); lockdep_assert_rq_held(rq); while (head) { while (head) { func = (void (*)(struct rq *))head->func; func = (void (*)(struct rq *))head->func; next = head->next; next = head->next; head->next = NULL; head->next = NULL; head = next; head = next; func(rq); func(rq); } } } } static void balance_push(struct rq *rq); static void balance_push(struct rq *rq); /* /* * balance_push_callback is a right abuse of the callback int * balance_push_callback is a right abuse of the callback int * by significantly different rules. * by significantly different rules. 
* * * Where the normal balance_callback's purpose is to be ran i * Where the normal balance_callback's purpose is to be ran i * that queued it (only later, when it's safe to drop rq->loc * that queued it (only later, when it's safe to drop rq->loc * balance_push_callback is specifically targeted at __schedu * balance_push_callback is specifically targeted at __schedu * * * This abuse is tolerated because it places all the unlikely * This abuse is tolerated because it places all the unlikely * a single test, namely: rq->balance_callback == NULL. * a single test, namely: rq->balance_callback == NULL. */ */ struct balance_callback balance_push_callback = { struct balance_callback balance_push_callback = { .next = NULL, .next = NULL, .func = balance_push, .func = balance_push, }; }; static inline struct balance_callback * static inline struct balance_callback * __splice_balance_callbacks(struct rq *rq, bool split) __splice_balance_callbacks(struct rq *rq, bool split) { { struct balance_callback *head = rq->balance_callback; struct balance_callback *head = rq->balance_callback; if (likely(!head)) if (likely(!head)) return NULL; return NULL; lockdep_assert_rq_held(rq); lockdep_assert_rq_held(rq); /* /* * Must not take balance_push_callback off the list w * Must not take balance_push_callback off the list w * splice_balance_callbacks() and balance_callbacks() * splice_balance_callbacks() and balance_callbacks() * in the same rq->lock section. * in the same rq->lock section. * * * In that case it would be possible for __schedule() * In that case it would be possible for __schedule() * and observe the list empty. * and observe the list empty. */ */ if (split && head == &balance_push_callback) if (split && head == &balance_push_callback) head = NULL; head = NULL; else else rq->balance_callback = NULL; rq->balance_callback = NULL; return head; return head; } } static inline struct balance_callback *splice_balance_callbac static inline struct balance_callback *splice_balance_callbac { { return __splice_balance_callbacks(rq, true); return __splice_balance_callbacks(rq, true); } } static void __balance_callbacks(struct rq *rq) static void __balance_callbacks(struct rq *rq) { { do_balance_callbacks(rq, __splice_balance_callbacks(r do_balance_callbacks(rq, __splice_balance_callbacks(r } } static inline void balance_callbacks(struct rq *rq, struct ba static inline void balance_callbacks(struct rq *rq, struct ba { { unsigned long flags; unsigned long flags; if (unlikely(head)) { if (unlikely(head)) { raw_spin_rq_lock_irqsave(rq, flags); raw_spin_rq_lock_irqsave(rq, flags); do_balance_callbacks(rq, head); do_balance_callbacks(rq, head); raw_spin_rq_unlock_irqrestore(rq, flags); raw_spin_rq_unlock_irqrestore(rq, flags); } } } } #else #else static inline void __balance_callbacks(struct rq *rq) static inline void __balance_callbacks(struct rq *rq) { { } } static inline struct balance_callback *splice_balance_callbac static inline struct balance_callback *splice_balance_callbac { { return NULL; return NULL; } } static inline void balance_callbacks(struct rq *rq, struct ba static inline void balance_callbacks(struct rq *rq, struct ba { { } } #endif #endif static inline void static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next, prepare_lock_switch(struct rq *rq, struct task_struct *next, { { /* /* * Since the runqueue lock will be released by the ne * Since the runqueue lock will be released by the ne * task (which is an invalid locking op but in the ca * task (which is an invalid locking 
op but in the ca * of the scheduler it's an obvious special-case), so * of the scheduler it's an obvious special-case), so * do an early lockdep release here: * do an early lockdep release here: */ */ rq_unpin_lock(rq, rf); rq_unpin_lock(rq, rf); spin_release(&__rq_lockp(rq)->dep_map, _THIS_IP_); spin_release(&__rq_lockp(rq)->dep_map, _THIS_IP_); #ifdef CONFIG_DEBUG_SPINLOCK #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases th /* this is a valid case when another task releases th rq_lockp(rq)->owner = next; rq_lockp(rq)->owner = next; #endif #endif } } static inline void finish_lock_switch(struct rq *rq) static inline void finish_lock_switch(struct rq *rq) { { /* /* * If we are tracking spinlock dependencies then we h * If we are tracking spinlock dependencies then we h * fix up the runqueue lock - which gets 'carried ove * fix up the runqueue lock - which gets 'carried ove * prev into current: * prev into current: */ */ spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP __balance_callbacks(rq); __balance_callbacks(rq); raw_spin_rq_unlock_irq(rq); raw_spin_rq_unlock_irq(rq); } } /* /* * NOP if the arch has not defined these: * NOP if the arch has not defined these: */ */ #ifndef prepare_arch_switch #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) # define prepare_arch_switch(next) do { } while (0) #endif #endif #ifndef finish_arch_post_lock_switch #ifndef finish_arch_post_lock_switch # define finish_arch_post_lock_switch() do { } while (0) # define finish_arch_post_lock_switch() do { } while (0) #endif #endif static inline void kmap_local_sched_out(void) static inline void kmap_local_sched_out(void) { { #ifdef CONFIG_KMAP_LOCAL #ifdef CONFIG_KMAP_LOCAL if (unlikely(current->kmap_ctrl.idx)) if (unlikely(current->kmap_ctrl.idx)) __kmap_local_sched_out(); __kmap_local_sched_out(); #endif #endif } } static inline void kmap_local_sched_in(void) static inline void kmap_local_sched_in(void) { { #ifdef CONFIG_KMAP_LOCAL #ifdef CONFIG_KMAP_LOCAL if (unlikely(current->kmap_ctrl.idx)) if (unlikely(current->kmap_ctrl.idx)) __kmap_local_sched_in(); __kmap_local_sched_in(); #endif #endif } } /** /** * prepare_task_switch - prepare to switch tasks * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch * @rq: the runqueue preparing to switch * @prev: the current task that is being switched out * @prev: the current task that is being switched out * @next: the task we are going to switch to. * @next: the task we are going to switch to. * * * This is called with the rq lock held and interrupts off. I * This is called with the rq lock held and interrupts off. I * be paired with a subsequent finish_task_switch after the c * be paired with a subsequent finish_task_switch after the c * switch. * switch. * * * prepare_task_switch sets up locking and calls architecture * prepare_task_switch sets up locking and calls architecture * hooks. * hooks. 
*/ */ static inline void static inline void prepare_task_switch(struct rq *rq, struct task_struct *prev, prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) struct task_struct *next) { { kcov_prepare_switch(prev); kcov_prepare_switch(prev); sched_info_switch(rq, prev, next); sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); perf_event_task_sched_out(prev, next); rseq_preempt(prev); rseq_preempt(prev); fire_sched_out_preempt_notifiers(prev, next); fire_sched_out_preempt_notifiers(prev, next); kmap_local_sched_out(); kmap_local_sched_out(); prepare_task(next); prepare_task(next); prepare_arch_switch(next); prepare_arch_switch(next); } } /** /** * finish_task_switch - clean up after a task-switch * finish_task_switch - clean up after a task-switch * @prev: the thread we just switched away from. * @prev: the thread we just switched away from. * * * finish_task_switch must be called after the context switch * finish_task_switch must be called after the context switch * with a prepare_task_switch call before the context switch. * with a prepare_task_switch call before the context switch. * finish_task_switch will reconcile locking set up by prepar * finish_task_switch will reconcile locking set up by prepar * and do any other architecture-specific cleanup actions. * and do any other architecture-specific cleanup actions. * * * Note that we may have delayed dropping an mm in context_sw * Note that we may have delayed dropping an mm in context_sw * so, we finish that here outside of the runqueue lock. (Doi * so, we finish that here outside of the runqueue lock. (Doi * with the lock held can cause deadlocks; see schedule() for * with the lock held can cause deadlocks; see schedule() for * details.) * details.) * * * The context switch have flipped the stack from under us an * The context switch have flipped the stack from under us an * local variables which were saved when this task called sch * local variables which were saved when this task called sch * past. prev == current is still correct but we need to reca * past. prev == current is still correct but we need to reca * because prev may have moved to another CPU. * because prev may have moved to another CPU. */ */ static struct rq *finish_task_switch(struct task_struct *prev static struct rq *finish_task_switch(struct task_struct *prev __releases(rq->lock) __releases(rq->lock) { { struct rq *rq = this_rq(); struct rq *rq = this_rq(); struct mm_struct *mm = rq->prev_mm; struct mm_struct *mm = rq->prev_mm; unsigned int prev_state; unsigned int prev_state; /* /* * The previous task will have left us with a preempt * The previous task will have left us with a preempt * because it left us after: * because it left us after: * * * schedule() * schedule() * preempt_disable(); // 1 * preempt_disable(); // 1 * __schedule() * __schedule() * raw_spin_lock_irq(&rq->lock) // 2 * raw_spin_lock_irq(&rq->lock) // 2 * * * Also, see FORK_PREEMPT_COUNT. * Also, see FORK_PREEMPT_COUNT. 
*/ */ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OF if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OF "corrupted preempt_count: %s/%d/0x%x\n" "corrupted preempt_count: %s/%d/0x%x\n" current->comm, current->pid, preempt_co current->comm, current->pid, preempt_co preempt_count_set(FORK_PREEMPT_COUNT); preempt_count_set(FORK_PREEMPT_COUNT); rq->prev_mm = NULL; rq->prev_mm = NULL; /* /* * A task struct has one reference for the use as "cu * A task struct has one reference for the use as "cu * If a task dies, then it sets TASK_DEAD in tsk->sta * If a task dies, then it sets TASK_DEAD in tsk->sta * schedule one last time. The schedule call will nev * schedule one last time. The schedule call will nev * the scheduled task must drop that reference. * the scheduled task must drop that reference. * * * We must observe prev->state before clearing prev-> * We must observe prev->state before clearing prev-> * finish_task), otherwise a concurrent wakeup can ge * finish_task), otherwise a concurrent wakeup can ge * running on another CPU and we could rave with its * running on another CPU and we could rave with its * transition, resulting in a double drop. * transition, resulting in a double drop. */ */ prev_state = READ_ONCE(prev->__state); prev_state = READ_ONCE(prev->__state); vtime_task_switch(prev); vtime_task_switch(prev); perf_event_task_sched_in(prev, current); perf_event_task_sched_in(prev, current); finish_task(prev); finish_task(prev); tick_nohz_task_switch(); tick_nohz_task_switch(); finish_lock_switch(rq); finish_lock_switch(rq); finish_arch_post_lock_switch(); finish_arch_post_lock_switch(); kcov_finish_switch(current); kcov_finish_switch(current); /* /* * kmap_local_sched_out() is invoked with rq::lock he * kmap_local_sched_out() is invoked with rq::lock he * interrupts disabled. There is no requirement for t * interrupts disabled. There is no requirement for t * sched out code does not have an interrupt enabled * sched out code does not have an interrupt enabled * Restoring the maps on sched in does not require in * Restoring the maps on sched in does not require in * disabled either. * disabled either. */ */ kmap_local_sched_in(); kmap_local_sched_in(); fire_sched_in_preempt_notifiers(current); fire_sched_in_preempt_notifiers(current); /* /* * When switching through a kernel thread, the loop i * When switching through a kernel thread, the loop i * membarrier_{private,global}_expedited() may have o * membarrier_{private,global}_expedited() may have o * kernel thread and not issued an IPI. It is therefo * kernel thread and not issued an IPI. It is therefo * schedule between user->kernel->user threads withou * schedule between user->kernel->user threads withou * switch_mm(). Membarrier requires a barrier after s * switch_mm(). Membarrier requires a barrier after s * rq->curr, before returning to userspace, so provid * rq->curr, before returning to userspace, so provid * * * - a full memory barrier for {PRIVATE,GLOBAL}_EXPED * - a full memory barrier for {PRIVATE,GLOBAL}_EXPED * provided by mmdrop_lazy_tlb(), * provided by mmdrop_lazy_tlb(), * - a sync_core for SYNC_CORE. * - a sync_core for SYNC_CORE. 
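 *
 * Rough sketch of the kernel->user transition being cleaned up here
 * (a summary, not text from the original source):
 *
 *      context_switch():                       // prev was a kernel thread
 *              rq->prev_mm = prev->active_mm;
 *      finish_task_switch():
 *              mm = rq->prev_mm;
 *              ...
 *              mmdrop_lazy_tlb_sched(mm);      // provides the full barrier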
*/ */ if (mm) { if (mm) { membarrier_mm_sync_core_before_usermode(mm); membarrier_mm_sync_core_before_usermode(mm); mmdrop_lazy_tlb_sched(mm); mmdrop_lazy_tlb_sched(mm); } } if (unlikely(prev_state == TASK_DEAD)) { if (unlikely(prev_state == TASK_DEAD)) { if (prev->sched_class->task_dead) if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); prev->sched_class->task_dead(prev); /* Task is done with its stack. */ /* Task is done with its stack. */ put_task_stack(prev); put_task_stack(prev); put_task_struct_rcu_user(prev); put_task_struct_rcu_user(prev); } } return rq; return rq; } } /** /** * schedule_tail - first thing a freshly forked thread must c * schedule_tail - first thing a freshly forked thread must c * @prev: the thread we just switched away from. * @prev: the thread we just switched away from. */ */ asmlinkage __visible void schedule_tail(struct task_struct *p asmlinkage __visible void schedule_tail(struct task_struct *p __releases(rq->lock) __releases(rq->lock) { { /* /* * New tasks start with FORK_PREEMPT_COUNT, see there * New tasks start with FORK_PREEMPT_COUNT, see there * finish_task_switch() for details. * finish_task_switch() for details. * * * finish_task_switch() will drop rq->lock() and lowe * finish_task_switch() will drop rq->lock() and lowe * and the preempt_enable() will end up enabling pree * and the preempt_enable() will end up enabling pree * PREEMPT_COUNT kernels). * PREEMPT_COUNT kernels). */ */ finish_task_switch(prev); finish_task_switch(prev); preempt_enable(); preempt_enable(); if (current->set_child_tid) if (current->set_child_tid) put_user(task_pid_vnr(current), current->set_ put_user(task_pid_vnr(current), current->set_ calculate_sigpending(); calculate_sigpending(); } } /* /* * context_switch - switch to the new MM and the new thread's * context_switch - switch to the new MM and the new thread's */ */ static __always_inline struct rq * static __always_inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next, struct rq_flags *rf) struct task_struct *next, struct rq_flags *rf) { { prepare_task_switch(rq, prev, next); prepare_task_switch(rq, prev, next); /* /* * For paravirt, this is coupled with an exit in swit * For paravirt, this is coupled with an exit in swit * combine the page table reload and the switch backe * combine the page table reload and the switch backe * one hypercall. * one hypercall. */ */ arch_start_context_switch(prev); arch_start_context_switch(prev); /* /* * kernel -> kernel lazy + transfer active * kernel -> kernel lazy + transfer active * user -> kernel lazy + mmgrab_lazy_tlb() active * user -> kernel lazy + mmgrab_lazy_tlb() active * * * kernel -> user switch + mmdrop_lazy_tlb() acti * kernel -> user switch + mmdrop_lazy_tlb() acti * user -> user switch * user -> user switch * * * switch_mm_cid() needs to be updated if the barrier * switch_mm_cid() needs to be updated if the barrier * by context_switch() are modified. * by context_switch() are modified. 
*/ */ if (!next->mm) { // to if (!next->mm) { // to enter_lazy_tlb(prev->active_mm, next); enter_lazy_tlb(prev->active_mm, next); next->active_mm = prev->active_mm; next->active_mm = prev->active_mm; if (prev->mm) // fr if (prev->mm) // fr mmgrab_lazy_tlb(prev->active_mm); mmgrab_lazy_tlb(prev->active_mm); else else prev->active_mm = NULL; prev->active_mm = NULL; } else { // to } else { // to membarrier_switch_mm(rq, prev->active_mm, nex membarrier_switch_mm(rq, prev->active_mm, nex /* /* * sys_membarrier() requires an smp_mb() betw * sys_membarrier() requires an smp_mb() betw * rq->curr / membarrier_switch_mm() and retu * rq->curr / membarrier_switch_mm() and retu * * * The below provides this either through swi * The below provides this either through swi * case 'prev->active_mm == next->mm' through * case 'prev->active_mm == next->mm' through * finish_task_switch()'s mmdrop(). * finish_task_switch()'s mmdrop(). */ */ switch_mm_irqs_off(prev->active_mm, next->mm, switch_mm_irqs_off(prev->active_mm, next->mm, lru_gen_use_mm(next->mm); lru_gen_use_mm(next->mm); if (!prev->mm) { // fr if (!prev->mm) { // fr /* will mmdrop_lazy_tlb() in finish_t /* will mmdrop_lazy_tlb() in finish_t rq->prev_mm = prev->active_mm; rq->prev_mm = prev->active_mm; prev->active_mm = NULL; prev->active_mm = NULL; } } } } /* switch_mm_cid() requires the memory barriers above /* switch_mm_cid() requires the memory barriers above switch_mm_cid(rq, prev, next); switch_mm_cid(rq, prev, next); prepare_lock_switch(rq, next, rf); prepare_lock_switch(rq, next, rf); /* Here we just switch the register state and the sta /* Here we just switch the register state and the sta switch_to(prev, next, prev); switch_to(prev, next, prev); barrier(); barrier(); return finish_task_switch(prev); return finish_task_switch(prev); } } /* /* * nr_running and nr_context_switches: * nr_running and nr_context_switches: * * * externally visible scheduler statistics: current number of * externally visible scheduler statistics: current number of * threads, total number of context switches performed since * threads, total number of context switches performed since */ */ unsigned int nr_running(void) unsigned int nr_running(void) { { unsigned int i, sum = 0; unsigned int i, sum = 0; for_each_online_cpu(i) for_each_online_cpu(i) sum += cpu_rq(i)->nr_running; sum += cpu_rq(i)->nr_running; return sum; return sum; } } /* /* * Check if only the current task is running on the CPU. * Check if only the current task is running on the CPU. * * * Caution: this function does not check that the caller has * Caution: this function does not check that the caller has * preemption, thus the result might have a time-of-check-to- * preemption, thus the result might have a time-of-check-to- * race. The caller is responsible to use it correctly, for * race. The caller is responsible to use it correctly, for * * * - from a non-preemptible section (of course) * - from a non-preemptible section (of course) * * * - from a thread that is bound to a single CPU * - from a thread that is bound to a single CPU * * * - in a loop with very short iterations (e.g. a polling loo * - in a loop with very short iterations (e.g. 
 *   a polling loop)
 */
bool single_task_running(void)
{
        return raw_rq()->nr_running == 1;
}
EXPORT_SYMBOL(single_task_running);

unsigned long long nr_context_switches_cpu(int cpu)
{
        return cpu_rq(cpu)->nr_switches;
}

unsigned long long nr_context_switches(void)
{
        int i;
        unsigned long long sum = 0;

        for_each_possible_cpu(i)
                sum += cpu_rq(i)->nr_switches;

        return sum;
}

/*
 * Consumers of these two interfaces, like for example the cpuidle menu
 * governor, are using nonsensical data. Preferring shallow idle states
 * for a CPU that has IO-wait which might not even end up running the task when
 * it does become runnable.
 */
unsigned int nr_iowait_cpu(int cpu)
{
        return atomic_read(&cpu_rq(cpu)->nr_iowait);
}

/*
 * IO-wait accounting, and how it's mostly bollocks (on SMP).
 *
 * The idea behind IO-wait account is to account the idle time that we could
 * have spend running if it were not for IO. That is, if we were to improve the
 * storage performance, we'd have a proportional reduction in IO-wait time.
 *
 * This all works nicely on UP, where, when a task blocks on IO, we account
 * idle time as IO-wait, because if the storage were faster, it could've been
 * running and we'd not be idle.
 *
 * This has been extended to SMP, by doing the same for each CPU. This however
 * is broken.
 *
 * Imagine for instance the case where two tasks block on one CPU, only the one
 * CPU will have IO-wait accounted, while the other has regular idle. Even
 * though, if the storage were faster, both could've ran at the same time,
 * utilising both CPUs.
 *
 * This means, that when looking globally, the current IO-wait accounting on
 * SMP is a lower bound, by reason of under accounting.
 *
 * Worse, since the numbers are provided per CPU, they are sometimes
 * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly
 * associated with any one particular CPU, it can wake to another CPU than the
 * one it blocked on. This means the per CPU IO-wait number is meaningless.
 *
 * Task CPU affinities can make all that even more 'interesting'.
 */

unsigned int nr_iowait(void)
{
        unsigned int i, sum = 0;

        for_each_possible_cpu(i)
                sum += nr_iowait_cpu(i);

        return sum;
}

#ifdef CONFIG_SMP

/*
 * sched_exec - execve() is a valuable balancing opportunity, because at
 * this point the task has the smallest effective memory and cache footprint.
 */
void sched_exec(void)
{
        struct task_struct *p = current;
        struct migration_arg arg;
        int dest_cpu;

        scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
                dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC);
                if (dest_cpu == smp_processor_id())
                        return;

                if (unlikely(!cpu_active(dest_cpu)))
                        return;

                arg = (struct migration_arg){ p, dest_cpu };
        }
        stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
}

#endif

DEFINE_PER_CPU(struct kernel_stat, kstat);
DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);

EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);

/*
 * The function fair_sched_class.update_curr accesses the struct curr
 * and its field curr->exec_start; when called from task_sched_runtime(),
 * we observe a high rate of cache misses in practice.
 * Prefetching this data results in improved performance.
 */
static inline void prefetch_curr_exec_start(struct task_struct *p)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
        struct sched_entity *curr = (&p->se)->cfs_rq->curr;
#else
        struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
#endif
        prefetch(curr);
        prefetch(&curr->exec_start);
}

/*
 * Return accounted runtime for the task.
 * In case the task is currently running, return the runtime plus current's
 * pending runtime that have not been accounted yet.
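 *
 * Userspace typically reaches this through the POSIX per-thread CPU
 * clock; an illustrative sketch (an assumption, not text from the
 * original source):
 *
 *      struct timespec ts;
 *      clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);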
*/ */ unsigned long long task_sched_runtime(struct task_struct *p) unsigned long long task_sched_runtime(struct task_struct *p) { { struct rq_flags rf; struct rq_flags rf; struct rq *rq; struct rq *rq; u64 ns; u64 ns; #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) /* /* * 64-bit doesn't need locks to atomically read a 64- * 64-bit doesn't need locks to atomically read a 64- * So we have a optimization chance when the task's d * So we have a optimization chance when the task's d * Reading ->on_cpu is racy, but this is ok. * Reading ->on_cpu is racy, but this is ok. * * * If we race with it leaving CPU, we'll take a lock. * If we race with it leaving CPU, we'll take a lock. * If we race with it entering CPU, unaccounted time * If we race with it entering CPU, unaccounted time * indistinguishable from the read occurring a few cy * indistinguishable from the read occurring a few cy * If we see ->on_cpu without ->on_rq, the task is le * If we see ->on_cpu without ->on_rq, the task is le * been accounted, so we're correct here as well. * been accounted, so we're correct here as well. */ */ if (!p->on_cpu || !task_on_rq_queued(p)) if (!p->on_cpu || !task_on_rq_queued(p)) return p->se.sum_exec_runtime; return p->se.sum_exec_runtime; #endif #endif rq = task_rq_lock(p, &rf); rq = task_rq_lock(p, &rf); /* /* * Must be ->curr _and_ ->on_rq. If dequeued, we wou * Must be ->curr _and_ ->on_rq. If dequeued, we wou * project cycles that may never be accounted to this * project cycles that may never be accounted to this * thread, breaking clock_gettime(). * thread, breaking clock_gettime(). */ */ if (task_current(rq, p) && task_on_rq_queued(p)) { if (task_current(rq, p) && task_on_rq_queued(p)) { prefetch_curr_exec_start(p); prefetch_curr_exec_start(p); update_rq_clock(rq); update_rq_clock(rq); p->sched_class->update_curr(rq); p->sched_class->update_curr(rq); } } ns = p->se.sum_exec_runtime; ns = p->se.sum_exec_runtime; task_rq_unlock(rq, p, &rf); task_rq_unlock(rq, p, &rf); return ns; return ns; } } #ifdef CONFIG_SCHED_DEBUG #ifdef CONFIG_SCHED_DEBUG static u64 cpu_resched_latency(struct rq *rq) static u64 cpu_resched_latency(struct rq *rq) { { int latency_warn_ms = READ_ONCE(sysctl_resched_latenc int latency_warn_ms = READ_ONCE(sysctl_resched_latenc u64 resched_latency, now = rq_clock(rq); u64 resched_latency, now = rq_clock(rq); static bool warned_once; static bool warned_once; if (sysctl_resched_latency_warn_once && warned_once) if (sysctl_resched_latency_warn_once && warned_once) return 0; return 0; if (!need_resched() || !latency_warn_ms) if (!need_resched() || !latency_warn_ms) return 0; return 0; if (system_state == SYSTEM_BOOTING) if (system_state == SYSTEM_BOOTING) return 0; return 0; if (!rq->last_seen_need_resched_ns) { if (!rq->last_seen_need_resched_ns) { rq->last_seen_need_resched_ns = now; rq->last_seen_need_resched_ns = now; rq->ticks_without_resched = 0; rq->ticks_without_resched = 0; return 0; return 0; } } rq->ticks_without_resched++; rq->ticks_without_resched++; resched_latency = now - rq->last_seen_need_resched_ns resched_latency = now - rq->last_seen_need_resched_ns if (resched_latency <= latency_warn_ms * NSEC_PER_MSE if (resched_latency <= latency_warn_ms * NSEC_PER_MSE return 0; return 0; warned_once = true; warned_once = true; return resched_latency; return resched_latency; } } static int __init setup_resched_latency_warn_ms(char *str) static int __init setup_resched_latency_warn_ms(char *str) { { long val; long val; if 
((kstrtol(str, 0, &val))) { if ((kstrtol(str, 0, &val))) { pr_warn("Unable to set resched_latency_warn_m pr_warn("Unable to set resched_latency_warn_m return 1; return 1; } } sysctl_resched_latency_warn_ms = val; sysctl_resched_latency_warn_ms = val; return 1; return 1; } } __setup("resched_latency_warn_ms=", setup_resched_latency_war __setup("resched_latency_warn_ms=", setup_resched_latency_war #else #else static inline u64 cpu_resched_latency(struct rq *rq) { return static inline u64 cpu_resched_latency(struct rq *rq) { return #endif /* CONFIG_SCHED_DEBUG */ #endif /* CONFIG_SCHED_DEBUG */ /* /* * This function gets called by the timer code, with HZ frequ * This function gets called by the timer code, with HZ frequ * We call it with interrupts disabled. * We call it with interrupts disabled. */ */ void scheduler_tick(void) void scheduler_tick(void) { { int cpu = smp_processor_id(); int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; struct task_struct *curr = rq->curr; struct rq_flags rf; struct rq_flags rf; unsigned long thermal_pressure; unsigned long thermal_pressure; u64 resched_latency; u64 resched_latency; if (housekeeping_cpu(cpu, HK_TYPE_TICK)) if (housekeeping_cpu(cpu, HK_TYPE_TICK)) arch_scale_freq_tick(); arch_scale_freq_tick(); sched_clock_tick(); sched_clock_tick(); rq_lock(rq, &rf); rq_lock(rq, &rf); update_rq_clock(rq); update_rq_clock(rq); thermal_pressure = arch_scale_thermal_pressure(cpu_of thermal_pressure = arch_scale_thermal_pressure(cpu_of update_thermal_load_avg(rq_clock_thermal(rq), rq, the update_thermal_load_avg(rq_clock_thermal(rq), rq, the curr->sched_class->task_tick(rq, curr, 0); curr->sched_class->task_tick(rq, curr, 0); if (sched_feat(LATENCY_WARN)) if (sched_feat(LATENCY_WARN)) resched_latency = cpu_resched_latency(rq); resched_latency = cpu_resched_latency(rq); calc_global_load_tick(rq); calc_global_load_tick(rq); sched_core_tick(rq); sched_core_tick(rq); task_tick_mm_cid(rq, curr); task_tick_mm_cid(rq, curr); rq_unlock(rq, &rf); rq_unlock(rq, &rf); if (sched_feat(LATENCY_WARN) && resched_latency) if (sched_feat(LATENCY_WARN) && resched_latency) resched_latency_warn(cpu, resched_latency); resched_latency_warn(cpu, resched_latency); perf_event_task_tick(); perf_event_task_tick(); if (curr->flags & PF_WQ_WORKER) if (curr->flags & PF_WQ_WORKER) wq_worker_tick(curr); wq_worker_tick(curr); #ifdef CONFIG_SMP #ifdef CONFIG_SMP rq->idle_balance = idle_cpu(cpu); rq->idle_balance = idle_cpu(cpu); trigger_load_balance(rq); trigger_load_balance(rq); #endif #endif } } #ifdef CONFIG_NO_HZ_FULL #ifdef CONFIG_NO_HZ_FULL struct tick_work { struct tick_work { int cpu; int cpu; atomic_t state; atomic_t state; struct delayed_work work; struct delayed_work work; }; }; /* Values for ->state, see diagram below. */ /* Values for ->state, see diagram below. 
 */
#define TICK_SCHED_REMOTE_OFFLINE       0
#define TICK_SCHED_REMOTE_OFFLINING     1
#define TICK_SCHED_REMOTE_RUNNING       2

/*
 * State diagram for ->state:
 *
 *
 *          TICK_SCHED_REMOTE_OFFLINE
 *                    |   ^
 *                    |   |
 *                    |   | sched_tick_remote()
 *                    |   |
 *                    |   |
 *                    +--TICK_SCHED_REMOTE_OFFLINING
 *                    |   ^
 *                    |   |
 * sched_tick_start() |   | sched_tick_stop()
 *                    |   |
 *                    V   |
 *          TICK_SCHED_REMOTE_RUNNING
 *
 *
 * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote()
 * and sched_tick_start() are happy to leave the state in RUNNING.
 */

static struct tick_work __percpu *tick_work_cpu;

static void sched_tick_remote(struct work_struct *work)
{
        struct delayed_work *dwork = to_delayed_work(work);
        struct tick_work *twork = container_of(dwork, struct tick_work, work);
        int cpu = twork->cpu;
        struct rq *rq = cpu_rq(cpu);
        int os;

        /*
         * Handle the tick only if it appears the remote CPU is running in full
         * dynticks mode. The check is racy by nature, but missing a tick or
         * having one too much is no big deal because the scheduler only relies
         * on statistics and checks timeslices in a time-independent way,
         * regardless of when exactly it is running.
         */
        if (tick_nohz_tick_stopped_cpu(cpu)) {
                guard(rq_lock_irq)(rq);
                struct task_struct *curr = rq->curr;

                if (cpu_online(cpu)) {
                        update_rq_clock(rq);

                        if (!is_idle_task(curr)) {
                                /*
                                 * Make sure the next tick runs within a
                                 * reasonable amount of time.
                                 */
                                u64 delta = rq_clock_task(rq) - curr->se.exec_start;
                                WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
                        }
                        curr->sched_class->task_tick(rq, curr, 0);

                        calc_load_nohz_remote(rq);
                }
        }

        /*
         * Run the remote tick once per second (1Hz).
This ar * frequency is large enough to avoid overload but sh * frequency is large enough to avoid overload but sh * to keep scheduler internal stats reasonably up to * to keep scheduler internal stats reasonably up to * first update state to reflect hotplug activity if * first update state to reflect hotplug activity if */ */ os = atomic_fetch_add_unless(&twork->state, -1, TICK_ os = atomic_fetch_add_unless(&twork->state, -1, TICK_ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); if (os == TICK_SCHED_REMOTE_RUNNING) if (os == TICK_SCHED_REMOTE_RUNNING) queue_delayed_work(system_unbound_wq, dwork, queue_delayed_work(system_unbound_wq, dwork, } } static void sched_tick_start(int cpu) static void sched_tick_start(int cpu) { { int os; int os; struct tick_work *twork; struct tick_work *twork; if (housekeeping_cpu(cpu, HK_TYPE_TICK)) if (housekeeping_cpu(cpu, HK_TYPE_TICK)) return; return; WARN_ON_ONCE(!tick_work_cpu); WARN_ON_ONCE(!tick_work_cpu); twork = per_cpu_ptr(tick_work_cpu, cpu); twork = per_cpu_ptr(tick_work_cpu, cpu); os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUN os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUN WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); if (os == TICK_SCHED_REMOTE_OFFLINE) { if (os == TICK_SCHED_REMOTE_OFFLINE) { twork->cpu = cpu; twork->cpu = cpu; INIT_DELAYED_WORK(&twork->work, sched_tick_re INIT_DELAYED_WORK(&twork->work, sched_tick_re queue_delayed_work(system_unbound_wq, &twork- queue_delayed_work(system_unbound_wq, &twork- } } } } #ifdef CONFIG_HOTPLUG_CPU #ifdef CONFIG_HOTPLUG_CPU static void sched_tick_stop(int cpu) static void sched_tick_stop(int cpu) { { struct tick_work *twork; struct tick_work *twork; int os; int os; if (housekeeping_cpu(cpu, HK_TYPE_TICK)) if (housekeeping_cpu(cpu, HK_TYPE_TICK)) return; return; WARN_ON_ONCE(!tick_work_cpu); WARN_ON_ONCE(!tick_work_cpu); twork = per_cpu_ptr(tick_work_cpu, cpu); twork = per_cpu_ptr(tick_work_cpu, cpu); /* There cannot be competing actions, but don't rely /* There cannot be competing actions, but don't rely os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFF os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFF WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING); WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING); /* Don't cancel, as this would mess up the state mach /* Don't cancel, as this would mess up the state mach } } #endif /* CONFIG_HOTPLUG_CPU */ #endif /* CONFIG_HOTPLUG_CPU */ int __init sched_tick_offload_init(void) int __init sched_tick_offload_init(void) { { tick_work_cpu = alloc_percpu(struct tick_work); tick_work_cpu = alloc_percpu(struct tick_work); BUG_ON(!tick_work_cpu); BUG_ON(!tick_work_cpu); return 0; return 0; } } #else /* !CONFIG_NO_HZ_FULL */ #else /* !CONFIG_NO_HZ_FULL */ static inline void sched_tick_start(int cpu) { } static inline void sched_tick_start(int cpu) { } static inline void sched_tick_stop(int cpu) { } static inline void sched_tick_stop(int cpu) { } #endif #endif #if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEM #if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEM defined(CONFIG_TRACE_PREEMPT_ defined(CONFIG_TRACE_PREEMPT_ /* /* * If the value passed in is equal to the current preempt cou * If the value passed in is equal to the current preempt cou * then we just disabled preemption. Start timing the latency * then we just disabled preemption. 
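 *
 * A minimal sketch of the nesting these hooks track (an illustration,
 * not text from the original source):
 *
 *      preempt_disable();      // 0 -> 1: preempt_latency_start() fires
 *        preempt_disable();    // 1 -> 2: nested, no trace event
 *        preempt_enable();     // 2 -> 1: nested, no trace event
 *      preempt_enable();       // 1 -> 0: preempt_latency_stop() fires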
Start timing the latency */ */ static inline void preempt_latency_start(int val) static inline void preempt_latency_start(int val) { { if (preempt_count() == val) { if (preempt_count() == val) { unsigned long ip = get_lock_parent_ip(); unsigned long ip = get_lock_parent_ip(); #ifdef CONFIG_DEBUG_PREEMPT #ifdef CONFIG_DEBUG_PREEMPT current->preempt_disable_ip = ip; current->preempt_disable_ip = ip; #endif #endif trace_preempt_off(CALLER_ADDR0, ip); trace_preempt_off(CALLER_ADDR0, ip); } } } } void preempt_count_add(int val) void preempt_count_add(int val) { { #ifdef CONFIG_DEBUG_PREEMPT #ifdef CONFIG_DEBUG_PREEMPT /* /* * Underflow? * Underflow? */ */ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) return; return; #endif #endif __preempt_count_add(val); __preempt_count_add(val); #ifdef CONFIG_DEBUG_PREEMPT #ifdef CONFIG_DEBUG_PREEMPT /* /* * Spinlock count overflowing soon? * Spinlock count overflowing soon? */ */ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) PREEMPT_MASK - 10); PREEMPT_MASK - 10); #endif #endif preempt_latency_start(val); preempt_latency_start(val); } } EXPORT_SYMBOL(preempt_count_add); EXPORT_SYMBOL(preempt_count_add); NOKPROBE_SYMBOL(preempt_count_add); NOKPROBE_SYMBOL(preempt_count_add); /* /* * If the value passed in equals to the current preempt count * If the value passed in equals to the current preempt count * then we just enabled preemption. Stop timing the latency. * then we just enabled preemption. Stop timing the latency. */ */ static inline void preempt_latency_stop(int val) static inline void preempt_latency_stop(int val) { { if (preempt_count() == val) if (preempt_count() == val) trace_preempt_on(CALLER_ADDR0, get_lock_paren trace_preempt_on(CALLER_ADDR0, get_lock_paren } } void preempt_count_sub(int val) void preempt_count_sub(int val) { { #ifdef CONFIG_DEBUG_PREEMPT #ifdef CONFIG_DEBUG_PREEMPT /* /* * Underflow? * Underflow? */ */ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) return; return; /* /* * Is the spinlock portion underflowing? * Is the spinlock portion underflowing? 
*/ */ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK))) !(preempt_count() & PREEMPT_MASK))) return; return; #endif #endif preempt_latency_stop(val); preempt_latency_stop(val); __preempt_count_sub(val); __preempt_count_sub(val); } } EXPORT_SYMBOL(preempt_count_sub); EXPORT_SYMBOL(preempt_count_sub); NOKPROBE_SYMBOL(preempt_count_sub); NOKPROBE_SYMBOL(preempt_count_sub); #else #else static inline void preempt_latency_start(int val) { } static inline void preempt_latency_start(int val) { } static inline void preempt_latency_stop(int val) { } static inline void preempt_latency_stop(int val) { } #endif #endif static inline unsigned long get_preempt_disable_ip(struct tas static inline unsigned long get_preempt_disable_ip(struct tas { { #ifdef CONFIG_DEBUG_PREEMPT #ifdef CONFIG_DEBUG_PREEMPT return p->preempt_disable_ip; return p->preempt_disable_ip; #else #else return 0; return 0; #endif #endif } } /* /* * Print scheduling while atomic bug: * Print scheduling while atomic bug: */ */ static noinline void __schedule_bug(struct task_struct *prev) static noinline void __schedule_bug(struct task_struct *prev) { { /* Save this before calling printk(), since that will /* Save this before calling printk(), since that will unsigned long preempt_disable_ip = get_preempt_disabl unsigned long preempt_disable_ip = get_preempt_disabl if (oops_in_progress) if (oops_in_progress) return; return; printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/ prev->comm, prev->pid, preempt_count()); prev->comm, prev->pid, preempt_count()); debug_show_held_locks(prev); debug_show_held_locks(prev); print_modules(); print_modules(); if (irqs_disabled()) if (irqs_disabled()) print_irqtrace_events(prev); print_irqtrace_events(prev); if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) && in_atomic_preempt_off()) { && in_atomic_preempt_off()) { pr_err("Preemption disabled at:"); pr_err("Preemption disabled at:"); print_ip_sym(KERN_ERR, preempt_disable_ip); print_ip_sym(KERN_ERR, preempt_disable_ip); } } check_panic_on_warn("scheduling while atomic"); check_panic_on_warn("scheduling while atomic"); dump_stack(); dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } } /* /* * Various schedule()-time debugging checks and statistics: * Various schedule()-time debugging checks and statistics: */ */ static inline void schedule_debug(struct task_struct *prev, b static inline void schedule_debug(struct task_struct *prev, b { { #ifdef CONFIG_SCHED_STACK_END_CHECK #ifdef CONFIG_SCHED_STACK_END_CHECK if (task_stack_end_corrupted(prev)) if (task_stack_end_corrupted(prev)) panic("corrupted stack end detected inside sc panic("corrupted stack end detected inside sc if (task_scs_end_corrupted(prev)) if (task_scs_end_corrupted(prev)) panic("corrupted shadow stack detected inside panic("corrupted shadow stack detected inside #endif #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP #ifdef CONFIG_DEBUG_ATOMIC_SLEEP if (!preempt && READ_ONCE(prev->__state) && prev->non if (!preempt && READ_ONCE(prev->__state) && prev->non printk(KERN_ERR "BUG: scheduling in a non-blo printk(KERN_ERR "BUG: scheduling in a non-blo prev->comm, prev->pid, prev->non_bloc prev->comm, prev->pid, prev->non_bloc dump_stack(); dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } } #endif #endif if (unlikely(in_atomic_preempt_off())) { if 
(unlikely(in_atomic_preempt_off())) { __schedule_bug(prev); __schedule_bug(prev); preempt_count_set(PREEMPT_DISABLED); preempt_count_set(PREEMPT_DISABLED); } } rcu_sleep_check(); rcu_sleep_check(); SCHED_WARN_ON(ct_state() == CONTEXT_USER); SCHED_WARN_ON(ct_state() == CONTEXT_USER); profile_hit(SCHED_PROFILING, __builtin_return_address profile_hit(SCHED_PROFILING, __builtin_return_address schedstat_inc(this_rq()->sched_count); schedstat_inc(this_rq()->sched_count); } } static void put_prev_task_balance(struct rq *rq, struct task_ static void put_prev_task_balance(struct rq *rq, struct task_ struct rq_flags *rf) struct rq_flags *rf) { { #ifdef CONFIG_SMP #ifdef CONFIG_SMP const struct sched_class *class; const struct sched_class *class; /* /* * We must do the balancing pass before put_prev_task * We must do the balancing pass before put_prev_task * that when we release the rq->lock the task is in t * that when we release the rq->lock the task is in t * state as before we took rq->lock. * state as before we took rq->lock. * * * We can terminate the balance pass as soon as we kn * We can terminate the balance pass as soon as we kn * a runnable task of @class priority or higher. * a runnable task of @class priority or higher. */ */ for_class_range(class, prev->sched_class, &idle_sched for_class_range(class, prev->sched_class, &idle_sched if (class->balance(rq, prev, rf)) if (class->balance(rq, prev, rf)) break; break; } } #endif #endif put_prev_task(rq, prev); put_prev_task(rq, prev); } } /* /* * Pick up the highest-prio task: * Pick up the highest-prio task: */ */ static inline struct task_struct * static inline struct task_struct * __pick_next_task(struct rq *rq, struct task_struct *prev, str __pick_next_task(struct rq *rq, struct task_struct *prev, str { { const struct sched_class *class; const struct sched_class *class; struct task_struct *p; struct task_struct *p; /* /* * Optimization: we know that if all tasks are in the * Optimization: we know that if all tasks are in the * call that function directly, but only if the @prev * call that function directly, but only if the @prev * higher scheduling class, because otherwise those l * higher scheduling class, because otherwise those l * opportunity to pull in more work from other CPUs. * opportunity to pull in more work from other CPUs. 
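 *
 * Put differently (an illustration, not text from the original source),
 * the fast path below is only taken when:
 *
 *      !sched_class_above(prev->sched_class, &fair_sched_class) &&
 *      rq->nr_running == rq->cfs.h_nr_running
 *
 * i.e. every runnable task on this rq is in the fair class and @prev is
 * not of a higher class, so pick_next_task_fair() can be used directly.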
*/ */ if (likely(!sched_class_above(prev->sched_class, &fai if (likely(!sched_class_above(prev->sched_class, &fai rq->nr_running == rq->cfs.h_nr_running)) { rq->nr_running == rq->cfs.h_nr_running)) { p = pick_next_task_fair(rq, prev, rf); p = pick_next_task_fair(rq, prev, rf); if (unlikely(p == RETRY_TASK)) if (unlikely(p == RETRY_TASK)) goto restart; goto restart; /* Assume the next prioritized class is idle_ /* Assume the next prioritized class is idle_ if (!p) { if (!p) { put_prev_task(rq, prev); put_prev_task(rq, prev); p = pick_next_task_idle(rq); p = pick_next_task_idle(rq); } } return p; return p; } } restart: restart: put_prev_task_balance(rq, prev, rf); put_prev_task_balance(rq, prev, rf); for_each_class(class) { for_each_class(class) { p = class->pick_next_task(rq); p = class->pick_next_task(rq); if (p) if (p) return p; return p; } } BUG(); /* The idle class should always have a runnabl BUG(); /* The idle class should always have a runnabl } } #ifdef CONFIG_SCHED_CORE #ifdef CONFIG_SCHED_CORE static inline bool is_task_rq_idle(struct task_struct *t) static inline bool is_task_rq_idle(struct task_struct *t) { { return (task_rq(t)->idle == t); return (task_rq(t)->idle == t); } } static inline bool cookie_equals(struct task_struct *a, unsig static inline bool cookie_equals(struct task_struct *a, unsig { { return is_task_rq_idle(a) || (a->core_cookie == cooki return is_task_rq_idle(a) || (a->core_cookie == cooki } } static inline bool cookie_match(struct task_struct *a, struct static inline bool cookie_match(struct task_struct *a, struct { { if (is_task_rq_idle(a) || is_task_rq_idle(b)) if (is_task_rq_idle(a) || is_task_rq_idle(b)) return true; return true; return a->core_cookie == b->core_cookie; return a->core_cookie == b->core_cookie; } } static inline struct task_struct *pick_task(struct rq *rq) static inline struct task_struct *pick_task(struct rq *rq) { { const struct sched_class *class; const struct sched_class *class; struct task_struct *p; struct task_struct *p; for_each_class(class) { for_each_class(class) { p = class->pick_task(rq); p = class->pick_task(rq); if (p) if (p) return p; return p; } } BUG(); /* The idle class should always have a runnabl BUG(); /* The idle class should always have a runnabl } } extern void task_vruntime_update(struct rq *rq, struct task_s extern void task_vruntime_update(struct rq *rq, struct task_s static void queue_core_balance(struct rq *rq); static void queue_core_balance(struct rq *rq); static struct task_struct * static struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev, struc pick_next_task(struct rq *rq, struct task_struct *prev, struc { { struct task_struct *next, *p, *max = NULL; struct task_struct *next, *p, *max = NULL; const struct cpumask *smt_mask; const struct cpumask *smt_mask; bool fi_before = false; bool fi_before = false; bool core_clock_updated = (rq == rq->core); bool core_clock_updated = (rq == rq->core); unsigned long cookie; unsigned long cookie; int i, cpu, occ = 0; int i, cpu, occ = 0; struct rq *rq_i; struct rq *rq_i; bool need_sync; bool need_sync; if (!sched_core_enabled(rq)) if (!sched_core_enabled(rq)) return __pick_next_task(rq, prev, rf); return __pick_next_task(rq, prev, rf); cpu = cpu_of(rq); cpu = cpu_of(rq); /* Stopper task is switching into idle, no need core- /* Stopper task is switching into idle, no need core- if (cpu_is_offline(cpu)) { if (cpu_is_offline(cpu)) { /* /* * Reset core_pick so that we don't enter the * Reset core_pick so that we don't enter the * coming online. 
core_pick would already be * coming online. core_pick would already be * another cpu during offline. * another cpu during offline. */ */ rq->core_pick = NULL; rq->core_pick = NULL; return __pick_next_task(rq, prev, rf); return __pick_next_task(rq, prev, rf); } } /* /* * If there were no {en,de}queues since we picked (IO * If there were no {en,de}queues since we picked (IO * pointers are all still valid), and we haven't sche * pointers are all still valid), and we haven't sche * pick yet, do so now. * pick yet, do so now. * * * rq->core_pick can be NULL if no selection was made * rq->core_pick can be NULL if no selection was made * it was either offline or went offline during a sib * it was either offline or went offline during a sib * selection. In this case, do a core-wide selection. * selection. In this case, do a core-wide selection. */ */ if (rq->core->core_pick_seq == rq->core->core_task_se if (rq->core->core_pick_seq == rq->core->core_task_se rq->core->core_pick_seq != rq->core_sched_seq && rq->core->core_pick_seq != rq->core_sched_seq && rq->core_pick) { rq->core_pick) { WRITE_ONCE(rq->core_sched_seq, rq->core->core WRITE_ONCE(rq->core_sched_seq, rq->core->core next = rq->core_pick; next = rq->core_pick; if (next != prev) { if (next != prev) { put_prev_task(rq, prev); put_prev_task(rq, prev); set_next_task(rq, next); set_next_task(rq, next); } } rq->core_pick = NULL; rq->core_pick = NULL; goto out; goto out; } } put_prev_task_balance(rq, prev, rf); put_prev_task_balance(rq, prev, rf); smt_mask = cpu_smt_mask(cpu); smt_mask = cpu_smt_mask(cpu); need_sync = !!rq->core->core_cookie; need_sync = !!rq->core->core_cookie; /* reset state */ /* reset state */ rq->core->core_cookie = 0UL; rq->core->core_cookie = 0UL; if (rq->core->core_forceidle_count) { if (rq->core->core_forceidle_count) { if (!core_clock_updated) { if (!core_clock_updated) { update_rq_clock(rq->core); update_rq_clock(rq->core); core_clock_updated = true; core_clock_updated = true; } } sched_core_account_forceidle(rq); sched_core_account_forceidle(rq); /* reset after accounting force idle */ /* reset after accounting force idle */ rq->core->core_forceidle_start = 0; rq->core->core_forceidle_start = 0; rq->core->core_forceidle_count = 0; rq->core->core_forceidle_count = 0; rq->core->core_forceidle_occupation = 0; rq->core->core_forceidle_occupation = 0; need_sync = true; need_sync = true; fi_before = true; fi_before = true; } } /* /* * core->core_task_seq, core->core_pick_seq, rq->core * core->core_task_seq, core->core_pick_seq, rq->core * * * @task_seq guards the task state ({en,de}queues) * @task_seq guards the task state ({en,de}queues) * @pick_seq is the @task_seq we did a selection on * @pick_seq is the @task_seq we did a selection on * @sched_seq is the @pick_seq we scheduled * @sched_seq is the @pick_seq we scheduled * * * However, preemptions can cause multiple picks on t * However, preemptions can cause multiple picks on t * 'Fix' this by also increasing @task_seq for every * 'Fix' this by also increasing @task_seq for every */ */ rq->core->core_task_seq++; rq->core->core_task_seq++; /* /* * Optimize for common case where this CPU has no coo * Optimize for common case where this CPU has no coo * and there are no cookied tasks running on siblings * and there are no cookied tasks running on siblings */ */ if (!need_sync) { if (!need_sync) { next = pick_task(rq); next = pick_task(rq); if (!next->core_cookie) { if (!next->core_cookie) { rq->core_pick = NULL; rq->core_pick = NULL; /* /* * For robustness, update the 
min_vru * For robustness, update the min_vru * unconstrained picks as well. * unconstrained picks as well. */ */ WARN_ON_ONCE(fi_before); WARN_ON_ONCE(fi_before); task_vruntime_update(rq, next, false) task_vruntime_update(rq, next, false) goto out_set_next; goto out_set_next; } } } } /* /* * For each thread: do the regular task pick and find * For each thread: do the regular task pick and find * amongst them. * amongst them. * * * Tie-break prio towards the current CPU * Tie-break prio towards the current CPU */ */ for_each_cpu_wrap(i, smt_mask, cpu) { for_each_cpu_wrap(i, smt_mask, cpu) { rq_i = cpu_rq(i); rq_i = cpu_rq(i); /* /* * Current cpu always has its clock updated o * Current cpu always has its clock updated o * pick_next_task(). If the current cpu is no * pick_next_task(). If the current cpu is no * the core may also have been updated above. * the core may also have been updated above. */ */ if (i != cpu && (rq_i != rq->core || !core_cl if (i != cpu && (rq_i != rq->core || !core_cl update_rq_clock(rq_i); update_rq_clock(rq_i); p = rq_i->core_pick = pick_task(rq_i); p = rq_i->core_pick = pick_task(rq_i); if (!max || prio_less(max, p, fi_before)) if (!max || prio_less(max, p, fi_before)) max = p; max = p; } } cookie = rq->core->core_cookie = max->core_cookie; cookie = rq->core->core_cookie = max->core_cookie; /* /* * For each thread: try and find a runnable task that * For each thread: try and find a runnable task that * force idle. * force idle. */ */ for_each_cpu(i, smt_mask) { for_each_cpu(i, smt_mask) { rq_i = cpu_rq(i); rq_i = cpu_rq(i); p = rq_i->core_pick; p = rq_i->core_pick; if (!cookie_equals(p, cookie)) { if (!cookie_equals(p, cookie)) { p = NULL; p = NULL; if (cookie) if (cookie) p = sched_core_find(rq_i, coo p = sched_core_find(rq_i, coo if (!p) if (!p) p = idle_sched_class.pick_tas p = idle_sched_class.pick_tas } } rq_i->core_pick = p; rq_i->core_pick = p; if (p == rq_i->idle) { if (p == rq_i->idle) { if (rq_i->nr_running) { if (rq_i->nr_running) { rq->core->core_forceidle_coun rq->core->core_forceidle_coun if (!fi_before) if (!fi_before) rq->core->core_forcei rq->core->core_forcei } } } else { } else { occ++; occ++; } } } } if (schedstat_enabled() && rq->core->core_forceidle_c if (schedstat_enabled() && rq->core->core_forceidle_c rq->core->core_forceidle_start = rq_clock(rq- rq->core->core_forceidle_start = rq_clock(rq- rq->core->core_forceidle_occupation = occ; rq->core->core_forceidle_occupation = occ; } } rq->core->core_pick_seq = rq->core->core_task_seq; rq->core->core_pick_seq = rq->core->core_task_seq; next = rq->core_pick; next = rq->core_pick; rq->core_sched_seq = rq->core->core_pick_seq; rq->core_sched_seq = rq->core->core_pick_seq; /* Something should have been selected for current CP /* Something should have been selected for current CP WARN_ON_ONCE(!next); WARN_ON_ONCE(!next); /* /* * Reschedule siblings * Reschedule siblings * * * NOTE: L1TF -- at this point we're no longer runnin * NOTE: L1TF -- at this point we're no longer runnin * sending an IPI (below) ensures the sibling will no * sending an IPI (below) ensures the sibling will no * their task. This ensures there is no inter-sibling * their task. This ensures there is no inter-sibling * non-matching user state. * non-matching user state. 
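 *
 * In short (a summary, not text from the original source): each sibling
 * whose current task does not match its rq->core_pick gets a
 * resched_curr() below, so it re-enters __schedule() and installs the
 * pick that was selected for it here.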
*/ */ for_each_cpu(i, smt_mask) { for_each_cpu(i, smt_mask) { rq_i = cpu_rq(i); rq_i = cpu_rq(i); /* /* * An online sibling might have gone offline * An online sibling might have gone offline * could be picked for it, or it might be off * could be picked for it, or it might be off * happen to come online, but its too late an * happen to come online, but its too late an * picked for it. That's Ok - it will pick t * picked for it. That's Ok - it will pick t * so ignore it. * so ignore it. */ */ if (!rq_i->core_pick) if (!rq_i->core_pick) continue; continue; /* /* * Update for new !FI->FI transitions, or if * Update for new !FI->FI transitions, or if * fi_before fi update? * fi_before fi update? * 0 0 1 * 0 0 1 * 0 1 1 * 0 1 1 * 1 0 1 * 1 0 1 * 1 1 0 * 1 1 0 */ */ if (!(fi_before && rq->core->core_forceidle_c if (!(fi_before && rq->core->core_forceidle_c task_vruntime_update(rq_i, rq_i->core task_vruntime_update(rq_i, rq_i->core rq_i->core_pick->core_occupation = occ; rq_i->core_pick->core_occupation = occ; if (i == cpu) { if (i == cpu) { rq_i->core_pick = NULL; rq_i->core_pick = NULL; continue; continue; } } /* Did we break L1TF mitigation requirements? /* Did we break L1TF mitigation requirements? WARN_ON_ONCE(!cookie_match(next, rq_i->core_p WARN_ON_ONCE(!cookie_match(next, rq_i->core_p if (rq_i->curr == rq_i->core_pick) { if (rq_i->curr == rq_i->core_pick) { rq_i->core_pick = NULL; rq_i->core_pick = NULL; continue; continue; } } resched_curr(rq_i); resched_curr(rq_i); } } out_set_next: out_set_next: set_next_task(rq, next); set_next_task(rq, next); out: out: if (rq->core->core_forceidle_count && next == rq->idl if (rq->core->core_forceidle_count && next == rq->idl queue_core_balance(rq); queue_core_balance(rq); return next; return next; } } static bool try_steal_cookie(int this, int that) static bool try_steal_cookie(int this, int that) { { struct rq *dst = cpu_rq(this), *src = cpu_rq(that); struct rq *dst = cpu_rq(this), *src = cpu_rq(that); struct task_struct *p; struct task_struct *p; unsigned long cookie; unsigned long cookie; bool success = false; bool success = false; guard(irq)(); guard(irq)(); guard(double_rq_lock)(dst, src); guard(double_rq_lock)(dst, src); cookie = dst->core->core_cookie; cookie = dst->core->core_cookie; if (!cookie) if (!cookie) return false; return false; if (dst->curr != dst->idle) if (dst->curr != dst->idle) return false; return false; p = sched_core_find(src, cookie); p = sched_core_find(src, cookie); if (!p) if (!p) return false; return false; do { do { if (p == src->core_pick || p == src->curr) if (p == src->core_pick || p == src->curr) goto next; goto next; if (!is_cpu_allowed(p, this)) if (!is_cpu_allowed(p, this)) goto next; goto next; if (p->core_occupation > dst->idle->core_occu if (p->core_occupation > dst->idle->core_occu goto next; goto next; /* /* * sched_core_find() and sched_core_next() wi * sched_core_find() and sched_core_next() wi * that task @p is not throttled now, we also * that task @p is not throttled now, we also * check whether the runqueue of the destinat * check whether the runqueue of the destinat * being throttled. * being throttled. 
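 *
 * The steal itself (summarizing the statements just below, not text
 * from the original source) is the usual migration pattern done under
 * both runqueue locks:
 *
 *      deactivate_task(src, p, 0);
 *      set_task_cpu(p, this);
 *      activate_task(dst, p, 0);
 *      resched_curr(dst);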
*/ */ if (sched_task_is_throttled(p, this)) if (sched_task_is_throttled(p, this)) goto next; goto next; deactivate_task(src, p, 0); deactivate_task(src, p, 0); set_task_cpu(p, this); set_task_cpu(p, this); activate_task(dst, p, 0); activate_task(dst, p, 0); resched_curr(dst); resched_curr(dst); success = true; success = true; break; break; next: next: p = sched_core_next(p, cookie); p = sched_core_next(p, cookie); } while (p); } while (p); return success; return success; } } static bool steal_cookie_task(int cpu, struct sched_domain *s static bool steal_cookie_task(int cpu, struct sched_domain *s { { int i; int i; for_each_cpu_wrap(i, sched_domain_span(sd), cpu + 1) for_each_cpu_wrap(i, sched_domain_span(sd), cpu + 1) if (i == cpu) if (i == cpu) continue; continue; if (need_resched()) if (need_resched()) break; break; if (try_steal_cookie(cpu, i)) if (try_steal_cookie(cpu, i)) return true; return true; } } return false; return false; } } static void sched_core_balance(struct rq *rq) static void sched_core_balance(struct rq *rq) { { struct sched_domain *sd; struct sched_domain *sd; int cpu = cpu_of(rq); int cpu = cpu_of(rq); preempt_disable(); preempt_disable(); rcu_read_lock(); rcu_read_lock(); raw_spin_rq_unlock_irq(rq); raw_spin_rq_unlock_irq(rq); for_each_domain(cpu, sd) { for_each_domain(cpu, sd) { if (need_resched()) if (need_resched()) break; break; if (steal_cookie_task(cpu, sd)) if (steal_cookie_task(cpu, sd)) break; break; } } raw_spin_rq_lock_irq(rq); raw_spin_rq_lock_irq(rq); rcu_read_unlock(); rcu_read_unlock(); preempt_enable(); preempt_enable(); } } static DEFINE_PER_CPU(struct balance_callback, core_balance_h static DEFINE_PER_CPU(struct balance_callback, core_balance_h static void queue_core_balance(struct rq *rq) static void queue_core_balance(struct rq *rq) { { if (!sched_core_enabled(rq)) if (!sched_core_enabled(rq)) return; return; if (!rq->core->core_cookie) if (!rq->core->core_cookie) return; return; if (!rq->nr_running) /* not forced idle */ if (!rq->nr_running) /* not forced idle */ return; return; queue_balance_callback(rq, &per_cpu(core_balance_head queue_balance_callback(rq, &per_cpu(core_balance_head } } DEFINE_LOCK_GUARD_1(core_lock, int, DEFINE_LOCK_GUARD_1(core_lock, int, sched_core_lock(*_T->lock, &_T->flags), sched_core_lock(*_T->lock, &_T->flags), sched_core_unlock(*_T->lock, &_T->flags), sched_core_unlock(*_T->lock, &_T->flags), unsigned long flags) unsigned long flags) static void sched_core_cpu_starting(unsigned int cpu) static void sched_core_cpu_starting(unsigned int cpu) { { const struct cpumask *smt_mask = cpu_smt_mask(cpu); const struct cpumask *smt_mask = cpu_smt_mask(cpu); struct rq *rq = cpu_rq(cpu), *core_rq = NULL; struct rq *rq = cpu_rq(cpu), *core_rq = NULL; int t; int t; guard(core_lock)(&cpu); guard(core_lock)(&cpu); WARN_ON_ONCE(rq->core != rq); WARN_ON_ONCE(rq->core != rq); /* if we're the first, we'll be our own leader */ /* if we're the first, we'll be our own leader */ if (cpumask_weight(smt_mask) == 1) if (cpumask_weight(smt_mask) == 1) return; return; /* find the leader */ /* find the leader */ for_each_cpu(t, smt_mask) { for_each_cpu(t, smt_mask) { if (t == cpu) if (t == cpu) continue; continue; rq = cpu_rq(t); rq = cpu_rq(t); if (rq->core == rq) { if (rq->core == rq) { core_rq = rq; core_rq = rq; break; break; } } } } if (WARN_ON_ONCE(!core_rq)) /* whoopsie */ if (WARN_ON_ONCE(!core_rq)) /* whoopsie */ return; return; /* install and validate core_rq */ /* install and validate core_rq */ for_each_cpu(t, smt_mask) { for_each_cpu(t, 
smt_mask) { rq = cpu_rq(t); rq = cpu_rq(t); if (t == cpu) if (t == cpu) rq->core = core_rq; rq->core = core_rq; WARN_ON_ONCE(rq->core != core_rq); WARN_ON_ONCE(rq->core != core_rq); } } } } static void sched_core_cpu_deactivate(unsigned int cpu) static void sched_core_cpu_deactivate(unsigned int cpu) { { const struct cpumask *smt_mask = cpu_smt_mask(cpu); const struct cpumask *smt_mask = cpu_smt_mask(cpu); struct rq *rq = cpu_rq(cpu), *core_rq = NULL; struct rq *rq = cpu_rq(cpu), *core_rq = NULL; int t; int t; guard(core_lock)(&cpu); guard(core_lock)(&cpu); /* if we're the last man standing, nothing to do */ /* if we're the last man standing, nothing to do */ if (cpumask_weight(smt_mask) == 1) { if (cpumask_weight(smt_mask) == 1) { WARN_ON_ONCE(rq->core != rq); WARN_ON_ONCE(rq->core != rq); return; return; } } /* if we're not the leader, nothing to do */ /* if we're not the leader, nothing to do */ if (rq->core != rq) if (rq->core != rq) return; return; /* find a new leader */ /* find a new leader */ for_each_cpu(t, smt_mask) { for_each_cpu(t, smt_mask) { if (t == cpu) if (t == cpu) continue; continue; core_rq = cpu_rq(t); core_rq = cpu_rq(t); break; break; } } if (WARN_ON_ONCE(!core_rq)) /* impossible */ if (WARN_ON_ONCE(!core_rq)) /* impossible */ return; return; /* copy the shared state to the new leader */ /* copy the shared state to the new leader */ core_rq->core_task_seq = rq->core_task_se core_rq->core_task_seq = rq->core_task_se core_rq->core_pick_seq = rq->core_pick_se core_rq->core_pick_seq = rq->core_pick_se core_rq->core_cookie = rq->core_cookie; core_rq->core_cookie = rq->core_cookie; core_rq->core_forceidle_count = rq->core_forceid core_rq->core_forceidle_count = rq->core_forceid core_rq->core_forceidle_seq = rq->core_forceid core_rq->core_forceidle_seq = rq->core_forceid core_rq->core_forceidle_occupation = rq->core_forceid core_rq->core_forceidle_occupation = rq->core_forceid /* /* * Accounting edge for forced idle is handled in pick * Accounting edge for forced idle is handled in pick * Don't need another one here, since the hotplug thr * Don't need another one here, since the hotplug thr * have a cookie. * have a cookie. */ */ core_rq->core_forceidle_start = 0; core_rq->core_forceidle_start = 0; /* install new leader */ /* install new leader */ for_each_cpu(t, smt_mask) { for_each_cpu(t, smt_mask) { rq = cpu_rq(t); rq = cpu_rq(t); rq->core = core_rq; rq->core = core_rq; } } } } static inline void sched_core_cpu_dying(unsigned int cpu) static inline void sched_core_cpu_dying(unsigned int cpu) { { struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu); if (rq->core != rq) if (rq->core != rq) rq->core = rq; rq->core = rq; } } #else /* !CONFIG_SCHED_CORE */ #else /* !CONFIG_SCHED_CORE */ static inline void sched_core_cpu_starting(unsigned int cpu) static inline void sched_core_cpu_starting(unsigned int cpu) static inline void sched_core_cpu_deactivate(unsigned int cpu static inline void sched_core_cpu_deactivate(unsigned int cpu static inline void sched_core_cpu_dying(unsigned int cpu) {} static inline void sched_core_cpu_dying(unsigned int cpu) {} static struct task_struct * static struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev, struc pick_next_task(struct rq *rq, struct task_struct *prev, struc { { return __pick_next_task(rq, prev, rf); return __pick_next_task(rq, prev, rf); } } #endif /* CONFIG_SCHED_CORE */ #endif /* CONFIG_SCHED_CORE */ /* /* * Constants for the sched_mode argument of __schedule(). 
* Constants for the sched_mode argument of __schedule(). * * * The mode argument allows RT enabled kernels to differentia * The mode argument allows RT enabled kernels to differentia * preemption from blocking on an 'sleeping' spin/rwlock. Not * preemption from blocking on an 'sleeping' spin/rwlock. Not * SM_MASK_PREEMPT for !RT has all bits set, which allows the * SM_MASK_PREEMPT for !RT has all bits set, which allows the * optimize the AND operation out and just check for zero. * optimize the AND operation out and just check for zero. */ */ #define SM_NONE 0x0 #define SM_NONE 0x0 #define SM_PREEMPT 0x1 #define SM_PREEMPT 0x1 #define SM_RTLOCK_WAIT 0x2 #define SM_RTLOCK_WAIT 0x2 #ifndef CONFIG_PREEMPT_RT #ifndef CONFIG_PREEMPT_RT # define SM_MASK_PREEMPT (~0U) # define SM_MASK_PREEMPT (~0U) #else #else # define SM_MASK_PREEMPT SM_PREEMPT # define SM_MASK_PREEMPT SM_PREEMPT #endif #endif /* /* * __schedule() is the main scheduler function. * __schedule() is the main scheduler function. * * * The main means of driving the scheduler and thus entering * The main means of driving the scheduler and thus entering * * * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. * * * 2. TIF_NEED_RESCHED flag is checked on interrupt and use * 2. TIF_NEED_RESCHED flag is checked on interrupt and use * paths. For example, see arch/x86/entry_64.S. * paths. For example, see arch/x86/entry_64.S. * * * To drive preemption between tasks, the scheduler sets * To drive preemption between tasks, the scheduler sets * interrupt handler scheduler_tick(). * interrupt handler scheduler_tick(). * * * 3. Wakeups don't really cause entry into schedule(). The * 3. Wakeups don't really cause entry into schedule(). The * task to the run-queue and that's it. * task to the run-queue and that's it. * * * Now, if the new task added to the run-queue preempts * Now, if the new task added to the run-queue preempts * task, then the wakeup sets TIF_NEED_RESCHED and sched * task, then the wakeup sets TIF_NEED_RESCHED and sched * called on the nearest possible occasion: * called on the nearest possible occasion: * * * - If the kernel is preemptible (CONFIG_PREEMPTION=y) * - If the kernel is preemptible (CONFIG_PREEMPTION=y) * * * - in syscall or exception context, at the next out * - in syscall or exception context, at the next out * preempt_enable(). (this might be as soon as the * preempt_enable(). (this might be as soon as the * spin_unlock()!) * spin_unlock()!) * * * - in IRQ context, return from interrupt-handler to * - in IRQ context, return from interrupt-handler to * preemptible context * preemptible context * * * - If the kernel is not preemptible (CONFIG_PREEMPTIO * - If the kernel is not preemptible (CONFIG_PREEMPTIO * then at the next: * then at the next: * * * - cond_resched() call * - cond_resched() call * - explicit schedule() call * - explicit schedule() call * - return from syscall or exception to user-space * - return from syscall or exception to user-space * - return from interrupt-handler to user-space * - return from interrupt-handler to user-space * * * WARNING: must be called with preemption disabled! * WARNING: must be called with preemption disabled! 
*/ */ static void __sched notrace __schedule(unsigned int sched_mod static void __sched notrace __schedule(unsigned int sched_mod { { struct task_struct *prev, *next; struct task_struct *prev, *next; unsigned long *switch_count; unsigned long *switch_count; unsigned long prev_state; unsigned long prev_state; struct rq_flags rf; struct rq_flags rf; struct rq *rq; struct rq *rq; int cpu; int cpu; cpu = smp_processor_id(); cpu = smp_processor_id(); rq = cpu_rq(cpu); rq = cpu_rq(cpu); prev = rq->curr; prev = rq->curr; schedule_debug(prev, !!sched_mode); schedule_debug(prev, !!sched_mode); if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) hrtick_clear(rq); hrtick_clear(rq); local_irq_disable(); local_irq_disable(); rcu_note_context_switch(!!sched_mode); rcu_note_context_switch(!!sched_mode); /* /* * Make sure that signal_pending_state()->signal_pend * Make sure that signal_pending_state()->signal_pend * can't be reordered with __set_current_state(TASK_I * can't be reordered with __set_current_state(TASK_I * done by the caller to avoid the race with signal_w * done by the caller to avoid the race with signal_w * * * __set_current_state(@state) signal_wake_u * __set_current_state(@state) signal_wake_u * schedule() set_tsk_thr * schedule() set_tsk_thr * wake_up_sta * wake_up_sta * LOCK rq->lock LOCK p->p * LOCK rq->lock LOCK p->p * smp_mb__after_spinlock() smp_mb__a * smp_mb__after_spinlock() smp_mb__a * if (signal_pending_state()) if (p->st * if (signal_pending_state()) if (p->st * * * Also, the membarrier system call requires a full m * Also, the membarrier system call requires a full m * after coming from user-space, before storing to rq * after coming from user-space, before storing to rq */ */ rq_lock(rq, &rf); rq_lock(rq, &rf); smp_mb__after_spinlock(); smp_mb__after_spinlock(); /* Promote REQ to ACT */ /* Promote REQ to ACT */ rq->clock_update_flags <<= 1; rq->clock_update_flags <<= 1; update_rq_clock(rq); update_rq_clock(rq); rq->clock_update_flags = RQCF_UPDATED; rq->clock_update_flags = RQCF_UPDATED; switch_count = &prev->nivcsw; switch_count = &prev->nivcsw; /* /* * We must load prev->state once (task_struct::state * We must load prev->state once (task_struct::state * that we form a control dependency vs deactivate_ta * that we form a control dependency vs deactivate_ta */ */ prev_state = READ_ONCE(prev->__state); prev_state = READ_ONCE(prev->__state); if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) { if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) { if (signal_pending_state(prev_state, prev)) { if (signal_pending_state(prev_state, prev)) { WRITE_ONCE(prev->__state, TASK_RUNNIN WRITE_ONCE(prev->__state, TASK_RUNNIN } else { } else { prev->sched_contributes_to_load = prev->sched_contributes_to_load = (prev_state & TASK_UNINTERRUP (prev_state & TASK_UNINTERRUP !(prev_state & TASK_NOLOAD) & !(prev_state & TASK_NOLOAD) & !(prev_state & TASK_FROZEN); !(prev_state & TASK_FROZEN); if (prev->sched_contributes_to_load) if (prev->sched_contributes_to_load) rq->nr_uninterruptible++; rq->nr_uninterruptible++; /* /* * __schedule() ttwu( * __schedule() ttwu( * prev_state = prev->state; if * prev_state = prev->state; if * if (prev_state) g * if (prev_state) g * p->on_rq = 0; smp * p->on_rq = 0; smp * p-> * p-> * * * Where __schedule() and ttwu() have * Where __schedule() and ttwu() have * * * After this, schedule() must not ca * After this, schedule() must not ca */ */ deactivate_task(rq, prev, DEQUEUE_SLE deactivate_task(rq, prev, DEQUEUE_SLE if 
(prev->in_iowait) {
				atomic_inc(&rq->nr_iowait);
				delayacct_blkio_start();
			}
		}
		switch_count = &prev->nvcsw;
	}

	next = pick_next_task(rq, prev, &rf);
	clear_tsk_need_resched(prev);
	clear_preempt_need_resched();
#ifdef CONFIG_SCHED_DEBUG
	rq->last_seen_need_resched_ns = 0;
#endif

	if (likely(prev != next)) {
		rq->nr_switches++;
		/*
		 * RCU users of rcu_dereference(rq->curr) may not see
		 * changes to task_struct made by pick_next_task().
		 */
		RCU_INIT_POINTER(rq->curr, next);
		/*
		 * The membarrier system call requires each architecture
		 * to have a full memory barrier after updating
		 * rq->curr, before returning to user-space.
		 *
		 * Here are the schemes providing that barrier on the
		 * various architectures:
		 * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC,
		 *   switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
		 * - finish_lock_switch() for weakly-ordered
		 *   architectures where spin_unlock is a full barrier,
		 * - switch_to() for arm64 (weakly-ordered, spin_unlock
		 *   is a RELEASE barrier),
		 */
		++*switch_count;

		migrate_disable_switch(rq, prev);
		psi_sched_switch(prev, next, !task_on_rq_queued(prev));

		trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);

		/* Also unlocks the rq: */
		rq = context_switch(rq, prev, next, &rf);
	} else {
		rq_unpin_lock(rq, &rf);
		__balance_callbacks(rq);
		raw_spin_rq_unlock_irq(rq);
	}
}

void __noreturn do_task_dead(void)
{
	/* Causes final put_task_struct in finish_task_switch(): */
	set_special_state(TASK_DEAD);

	/* Tell freezer to ignore us: */
	current->flags |= PF_NOFREEZE;

	__schedule(SM_NONE);
	BUG();

	/* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
	for (;;)
		cpu_relax();
}

static inline void sched_submit_work(struct task_struct *tsk)
{
	static DEFINE_WAIT_OVERRIDE_MAP(sched_map, LD_WAIT_CONFIG);
	unsigned int task_flags;

	/*
	 * Establish LD_WAIT_CONFIG context to ensure none of the code called
	 * will use a blocking primitive -- which would lead to immediate
	 * deadlock.
	 */
	lock_map_acquire_try(&sched_map);

	task_flags = tsk->flags;
	/*
	 * If a worker goes to sleep, notify and ask workqueue whether it
	 * wants to wake up a task to maintain concurrency.
	 */
	if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
		if (task_flags & PF_WQ_WORKER)
			wq_worker_sleeping(tsk);
		else
			io_wq_worker_sleeping(tsk);
	}

	/*
	 * spinlock and rwlock must not flush block requests. This would
	 * deadlock if the callback attempts to acquire a lock which is
	 * already acquired.
	 */
	SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT);

	/*
	 * If we are going to sleep and we have plugged IO queued,
	 * make sure to submit it to avoid deadlocks.
	 */
	blk_flush_plug(tsk->plug, true);

	lock_map_release(&sched_map);
}

static void sched_update_worker(struct task_struct *tsk)
{
	if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
		if (tsk->flags & PF_WQ_WORKER)
			wq_worker_running(tsk);
		else
			io_wq_worker_running(tsk);
	}
}

static __always_inline void __schedule_loop(unsigned int sched_mode)
{
	do {
		preempt_disable();
		__schedule(sched_mode);
		sched_preempt_enable_no_resched();
	} while (need_resched());
}

asmlinkage __visible void __sched schedule(void)
{
	struct task_struct *tsk = current;

#ifdef CONFIG_RT_MUTEXES
	lockdep_assert(!tsk->sched_rt_mutex);
#endif

	if (!task_is_running(tsk))
		sched_submit_work(tsk);
	__schedule_loop(SM_NONE);
	sched_update_worker(tsk);
}
EXPORT_SYMBOL(schedule);
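/*
 * Illustrative sketch, not part of core.c: the "explicit blocking" entry into
 * schedule() described in the __schedule() comment above almost always takes
 * the shape of the classic wait loop below. The wait_for_my_event() and
 * my_event_ready() names are made up for this example; real code would
 * normally use wait_event(), completions or similar helpers that wrap this
 * pattern. Setting the task state *before* testing the condition is what
 * closes the missed-wakeup race against try_to_wake_up() discussed in
 * __schedule().
 */
#if 0	/* example only -- assumes a hypothetical my_event_ready() predicate */
static void wait_for_my_event(void)
{
	for (;;) {
		/* Publish the sleeping state before checking the condition. */
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (my_event_ready())
			break;
		/* Blocks here; the waker's wake_up() makes us runnable again. */
		schedule();
	}
	__set_current_state(TASK_RUNNING);
}
#endif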
/*
 * synchronize_rcu_tasks() makes sure that no task is stuck in preempted
 * state (have scheduled out non-voluntarily) by making sure that all
 * tasks have either left the run queue or have gone into user space.
 * As idle tasks do not do either, they must not ever be preempted
 * (schedule out non-voluntarily).
 *
 * schedule_idle() is similar to schedule_preempt_disable() except that it
 * never enables preemption because it does not call sched_submit_work().
 */
void __sched schedule_idle(void)
{
	/*
	 * As this skips calling sched_submit_work(), which the idle task does
	 * regardless because that function is a nop when the task is in a
	 * TASK_RUNNING state, make sure this isn't used someplace that the
	 * current task can be in any other state. Note, idle is always in the
	 * TASK_RUNNING state.
	 */
	WARN_ON_ONCE(current->__state);
	do {
		__schedule(SM_NONE);
	} while (need_resched());
}

#if defined(CONFIG_CONTEXT_TRACKING_USER) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK)
asmlinkage __visible void __sched schedule_user(void)
{
	/*
	 * If we come here after a random call to set_need_resched(),
	 * or we have been woken up remotely but the IPI has not yet arrived,
	 * we haven't yet exited the RCU idle mode. Do it here manually until
	 * we find a better solution.
	 *
	 * NB: There are buggy callers of this function. Ideally we
	 * should warn if prev_state != CONTEXT_USER, but that will trigger
	 * too frequently to make sense yet.
	 */
	enum ctx_state prev_state = exception_enter();
	schedule();
	exception_exit(prev_state);
}
#endif

/**
 * schedule_preempt_disabled - called with preemption disabled
 *
 * Returns with preemption disabled. Note: preempt_count must be 1
 */
void __sched schedule_preempt_disabled(void)
{
	sched_preempt_enable_no_resched();
	schedule();
	preempt_disable();
}

#ifdef CONFIG_PREEMPT_RT
void __sched notrace schedule_rtlock(void)
{
	__schedule_loop(SM_RTLOCK_WAIT);
}
NOKPROBE_SYMBOL(schedule_rtlock);
#endif

static void __sched notrace preempt_schedule_common(void)
{
	do {
		/*
		 * Because the function tracer can trace preempt_count_sub()
		 * and it also uses preempt_enable/disable_notrace(), if
		 * NEED_RESCHED is set, the preempt_enable_notrace() called
		 * by the function tracer will call this function again and
		 * cause infinite recursion.
		 *
		 * Preemption must be disabled here before the function
		 * tracer can trace. Break up preempt_disable() into two
		 * calls. One to disable preemption without fear of being
		 * traced.
The other to still record the pree * which can also be traced by the function t * which can also be traced by the function t */ */ preempt_disable_notrace(); preempt_disable_notrace(); preempt_latency_start(1); preempt_latency_start(1); __schedule(SM_PREEMPT); __schedule(SM_PREEMPT); preempt_latency_stop(1); preempt_latency_stop(1); preempt_enable_no_resched_notrace(); preempt_enable_no_resched_notrace(); /* /* * Check again in case we missed a preemption * Check again in case we missed a preemption * between schedule and now. * between schedule and now. */ */ } while (need_resched()); } while (need_resched()); } } #ifdef CONFIG_PREEMPTION #ifdef CONFIG_PREEMPTION /* /* * This is the entry point to schedule() from in-kernel preem * This is the entry point to schedule() from in-kernel preem * off of preempt_enable. * off of preempt_enable. */ */ asmlinkage __visible void __sched notrace preempt_schedule(vo asmlinkage __visible void __sched notrace preempt_schedule(vo { { /* /* * If there is a non-zero preempt_count or interrupts * If there is a non-zero preempt_count or interrupts * we do not want to preempt the current task. Just r * we do not want to preempt the current task. Just r */ */ if (likely(!preemptible())) if (likely(!preemptible())) return; return; preempt_schedule_common(); preempt_schedule_common(); } } NOKPROBE_SYMBOL(preempt_schedule); NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); #ifdef CONFIG_PREEMPT_DYNAMIC #ifdef CONFIG_PREEMPT_DYNAMIC #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) #ifndef preempt_schedule_dynamic_enabled #ifndef preempt_schedule_dynamic_enabled #define preempt_schedule_dynamic_enabled preempt_sched #define preempt_schedule_dynamic_enabled preempt_sched #define preempt_schedule_dynamic_disabled NULL #define preempt_schedule_dynamic_disabled NULL #endif #endif DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic EXPORT_STATIC_CALL_TRAMP(preempt_schedule); EXPORT_STATIC_CALL_TRAMP(preempt_schedule); #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule); static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule); void __sched notrace dynamic_preempt_schedule(void) void __sched notrace dynamic_preempt_schedule(void) { { if (!static_branch_unlikely(&sk_dynamic_preempt_sched if (!static_branch_unlikely(&sk_dynamic_preempt_sched return; return; preempt_schedule(); preempt_schedule(); } } NOKPROBE_SYMBOL(dynamic_preempt_schedule); NOKPROBE_SYMBOL(dynamic_preempt_schedule); EXPORT_SYMBOL(dynamic_preempt_schedule); EXPORT_SYMBOL(dynamic_preempt_schedule); #endif #endif #endif #endif /** /** * preempt_schedule_notrace - preempt_schedule called by trac * preempt_schedule_notrace - preempt_schedule called by trac * * * The tracing infrastructure uses preempt_enable_notrace to * The tracing infrastructure uses preempt_enable_notrace to * recursion and tracing preempt enabling caused by the traci * recursion and tracing preempt enabling caused by the traci * infrastructure itself. But as tracing can happen in areas * infrastructure itself. But as tracing can happen in areas * from userspace or just about to enter userspace, a preempt * from userspace or just about to enter userspace, a preempt * can occur before user_exit() is called. This will cause th * can occur before user_exit() is called. 
This will cause th * to be called when the system is still in usermode. * to be called when the system is still in usermode. * * * To prevent this, the preempt_enable_notrace will use this * To prevent this, the preempt_enable_notrace will use this * instead of preempt_schedule() to exit user context if need * instead of preempt_schedule() to exit user context if need * calling the scheduler. * calling the scheduler. */ */ asmlinkage __visible void __sched notrace preempt_schedule_no asmlinkage __visible void __sched notrace preempt_schedule_no { { enum ctx_state prev_ctx; enum ctx_state prev_ctx; if (likely(!preemptible())) if (likely(!preemptible())) return; return; do { do { /* /* * Because the function tracer can trace pree * Because the function tracer can trace pree * and it also uses preempt_enable/disable_no * and it also uses preempt_enable/disable_no * NEED_RESCHED is set, the preempt_enable_no * NEED_RESCHED is set, the preempt_enable_no * by the function tracer will call this func * by the function tracer will call this func * cause infinite recursion. * cause infinite recursion. * * * Preemption must be disabled here before th * Preemption must be disabled here before th * tracer can trace. Break up preempt_disable * tracer can trace. Break up preempt_disable * calls. One to disable preemption without f * calls. One to disable preemption without f * traced. The other to still record the pree * traced. The other to still record the pree * which can also be traced by the function t * which can also be traced by the function t */ */ preempt_disable_notrace(); preempt_disable_notrace(); preempt_latency_start(1); preempt_latency_start(1); /* /* * Needs preempt disabled in case user_exit() * Needs preempt disabled in case user_exit() * and the tracer calls preempt_enable_notrac * and the tracer calls preempt_enable_notrac * an infinite recursion. * an infinite recursion. 
*/ */ prev_ctx = exception_enter(); prev_ctx = exception_enter(); __schedule(SM_PREEMPT); __schedule(SM_PREEMPT); exception_exit(prev_ctx); exception_exit(prev_ctx); preempt_latency_stop(1); preempt_latency_stop(1); preempt_enable_no_resched_notrace(); preempt_enable_no_resched_notrace(); } while (need_resched()); } while (need_resched()); } } EXPORT_SYMBOL_GPL(preempt_schedule_notrace); EXPORT_SYMBOL_GPL(preempt_schedule_notrace); #ifdef CONFIG_PREEMPT_DYNAMIC #ifdef CONFIG_PREEMPT_DYNAMIC #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) #ifndef preempt_schedule_notrace_dynamic_enabled #ifndef preempt_schedule_notrace_dynamic_enabled #define preempt_schedule_notrace_dynamic_enabled preem #define preempt_schedule_notrace_dynamic_enabled preem #define preempt_schedule_notrace_dynamic_disabled NULL #define preempt_schedule_notrace_dynamic_disabled NULL #endif #endif DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace); EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace); #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_not static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_not void __sched notrace dynamic_preempt_schedule_notrace(void) void __sched notrace dynamic_preempt_schedule_notrace(void) { { if (!static_branch_unlikely(&sk_dynamic_preempt_sched if (!static_branch_unlikely(&sk_dynamic_preempt_sched return; return; preempt_schedule_notrace(); preempt_schedule_notrace(); } } NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace); NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace); EXPORT_SYMBOL(dynamic_preempt_schedule_notrace); EXPORT_SYMBOL(dynamic_preempt_schedule_notrace); #endif #endif #endif #endif #endif /* CONFIG_PREEMPTION */ #endif /* CONFIG_PREEMPTION */ /* /* * This is the entry point to schedule() from kernel preempti * This is the entry point to schedule() from kernel preempti * off of irq context. * off of irq context. * Note, that this is called and return with irqs disabled. T * Note, that this is called and return with irqs disabled. T * protect us against recursive calling from irq. * protect us against recursive calling from irq. 
 */
asmlinkage __visible void __sched preempt_schedule_irq(void)
{
	enum ctx_state prev_state;

	/* Catch callers which need to be fixed */
	BUG_ON(preempt_count() || !irqs_disabled());

	prev_state = exception_enter();

	do {
		preempt_disable();
		local_irq_enable();
		__schedule(SM_PREEMPT);
		local_irq_disable();
		sched_preempt_enable_no_resched();
	} while (need_resched());

	exception_exit(prev_state);
}

int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
			  void *key)
{
	WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC | WF_CURRENT_CPU));
	return try_to_wake_up(curr->private, mode, wake_flags);
}
EXPORT_SYMBOL(default_wake_function);

static void __setscheduler_prio(struct task_struct *p, int prio)
{
	if (dl_prio(prio))
		p->sched_class = &dl_sched_class;
	else if (rt_prio(prio))
		p->sched_class = &rt_sched_class;
	else
		p->sched_class = &fair_sched_class;

	p->prio = prio;
}

#ifdef CONFIG_RT_MUTEXES

/*
 * Would be more useful with typeof()/auto_type but they don't mix with
 * bit-fields. Since it's a local thing, use int. Keep the generic sounding
 * name such that if someone were to implement this function we get to
 * compare notes.
 */
#define fetch_and_set(x, v) ({ int _x = (x); (x) = (v); _x; })

void rt_mutex_pre_schedule(void)
{
	lockdep_assert(!fetch_and_set(current->sched_rt_mutex, 1));
	sched_submit_work(current);
}

void rt_mutex_schedule(void)
{
	lockdep_assert(current->sched_rt_mutex);
	__schedule_loop(SM_NONE);
}

void rt_mutex_post_schedule(void)
{
	sched_update_worker(current);
	lockdep_assert(fetch_and_set(current->sched_rt_mutex, 0));
}

static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
{
	if (pi_task)
		prio = min(prio, pi_task->prio);

	return prio;
}

static inline int rt_effective_prio(struct task_struct *p, int prio)
{
	struct task_struct *pi_task = rt_mutex_get_top_task(p);

	return __rt_effective_prio(pi_task, prio);
}
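/*
 * Illustrative sketch, not part of core.c: with the kernel's "lower number ==
 * higher priority" convention, __rt_effective_prio() simply clamps the lock
 * owner's priority to that of the highest-priority donor. For instance, a CFS
 * task at normal_prio 120 that blocks a SCHED_FIFO waiter running at prio 20
 * ends up with an effective prio of 20. The pi_boost_demo() name is made up
 * for this example.
 */
#if 0	/* example only */
static void pi_boost_demo(struct task_struct *owner, struct task_struct *waiter)
{
	/* Assume owner->normal_prio == 120 (CFS) and waiter->prio == 20 (FIFO). */
	int eff = __rt_effective_prio(waiter, owner->normal_prio);

	/* min(120, 20): the owner is boosted to the donor's priority. */
	WARN_ON(eff != 20);
}
#endif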
/*
 * rt_mutex_setprio - set the current priority of a task
 * @p: task to boost
 * @pi_task: donor task
 *
 * This function changes the 'effective' priority of a task. It does
 * not touch ->normal_prio like __setscheduler().
 *
 * Used by the rt_mutex code to implement priority inheritance
 * logic. Call site only calls if the priority of the task changed.
 */
void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
{
	int prio, oldprio, queued, running, queue_flag =
		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
	const struct sched_class *prev_class;
	struct rq_flags rf;
	struct rq *rq;

	/* XXX used to be waiter->prio, not waiter->task->prio */
	prio = __rt_effective_prio(pi_task, p->normal_prio);

	/*
	 * If nothing changed; bail early.
	 */
	if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
		return;

	rq = __task_rq_lock(p, &rf);
	update_rq_clock(rq);
	/*
	 * Set under pi_lock && rq->lock, such that the value can be used under
	 * either lock.
	 *
	 * Note that there is loads of tricky to make this pointer cache work
	 * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
	 * ensure a task is de-boosted (pi_task is set to NULL) before the
	 * task is allowed to run again (and can exit). This ensures the pointer
	 * points to a blocked task -- which guarantees the task is present.
	 */
	p->pi_top_task = pi_task;

	/*
	 * For FIFO/RR we only need to set prio, if that matches we're done.
	 */
	if (prio == p->prio && !dl_prio(prio))
		goto out_unlock;

	/*
	 * Idle task boosting is a no-no in general. There is one
	 * exception, when PREEMPT_RT and NOHZ is active:
	 *
	 * The idle task calls get_next_timer_interrupt() and holds
	 * the timer wheel base->lock on the CPU and another CPU wants
	 * to access the timer (probably to cancel it). We can safely
	 * ignore the boosting request, as the idle CPU runs this code
	 * with interrupts disabled and will complete the lock
	 * protected section without being interrupted. So there is no
	 * real need to boost.
*/ */ if (unlikely(p == rq->idle)) { if (unlikely(p == rq->idle)) { WARN_ON(p != rq->curr); WARN_ON(p != rq->curr); WARN_ON(p->pi_blocked_on); WARN_ON(p->pi_blocked_on); goto out_unlock; goto out_unlock; } } trace_sched_pi_setprio(p, pi_task); trace_sched_pi_setprio(p, pi_task); oldprio = p->prio; oldprio = p->prio; if (oldprio == prio) if (oldprio == prio) queue_flag &= ~DEQUEUE_MOVE; queue_flag &= ~DEQUEUE_MOVE; prev_class = p->sched_class; prev_class = p->sched_class; queued = task_on_rq_queued(p); queued = task_on_rq_queued(p); running = task_current(rq, p); running = task_current(rq, p); if (queued) if (queued) dequeue_task(rq, p, queue_flag); dequeue_task(rq, p, queue_flag); if (running) if (running) put_prev_task(rq, p); put_prev_task(rq, p); /* /* * Boosting condition are: * Boosting condition are: * 1. -rt task is running and holds mutex A * 1. -rt task is running and holds mutex A * --> -dl task blocks on mutex A * --> -dl task blocks on mutex A * * * 2. -dl task is running and holds mutex A * 2. -dl task is running and holds mutex A * --> -dl task blocks on mutex A and could pree * --> -dl task blocks on mutex A and could pree * running task * running task */ */ if (dl_prio(prio)) { if (dl_prio(prio)) { if (!dl_prio(p->normal_prio) || if (!dl_prio(p->normal_prio) || (pi_task && dl_prio(pi_task->prio) && (pi_task && dl_prio(pi_task->prio) && dl_entity_preempt(&pi_task->dl, &p->dl)) dl_entity_preempt(&pi_task->dl, &p->dl)) p->dl.pi_se = pi_task->dl.pi_se; p->dl.pi_se = pi_task->dl.pi_se; queue_flag |= ENQUEUE_REPLENISH; queue_flag |= ENQUEUE_REPLENISH; } else { } else { p->dl.pi_se = &p->dl; p->dl.pi_se = &p->dl; } } } else if (rt_prio(prio)) { } else if (rt_prio(prio)) { if (dl_prio(oldprio)) if (dl_prio(oldprio)) p->dl.pi_se = &p->dl; p->dl.pi_se = &p->dl; if (oldprio < prio) if (oldprio < prio) queue_flag |= ENQUEUE_HEAD; queue_flag |= ENQUEUE_HEAD; } else { } else { if (dl_prio(oldprio)) if (dl_prio(oldprio)) p->dl.pi_se = &p->dl; p->dl.pi_se = &p->dl; if (rt_prio(oldprio)) if (rt_prio(oldprio)) p->rt.timeout = 0; p->rt.timeout = 0; } } __setscheduler_prio(p, prio); __setscheduler_prio(p, prio); if (queued) if (queued) enqueue_task(rq, p, queue_flag); enqueue_task(rq, p, queue_flag); if (running) if (running) set_next_task(rq, p); set_next_task(rq, p); check_class_changed(rq, p, prev_class, oldprio); check_class_changed(rq, p, prev_class, oldprio); out_unlock: out_unlock: /* Avoid rq from going away on us: */ /* Avoid rq from going away on us: */ preempt_disable(); preempt_disable(); rq_unpin_lock(rq, &rf); rq_unpin_lock(rq, &rf); __balance_callbacks(rq); __balance_callbacks(rq); raw_spin_rq_unlock(rq); raw_spin_rq_unlock(rq); preempt_enable(); preempt_enable(); } } #else #else static inline int rt_effective_prio(struct task_struct *p, in static inline int rt_effective_prio(struct task_struct *p, in { { return prio; return prio; } } #endif #endif void set_user_nice(struct task_struct *p, long nice) void set_user_nice(struct task_struct *p, long nice) { { bool queued, running; bool queued, running; int old_prio; int old_prio; struct rq_flags rf; struct rq_flags rf; struct rq *rq; struct rq *rq; if (task_nice(p) == nice || nice < MIN_NICE || nice > if (task_nice(p) == nice || nice < MIN_NICE || nice > return; return; /* /* * We have to be careful, if called from sys_setprior * We have to be careful, if called from sys_setprior * the task might be in the middle of scheduling on a * the task might be in the middle of scheduling on a */ */ rq = task_rq_lock(p, &rf); rq = 
task_rq_lock(p, &rf); update_rq_clock(rq); update_rq_clock(rq); /* /* * The RT priorities are set via sched_setscheduler() * The RT priorities are set via sched_setscheduler() * allow the 'normal' nice value to be set - but as e * allow the 'normal' nice value to be set - but as e * it won't have any effect on scheduling until the t * it won't have any effect on scheduling until the t * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: */ */ if (task_has_dl_policy(p) || task_has_rt_policy(p)) { if (task_has_dl_policy(p) || task_has_rt_policy(p)) { p->static_prio = NICE_TO_PRIO(nice); p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; goto out_unlock; } } queued = task_on_rq_queued(p); queued = task_on_rq_queued(p); running = task_current(rq, p); running = task_current(rq, p); if (queued) if (queued) dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NO dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NO if (running) if (running) put_prev_task(rq, p); put_prev_task(rq, p); p->static_prio = NICE_TO_PRIO(nice); p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p, true); set_load_weight(p, true); old_prio = p->prio; old_prio = p->prio; p->prio = effective_prio(p); p->prio = effective_prio(p); if (queued) if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE if (running) if (running) set_next_task(rq, p); set_next_task(rq, p); /* /* * If the task increased its priority or is running a * If the task increased its priority or is running a * lowered its priority, then reschedule its CPU: * lowered its priority, then reschedule its CPU: */ */ p->sched_class->prio_changed(rq, p, old_prio); p->sched_class->prio_changed(rq, p, old_prio); out_unlock: out_unlock: task_rq_unlock(rq, p, &rf); task_rq_unlock(rq, p, &rf); } } EXPORT_SYMBOL(set_user_nice); EXPORT_SYMBOL(set_user_nice); /* /* * is_nice_reduction - check if nice value is an actual reduc * is_nice_reduction - check if nice value is an actual reduc * * * Similar to can_nice() but does not perform a capability ch * Similar to can_nice() but does not perform a capability ch * * * @p: task * @p: task * @nice: nice value * @nice: nice value */ */ static bool is_nice_reduction(const struct task_struct *p, co static bool is_nice_reduction(const struct task_struct *p, co { { /* Convert nice value [19,-20] to rlimit style value /* Convert nice value [19,-20] to rlimit style value int nice_rlim = nice_to_rlimit(nice); int nice_rlim = nice_to_rlimit(nice); return (nice_rlim <= task_rlimit(p, RLIMIT_NICE)); return (nice_rlim <= task_rlimit(p, RLIMIT_NICE)); } } /* /* * can_nice - check if a task can reduce its nice value * can_nice - check if a task can reduce its nice value * @p: task * @p: task * @nice: nice value * @nice: nice value */ */ int can_nice(const struct task_struct *p, const int nice) int can_nice(const struct task_struct *p, const int nice) { { return is_nice_reduction(p, nice) || capable(CAP_SYS_ return is_nice_reduction(p, nice) || capable(CAP_SYS_ } } #ifdef __ARCH_WANT_SYS_NICE #ifdef __ARCH_WANT_SYS_NICE /* /* * sys_nice - change the priority of the current process. * sys_nice - change the priority of the current process. * @increment: priority increment * @increment: priority increment * * * sys_setpriority is a more generic, but much slower functio * sys_setpriority is a more generic, but much slower functio * does similar things. * does similar things. 
*/ */ SYSCALL_DEFINE1(nice, int, increment) SYSCALL_DEFINE1(nice, int, increment) { { long nice, retval; long nice, retval; /* /* * Setpriority might change our priority at the same * Setpriority might change our priority at the same * We don't have to worry. Conceptually one call occu * We don't have to worry. Conceptually one call occu * and we have a single winner. * and we have a single winner. */ */ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH) increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH) nice = task_nice(current) + increment; nice = task_nice(current) + increment; nice = clamp_val(nice, MIN_NICE, MAX_NICE); nice = clamp_val(nice, MIN_NICE, MAX_NICE); if (increment < 0 && !can_nice(current, nice)) if (increment < 0 && !can_nice(current, nice)) return -EPERM; return -EPERM; retval = security_task_setnice(current, nice); retval = security_task_setnice(current, nice); if (retval) if (retval) return retval; return retval; set_user_nice(current, nice); set_user_nice(current, nice); return 0; return 0; } } #endif #endif /** /** * task_prio - return the priority value of a given task. * task_prio - return the priority value of a given task. * @p: the task in question. * @p: the task in question. * * * Return: The priority value as seen by users in /proc. * Return: The priority value as seen by users in /proc. * * * sched policy return value kernel prio user pr * sched policy return value kernel prio user pr * * * normal, batch, idle [0 ... 39] [100 ... 139] * normal, batch, idle [0 ... 39] [100 ... 139] * fifo, rr [-2 ... -100] [98 ... 0] [1 ... * fifo, rr [-2 ... -100] [98 ... 0] [1 ... * deadline -101 -1 * deadline -101 -1 */ */ int task_prio(const struct task_struct *p) int task_prio(const struct task_struct *p) { { return p->prio - MAX_RT_PRIO; return p->prio - MAX_RT_PRIO; } } /** /** * idle_cpu - is a given CPU idle currently? * idle_cpu - is a given CPU idle currently? * @cpu: the processor in question. * @cpu: the processor in question. * * * Return: 1 if the CPU is currently idle. 0 otherwise. * Return: 1 if the CPU is currently idle. 0 otherwise. */ */ int idle_cpu(int cpu) int idle_cpu(int cpu) { { struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu); if (rq->curr != rq->idle) if (rq->curr != rq->idle) return 0; return 0; if (rq->nr_running) if (rq->nr_running) return 0; return 0; #ifdef CONFIG_SMP #ifdef CONFIG_SMP if (rq->ttwu_pending) if (rq->ttwu_pending) return 0; return 0; #endif #endif return 1; return 1; } } /** /** * available_idle_cpu - is a given CPU idle for enqueuing wor * available_idle_cpu - is a given CPU idle for enqueuing wor * @cpu: the CPU in question. * @cpu: the CPU in question. * * * Return: 1 if the CPU is currently idle. 0 otherwise. * Return: 1 if the CPU is currently idle. 0 otherwise. */ */ int available_idle_cpu(int cpu) int available_idle_cpu(int cpu) { { if (!idle_cpu(cpu)) if (!idle_cpu(cpu)) return 0; return 0; if (vcpu_is_preempted(cpu)) if (vcpu_is_preempted(cpu)) return 0; return 0; return 1; return 1; } } /** /** * idle_task - return the idle task for a given CPU. * idle_task - return the idle task for a given CPU. * @cpu: the processor in question. * @cpu: the processor in question. * * * Return: The idle task for the CPU @cpu. * Return: The idle task for the CPU @cpu. 
*/ */ struct task_struct *idle_task(int cpu) struct task_struct *idle_task(int cpu) { { return cpu_rq(cpu)->idle; return cpu_rq(cpu)->idle; } } #ifdef CONFIG_SCHED_CORE #ifdef CONFIG_SCHED_CORE int sched_core_idle_cpu(int cpu) int sched_core_idle_cpu(int cpu) { { struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu); if (sched_core_enabled(rq) && rq->curr == rq->idle) if (sched_core_enabled(rq) && rq->curr == rq->idle) return 1; return 1; return idle_cpu(cpu); return idle_cpu(cpu); } } #endif #endif #ifdef CONFIG_SMP #ifdef CONFIG_SMP /* /* * This function computes an effective utilization for the gi * This function computes an effective utilization for the gi * used for frequency selection given the linear relation: f * used for frequency selection given the linear relation: f * * * The scheduler tracks the following metrics: * The scheduler tracks the following metrics: * * * cpu_util_{cfs,rt,dl,irq}() * cpu_util_{cfs,rt,dl,irq}() * cpu_bw_dl() * cpu_bw_dl() * * * Where the cfs,rt and dl util numbers are tracked with the * Where the cfs,rt and dl util numbers are tracked with the * synchronized windows and are thus directly comparable. * synchronized windows and are thus directly comparable. * * * The cfs,rt,dl utilization are the running times measured w * The cfs,rt,dl utilization are the running times measured w * which excludes things like IRQ and steal-time. These latte * which excludes things like IRQ and steal-time. These latte * in the irq utilization. * in the irq utilization. * * * The DL bandwidth number otoh is not a measured metric but * The DL bandwidth number otoh is not a measured metric but * based on the task model parameters and gives the minimal u * based on the task model parameters and gives the minimal u * required to meet deadlines. * required to meet deadlines. */ */ unsigned long effective_cpu_util(int cpu, unsigned long util_ unsigned long effective_cpu_util(int cpu, unsigned long util_ enum cpu_util_type type, enum cpu_util_type type, struct task_struct *p) struct task_struct *p) { { unsigned long dl_util, util, irq, max; unsigned long dl_util, util, irq, max; struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu); max = arch_scale_cpu_capacity(cpu); max = arch_scale_cpu_capacity(cpu); if (!uclamp_is_used() && if (!uclamp_is_used() && type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq-> type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq-> return max; return max; } } /* /* * Early check to see if IRQ/steal time saturates the * Early check to see if IRQ/steal time saturates the * because of inaccuracies in how we track these -- s * because of inaccuracies in how we track these -- s * update_irq_load_avg(). * update_irq_load_avg(). */ */ irq = cpu_util_irq(rq); irq = cpu_util_irq(rq); if (unlikely(irq >= max)) if (unlikely(irq >= max)) return max; return max; /* /* * Because the time spend on RT/DL tasks is visible a * Because the time spend on RT/DL tasks is visible a * CFS tasks and we use the same metric to track the * CFS tasks and we use the same metric to track the * utilization (PELT windows are synchronized) we can * utilization (PELT windows are synchronized) we can * to obtain the CPU's actual utilization. * to obtain the CPU's actual utilization. * * * CFS and RT utilization can be boosted or capped, d * CFS and RT utilization can be boosted or capped, d * utilization clamp constraints requested by current * utilization clamp constraints requested by current * tasks. * tasks. 
* When there are no CFS RUNNABLE tasks, clamps are r * When there are no CFS RUNNABLE tasks, clamps are r * frequency will be gracefully reduced with the util * frequency will be gracefully reduced with the util */ */ util = util_cfs + cpu_util_rt(rq); util = util_cfs + cpu_util_rt(rq); if (type == FREQUENCY_UTIL) if (type == FREQUENCY_UTIL) util = uclamp_rq_util_with(rq, util, p); util = uclamp_rq_util_with(rq, util, p); dl_util = cpu_util_dl(rq); dl_util = cpu_util_dl(rq); /* /* * For frequency selection we do not make cpu_util_dl * For frequency selection we do not make cpu_util_dl * of this sum because we want to use cpu_bw_dl() lat * of this sum because we want to use cpu_bw_dl() lat * to check if the CFS+RT+DL sum is saturated (ie. no * to check if the CFS+RT+DL sum is saturated (ie. no * that we select f_max when there is no idle time. * that we select f_max when there is no idle time. * * * NOTE: numerical errors or stop class might cause u * NOTE: numerical errors or stop class might cause u * saturation when we should -- something for later. * saturation when we should -- something for later. */ */ if (util + dl_util >= max) if (util + dl_util >= max) return max; return max; /* /* * OTOH, for energy computation we need the estimated * OTOH, for energy computation we need the estimated * include util_dl and ignore dl_bw. * include util_dl and ignore dl_bw. */ */ if (type == ENERGY_UTIL) if (type == ENERGY_UTIL) util += dl_util; util += dl_util; /* /* * There is still idle time; further improve the numb * There is still idle time; further improve the numb * irq metric. Because IRQ/steal time is hidden from * irq metric. Because IRQ/steal time is hidden from * need to scale the task numbers: * need to scale the task numbers: * * * max - irq * max - irq * U' = irq + --------- * U * U' = irq + --------- * U * max * max */ */ util = scale_irq_capacity(util, irq, max); util = scale_irq_capacity(util, irq, max); util += irq; util += irq; /* /* * Bandwidth required by DEADLINE must always be gran * Bandwidth required by DEADLINE must always be gran * FAIR and RT, we use blocked utilization of IDLE CP * FAIR and RT, we use blocked utilization of IDLE CP * to gracefully reduce the frequency when no tasks s * to gracefully reduce the frequency when no tasks s * periods of time. * periods of time. * * * Ideally we would like to set bw_dl as min/guarante * Ideally we would like to set bw_dl as min/guarante * bw_dl as requested freq. However, cpufreq is not y * bw_dl as requested freq. However, cpufreq is not y * an interface. So, we only do the latter for now. * an interface. So, we only do the latter for now. */ */ if (type == FREQUENCY_UTIL) if (type == FREQUENCY_UTIL) util += cpu_bw_dl(rq); util += cpu_bw_dl(rq); return min(max, util); return min(max, util); } } unsigned long sched_cpu_util(int cpu) unsigned long sched_cpu_util(int cpu) { { return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENE return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENE } } #endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */ /** /** * find_process_by_pid - find a process with a matching PID v * find_process_by_pid - find a process with a matching PID v * @pid: the pid in question. * @pid: the pid in question. * * * The task of @pid, if found. %NULL otherwise. * The task of @pid, if found. %NULL otherwise. */ */ static struct task_struct *find_process_by_pid(pid_t pid) static struct task_struct *find_process_by_pid(pid_t pid) { { return pid ? find_task_by_vpid(pid) : current; return pid ? 
find_task_by_vpid(pid) : current; } } /* /* * sched_setparam() passes in -1 for its policy, to let the f * sched_setparam() passes in -1 for its policy, to let the f * it calls know not to change it. * it calls know not to change it. */ */ #define SETPARAM_POLICY -1 #define SETPARAM_POLICY -1 static void __setscheduler_params(struct task_struct *p, static void __setscheduler_params(struct task_struct *p, const struct sched_attr *attr) const struct sched_attr *attr) { { int policy = attr->sched_policy; int policy = attr->sched_policy; if (policy == SETPARAM_POLICY) if (policy == SETPARAM_POLICY) policy = p->policy; policy = p->policy; p->policy = policy; p->policy = policy; if (dl_policy(policy)) if (dl_policy(policy)) __setparam_dl(p, attr); __setparam_dl(p, attr); else if (fair_policy(policy)) else if (fair_policy(policy)) p->static_prio = NICE_TO_PRIO(attr->sched_nic p->static_prio = NICE_TO_PRIO(attr->sched_nic /* /* * __sched_setscheduler() ensures attr->sched_priorit * __sched_setscheduler() ensures attr->sched_priorit * !rt_policy. Always setting this ensures that thing * !rt_policy. Always setting this ensures that thing * getparam()/getattr() don't report silly values for * getparam()/getattr() don't report silly values for */ */ p->rt_priority = attr->sched_priority; p->rt_priority = attr->sched_priority; p->normal_prio = normal_prio(p); p->normal_prio = normal_prio(p); set_load_weight(p, true); set_load_weight(p, true); } } /* /* * Check the target process has a UID that matches the curren * Check the target process has a UID that matches the curren */ */ static bool check_same_owner(struct task_struct *p) static bool check_same_owner(struct task_struct *p) { { const struct cred *cred = current_cred(), *pcred; const struct cred *cred = current_cred(), *pcred; bool match; bool match; rcu_read_lock(); rcu_read_lock(); pcred = __task_cred(p); pcred = __task_cred(p); match = (uid_eq(cred->euid, pcred->euid) || match = (uid_eq(cred->euid, pcred->euid) || uid_eq(cred->euid, pcred->uid)); uid_eq(cred->euid, pcred->uid)); rcu_read_unlock(); rcu_read_unlock(); return match; return match; } } /* /* * Allow unprivileged RT tasks to decrease priority. * Allow unprivileged RT tasks to decrease priority. 
* Only issue a capable test if needed and only once to avoid * Only issue a capable test if needed and only once to avoid * event on permitted non-privileged operations: * event on permitted non-privileged operations: */ */ static int user_check_sched_setscheduler(struct task_struct * static int user_check_sched_setscheduler(struct task_struct * const struct sched_a const struct sched_a int policy, int rese int policy, int rese { { if (fair_policy(policy)) { if (fair_policy(policy)) { if (attr->sched_nice < task_nice(p) && if (attr->sched_nice < task_nice(p) && !is_nice_reduction(p, attr->sched_nice)) !is_nice_reduction(p, attr->sched_nice)) goto req_priv; goto req_priv; } } if (rt_policy(policy)) { if (rt_policy(policy)) { unsigned long rlim_rtprio = task_rlimit(p, RL unsigned long rlim_rtprio = task_rlimit(p, RL /* Can't set/change the rt policy: */ /* Can't set/change the rt policy: */ if (policy != p->policy && !rlim_rtprio) if (policy != p->policy && !rlim_rtprio) goto req_priv; goto req_priv; /* Can't increase priority: */ /* Can't increase priority: */ if (attr->sched_priority > p->rt_priority && if (attr->sched_priority > p->rt_priority && attr->sched_priority > rlim_rtprio) attr->sched_priority > rlim_rtprio) goto req_priv; goto req_priv; } } /* /* * Can't set/change SCHED_DEADLINE policy at all for * Can't set/change SCHED_DEADLINE policy at all for * (safest behavior); in the future we would like to * (safest behavior); in the future we would like to * unprivileged DL tasks to increase their relative d * unprivileged DL tasks to increase their relative d * or reduce their runtime (both ways reducing utiliz * or reduce their runtime (both ways reducing utiliz */ */ if (dl_policy(policy)) if (dl_policy(policy)) goto req_priv; goto req_priv; /* /* * Treat SCHED_IDLE as nice 20. Only allow a switch t * Treat SCHED_IDLE as nice 20. 
Only allow a switch t * SCHED_NORMAL if the RLIMIT_NICE would normally per * SCHED_NORMAL if the RLIMIT_NICE would normally per */ */ if (task_has_idle_policy(p) && !idle_policy(policy)) if (task_has_idle_policy(p) && !idle_policy(policy)) if (!is_nice_reduction(p, task_nice(p))) if (!is_nice_reduction(p, task_nice(p))) goto req_priv; goto req_priv; } } /* Can't change other user's priorities: */ /* Can't change other user's priorities: */ if (!check_same_owner(p)) if (!check_same_owner(p)) goto req_priv; goto req_priv; /* Normal users shall not reset the sched_reset_on_fo /* Normal users shall not reset the sched_reset_on_fo if (p->sched_reset_on_fork && !reset_on_fork) if (p->sched_reset_on_fork && !reset_on_fork) goto req_priv; goto req_priv; return 0; return 0; req_priv: req_priv: if (!capable(CAP_SYS_NICE)) if (!capable(CAP_SYS_NICE)) return -EPERM; return -EPERM; return 0; return 0; } } static int __sched_setscheduler(struct task_struct *p, static int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr const struct sched_attr *attr bool user, bool pi) bool user, bool pi) { { int oldpolicy = -1, policy = attr->sched_policy; int oldpolicy = -1, policy = attr->sched_policy; int retval, oldprio, newprio, queued, running; int retval, oldprio, newprio, queued, running; const struct sched_class *prev_class; const struct sched_class *prev_class; struct balance_callback *head; struct balance_callback *head; struct rq_flags rf; struct rq_flags rf; int reset_on_fork; int reset_on_fork; int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUE int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUE struct rq *rq; struct rq *rq; bool cpuset_locked = false; bool cpuset_locked = false; /* The pi code expects interrupts enabled */ /* The pi code expects interrupts enabled */ BUG_ON(pi && in_interrupt()); BUG_ON(pi && in_interrupt()); recheck: recheck: /* Double check policy once rq lock held: */ /* Double check policy once rq lock held: */ if (policy < 0) { if (policy < 0) { reset_on_fork = p->sched_reset_on_fork; reset_on_fork = p->sched_reset_on_fork; policy = oldpolicy = p->policy; policy = oldpolicy = p->policy; } else { } else { reset_on_fork = !!(attr->sched_flags & SCHED_ reset_on_fork = !!(attr->sched_flags & SCHED_ if (!valid_policy(policy)) if (!valid_policy(policy)) return -EINVAL; return -EINVAL; } } if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG return -EINVAL; return -EINVAL; /* /* * Valid priorities for SCHED_FIFO and SCHED_RR are * Valid priorities for SCHED_FIFO and SCHED_RR are * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL, * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL, * SCHED_BATCH and SCHED_IDLE is 0. * SCHED_BATCH and SCHED_IDLE is 0. 
*/ */ if (attr->sched_priority > MAX_RT_PRIO-1) if (attr->sched_priority > MAX_RT_PRIO-1) return -EINVAL; return -EINVAL; if ((dl_policy(policy) && !__checkparam_dl(attr)) || if ((dl_policy(policy) && !__checkparam_dl(attr)) || (rt_policy(policy) != (attr->sched_priority != 0) (rt_policy(policy) != (attr->sched_priority != 0) return -EINVAL; return -EINVAL; if (user) { if (user) { retval = user_check_sched_setscheduler(p, att retval = user_check_sched_setscheduler(p, att if (retval) if (retval) return retval; return retval; if (attr->sched_flags & SCHED_FLAG_SUGOV) if (attr->sched_flags & SCHED_FLAG_SUGOV) return -EINVAL; return -EINVAL; retval = security_task_setscheduler(p); retval = security_task_setscheduler(p); if (retval) if (retval) return retval; return retval; } } /* Update task specific "requested" clamps */ /* Update task specific "requested" clamps */ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) { if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) { retval = uclamp_validate(p, attr); retval = uclamp_validate(p, attr); if (retval) if (retval) return retval; return retval; } } /* /* * SCHED_DEADLINE bandwidth accounting relies on stab * SCHED_DEADLINE bandwidth accounting relies on stab * information. * information. */ */ if (dl_policy(policy) || dl_policy(p->policy)) { if (dl_policy(policy) || dl_policy(p->policy)) { cpuset_locked = true; cpuset_locked = true; cpuset_lock(); cpuset_lock(); } } /* /* * Make sure no PI-waiters arrive (or leave) while we * Make sure no PI-waiters arrive (or leave) while we * changing the priority of the task: * changing the priority of the task: * * * To be able to change p->policy safely, the appropr * To be able to change p->policy safely, the appropr * runqueue lock must be held. * runqueue lock must be held. */ */ rq = task_rq_lock(p, &rf); rq = task_rq_lock(p, &rf); update_rq_clock(rq); update_rq_clock(rq); /* /* * Changing the policy of the stop threads its a very * Changing the policy of the stop threads its a very */ */ if (p == rq->stop) { if (p == rq->stop) { retval = -EINVAL; retval = -EINVAL; goto unlock; goto unlock; } } /* /* * If not changing anything there's no need to procee * If not changing anything there's no need to procee * but store a possible modification of reset_on_fork * but store a possible modification of reset_on_fork */ */ if (unlikely(policy == p->policy)) { if (unlikely(policy == p->policy)) { if (fair_policy(policy) && attr->sched_nice ! if (fair_policy(policy) && attr->sched_nice ! goto change; goto change; if (rt_policy(policy) && attr->sched_priority if (rt_policy(policy) && attr->sched_priority goto change; goto change; if (dl_policy(policy) && dl_param_changed(p, if (dl_policy(policy) && dl_param_changed(p, goto change; goto change; if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP goto change; goto change; p->sched_reset_on_fork = reset_on_fork; p->sched_reset_on_fork = reset_on_fork; retval = 0; retval = 0; goto unlock; goto unlock; } } change: change: if (user) { if (user) { #ifdef CONFIG_RT_GROUP_SCHED #ifdef CONFIG_RT_GROUP_SCHED /* /* * Do not allow realtime tasks into groups th * Do not allow realtime tasks into groups th * assigned. * assigned. 
*/ */ if (rt_bandwidth_enabled() && rt_policy(polic if (rt_bandwidth_enabled() && rt_policy(polic task_group(p)->rt_bandwidth.r task_group(p)->rt_bandwidth.r !task_group_is_autogroup(task !task_group_is_autogroup(task retval = -EPERM; retval = -EPERM; goto unlock; goto unlock; } } #endif #endif #ifdef CONFIG_SMP #ifdef CONFIG_SMP if (dl_bandwidth_enabled() && dl_policy(polic if (dl_bandwidth_enabled() && dl_policy(polic !(attr->sched_flags & SCHED_F !(attr->sched_flags & SCHED_F cpumask_t *span = rq->rd->span; cpumask_t *span = rq->rd->span; /* /* * Don't allow tasks with an affinity * Don't allow tasks with an affinity * the entire root_domain to become S * the entire root_domain to become S * will also fail if there's no bandw * will also fail if there's no bandw */ */ if (!cpumask_subset(span, p->cpus_ptr if (!cpumask_subset(span, p->cpus_ptr rq->rd->dl_bw.bw == 0) { rq->rd->dl_bw.bw == 0) { retval = -EPERM; retval = -EPERM; goto unlock; goto unlock; } } } } #endif #endif } } /* Re-check policy now with rq lock held: */ /* Re-check policy now with rq lock held: */ if (unlikely(oldpolicy != -1 && oldpolicy != p->polic if (unlikely(oldpolicy != -1 && oldpolicy != p->polic policy = oldpolicy = -1; policy = oldpolicy = -1; task_rq_unlock(rq, p, &rf); task_rq_unlock(rq, p, &rf); if (cpuset_locked) if (cpuset_locked) cpuset_unlock(); cpuset_unlock(); goto recheck; goto recheck; } } /* /* * If setscheduling to SCHED_DEADLINE (or changing th * If setscheduling to SCHED_DEADLINE (or changing th * of a SCHED_DEADLINE task) we need to check if enou * of a SCHED_DEADLINE task) we need to check if enou * is available. * is available. */ */ if ((dl_policy(policy) || dl_task(p)) && sched_dl_ove if ((dl_policy(policy) || dl_task(p)) && sched_dl_ove retval = -EBUSY; retval = -EBUSY; goto unlock; goto unlock; } } p->sched_reset_on_fork = reset_on_fork; p->sched_reset_on_fork = reset_on_fork; oldprio = p->prio; oldprio = p->prio; newprio = __normal_prio(policy, attr->sched_priority, newprio = __normal_prio(policy, attr->sched_priority, if (pi) { if (pi) { /* /* * Take priority boosted tasks into account. * Take priority boosted tasks into account. * effective priority is unchanged, we just s * effective priority is unchanged, we just s * normal parameters and do not touch the sch * normal parameters and do not touch the sch * the runqueue. This will be done when the t * the runqueue. This will be done when the t * itself. * itself. */ */ newprio = rt_effective_prio(p, newprio); newprio = rt_effective_prio(p, newprio); if (newprio == oldprio) if (newprio == oldprio) queue_flags &= ~DEQUEUE_MOVE; queue_flags &= ~DEQUEUE_MOVE; } } queued = task_on_rq_queued(p); queued = task_on_rq_queued(p); running = task_current(rq, p); running = task_current(rq, p); if (queued) if (queued) dequeue_task(rq, p, queue_flags); dequeue_task(rq, p, queue_flags); if (running) if (running) put_prev_task(rq, p); put_prev_task(rq, p); prev_class = p->sched_class; prev_class = p->sched_class; if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { __setscheduler_params(p, attr); __setscheduler_params(p, attr); __setscheduler_prio(p, newprio); __setscheduler_prio(p, newprio); } } __setscheduler_uclamp(p, attr); __setscheduler_uclamp(p, attr); if (queued) { if (queued) { /* /* * We enqueue to tail when the priority of a * We enqueue to tail when the priority of a * increased (user space view). * increased (user space view). 
*/ */ if (oldprio < p->prio) if (oldprio < p->prio) queue_flags |= ENQUEUE_HEAD; queue_flags |= ENQUEUE_HEAD; enqueue_task(rq, p, queue_flags); enqueue_task(rq, p, queue_flags); } } if (running) if (running) set_next_task(rq, p); set_next_task(rq, p); check_class_changed(rq, p, prev_class, oldprio); check_class_changed(rq, p, prev_class, oldprio); /* Avoid rq from going away on us: */ /* Avoid rq from going away on us: */ preempt_disable(); preempt_disable(); head = splice_balance_callbacks(rq); head = splice_balance_callbacks(rq); task_rq_unlock(rq, p, &rf); task_rq_unlock(rq, p, &rf); if (pi) { if (pi) { if (cpuset_locked) if (cpuset_locked) cpuset_unlock(); cpuset_unlock(); rt_mutex_adjust_pi(p); rt_mutex_adjust_pi(p); } } /* Run balance callbacks after we've adjusted the PI /* Run balance callbacks after we've adjusted the PI balance_callbacks(rq, head); balance_callbacks(rq, head); preempt_enable(); preempt_enable(); return 0; return 0; unlock: unlock: task_rq_unlock(rq, p, &rf); task_rq_unlock(rq, p, &rf); if (cpuset_locked) if (cpuset_locked) cpuset_unlock(); cpuset_unlock(); return retval; return retval; } } static int _sched_setscheduler(struct task_struct *p, int pol static int _sched_setscheduler(struct task_struct *p, int pol const struct sched_param *para const struct sched_param *para { { struct sched_attr attr = { struct sched_attr attr = { .sched_policy = policy, .sched_policy = policy, .sched_priority = param->sched_priority, .sched_priority = param->sched_priority, .sched_nice = PRIO_TO_NICE(p->static_prio .sched_nice = PRIO_TO_NICE(p->static_prio }; }; /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RE if ((policy != SETPARAM_POLICY) && (policy & SCHED_RE attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; policy &= ~SCHED_RESET_ON_FORK; policy &= ~SCHED_RESET_ON_FORK; attr.sched_policy = policy; attr.sched_policy = policy; } } return __sched_setscheduler(p, &attr, check, true); return __sched_setscheduler(p, &attr, check, true); } } /** /** * sched_setscheduler - change the scheduling policy and/or R * sched_setscheduler - change the scheduling policy and/or R * @p: the task in question. * @p: the task in question. * @policy: new policy. * @policy: new policy. * @param: structure containing the new RT priority. * @param: structure containing the new RT priority. * * * Use sched_set_fifo(), read its comment. * Use sched_set_fifo(), read its comment. * * * Return: 0 on success. An error code otherwise. * Return: 0 on success. An error code otherwise. * * * NOTE that the task may be already dead. * NOTE that the task may be already dead. 
 */
int sched_setscheduler(struct task_struct *p, int policy,
		       const struct sched_param *param)
{
	return _sched_setscheduler(p, policy, param, true);
}

int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
{
	return __sched_setscheduler(p, attr, true, true);
}

int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
{
	return __sched_setscheduler(p, attr, false, true);
}
EXPORT_SYMBOL_GPL(sched_setattr_nocheck);

/**
 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority
 *                              of a thread from kernelspace.
 * @p: the task in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Just like sched_setscheduler, only don't bother checking if the
 * current context has permission.  For example, this is needed in
 * stop_machine(): we create temporary high priority worker threads,
 * but our caller might not have that capability.
 *
 * Return: 0 on success. An error code otherwise.
 */
int sched_setscheduler_nocheck(struct task_struct *p, int policy,
			       const struct sched_param *param)
{
	return _sched_setscheduler(p, policy, param, false);
}
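/*
 * Illustrative sketch (not part of this file): how kernel code might use the
 * exported sched_setattr_nocheck() to move one of its own kthreads to
 * SCHED_DEADLINE, skipping the user-facing permission checks. The function
 * name example_make_deadline() and the 10ms/100ms parameters are made up for
 * illustration only.
 *
 *	#include <linux/sched.h>
 *	#include <uapi/linux/sched/types.h>
 *
 *	static int example_make_deadline(struct task_struct *tsk)
 *	{
 *		struct sched_attr attr = {
 *			.sched_policy   = SCHED_DEADLINE,
 *			.sched_runtime  =  10 * NSEC_PER_MSEC,
 *			.sched_deadline = 100 * NSEC_PER_MSEC,
 *			.sched_period   = 100 * NSEC_PER_MSEC,
 *		};
 *
 *		return sched_setattr_nocheck(tsk, &attr);
 *	}
 *
 * Because the "nocheck" variant bypasses user_check_sched_setscheduler() and
 * security_task_setscheduler(), it must only be applied to tasks the kernel
 * itself owns.
 */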
/*
 * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
 * incapable of resource management, which is the one thing an operating
 * system actually should be doing.
 *
 * This is of course the reason it is limited to privileged users only.
 *
 * Worse still; it is fundamentally impossible to compose static priority
 * workloads. You cannot take two correctly working static prio workloads
 * and smash them together and still expect them to work.
 *
 * For this reason 'all' FIFO tasks the kernel creates are basically at:
 *
 *   MAX_RT_PRIO / 2
 *
 * The administrator _MUST_ configure the system, the kernel simply doesn't
 * know enough information to make a sensible choice.
 */
void sched_set_fifo(struct task_struct *p)
{
	struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
	WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
}
EXPORT_SYMBOL_GPL(sched_set_fifo);

/*
 * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
 */
void sched_set_fifo_low(struct task_struct *p)
{
	struct sched_param sp = { .sched_priority = 1 };
	WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
}
EXPORT_SYMBOL_GPL(sched_set_fifo_low);

void sched_set_normal(struct task_struct *p, int nice)
{
	struct sched_attr attr = {
		.sched_policy = SCHED_NORMAL,
		.sched_nice = nice,
	};
	WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
}
EXPORT_SYMBOL_GPL(sched_set_normal);

static int
do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
{
	struct sched_param lparam;
	struct task_struct *p;
	int retval;

	if (!param || pid < 0)
		return -EINVAL;
	if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
		return -EFAULT;

	rcu_read_lock();
	retval = -ESRCH;
	p = find_process_by_pid(pid);
	if (likely(p))
		get_task_struct(p);
	rcu_read_unlock();

	if (likely(p)) {
		retval = sched_setscheduler(p, policy, &lparam);
		put_task_struct(p);
	}

	return retval;
}
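/*
 * Illustrative sketch (not part of this file): the intended pattern for the
 * helpers above is that kernel code creates its worker thread and then picks
 * one of the canned priorities instead of inventing its own. The kthread name
 * and the example_irq_thread_fn() callback are placeholders.
 *
 *	struct task_struct *tsk;
 *
 *	tsk = kthread_create(example_irq_thread_fn, NULL, "example-rt");
 *	if (IS_ERR(tsk))
 *		return PTR_ERR(tsk);
 *	sched_set_fifo(tsk);		// FIFO at MAX_RT_PRIO / 2
 *	wake_up_process(tsk);
 *
 * sched_set_fifo_low() is the variant for "above SCHED_NORMAL but below
 * whatever the administrator configured", and sched_set_normal() undoes
 * either of them.
 */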
/*
 * Mimics kernel/events/core.c perf_copy_attr().
 */
static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
{
	u32 size;
	int ret;

	/* Zero the full structure, so that a short copy will be nice: */
	memset(attr, 0, sizeof(*attr));

	ret = get_user(size, &uattr->size);
	if (ret)
		return ret;

	/* ABI compatibility quirk: */
	if (!size)
		size = SCHED_ATTR_SIZE_VER0;
	if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
		goto err_size;

	ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
	if (ret) {
		if (ret == -E2BIG)
			goto err_size;
		return ret;
	}

	if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
	    size < SCHED_ATTR_SIZE_VER1)
		return -EINVAL;

	/*
	 * XXX: Do we want to be lenient like existing syscalls; or do we want
	 * to be strict and return an error on out-of-bounds values?
	 */
	attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);

	return 0;

err_size:
	put_user(sizeof(*attr), &uattr->size);
	return -E2BIG;
}

static void get_params(struct task_struct *p, struct sched_attr *attr)
{
	if (task_has_dl_policy(p))
		__getparam_dl(p, attr);
	else if (task_has_rt_policy(p))
		attr->sched_priority = p->rt_priority;
	else
		attr->sched_nice = task_nice(p);
}

/**
 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
 * @pid: the pid in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Return: 0 on success. An error code otherwise.
 */
SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
{
	if (policy < 0)
		return -EINVAL;

	return do_sched_setscheduler(pid, policy, param);
}
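/*
 * Illustrative sketch (not part of this file): what the syscall above looks
 * like from userspace through the glibc wrapper. It needs CAP_SYS_NICE or a
 * suitable RLIMIT_RTPRIO; the priority value 10 is arbitrary.
 *
 *	#include <sched.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		struct sched_param sp = { .sched_priority = 10 };
 *
 *		if (sched_setscheduler(0, SCHED_FIFO, &sp))	// pid 0: the caller
 *			perror("sched_setscheduler");
 *		return 0;
 *	}
 *
 * Passing SCHED_FIFO | SCHED_RESET_ON_FORK instead would additionally set
 * p->sched_reset_on_fork (see the legacy hack in _sched_setscheduler()), so
 * children of the task fall back to SCHED_NORMAL.
 */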
/**
 * sys_sched_setparam - set/change the RT priority of a thread
 * @pid: the pid in question.
 * @param: structure containing the new RT priority.
 *
 * Return: 0 on success. An error code otherwise.
 */
SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
{
	return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
}

/**
 * sys_sched_setattr - same as above, but with extended sched_attr
 * @pid: the pid in question.
 * @uattr: structure containing the extended parameters.
 * @flags: for future extension.
 */
SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
			       unsigned int, flags)
{
	struct sched_attr attr;
	struct task_struct *p;
	int retval;

	if (!uattr || pid < 0 || flags)
		return -EINVAL;

	retval = sched_copy_attr(uattr, &attr);
	if (retval)
		return retval;

	if ((int)attr.sched_policy < 0)
		return -EINVAL;
	if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
		attr.sched_policy = SETPARAM_POLICY;

	rcu_read_lock();
	retval = -ESRCH;
	p = find_process_by_pid(pid);
	if (likely(p))
		get_task_struct(p);
	rcu_read_unlock();

	if (likely(p)) {
		if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
			get_params(p, &attr);
		retval = sched_setattr(p, &attr);
		put_task_struct(p);
	}

	return retval;
}

/**
 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
 * @pid: the pid in question.
 *
 * Return: On success, the policy of the thread. Otherwise, a negative error
 * code.
 */
SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
{
	struct task_struct *p;
	int retval;

	if (pid < 0)
		return -EINVAL;

	retval = -ESRCH;
	rcu_read_lock();
	p = find_process_by_pid(pid);
	if (p) {
		retval = security_task_getscheduler(p);
		if (!retval)
			retval = p->policy
				| (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
	}
	rcu_read_unlock();
	return retval;
}

/**
 * sys_sched_getparam - get the RT priority of a thread
 * @pid: the pid in question.
 * @param: structure containing the RT priority.
 *
 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
 * code.
*/ */ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_para SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_para { { struct sched_param lp = { .sched_priority = 0 }; struct sched_param lp = { .sched_priority = 0 }; struct task_struct *p; struct task_struct *p; int retval; int retval; if (!param || pid < 0) if (!param || pid < 0) return -EINVAL; return -EINVAL; rcu_read_lock(); rcu_read_lock(); p = find_process_by_pid(pid); p = find_process_by_pid(pid); retval = -ESRCH; retval = -ESRCH; if (!p) if (!p) goto out_unlock; goto out_unlock; retval = security_task_getscheduler(p); retval = security_task_getscheduler(p); if (retval) if (retval) goto out_unlock; goto out_unlock; if (task_has_rt_policy(p)) if (task_has_rt_policy(p)) lp.sched_priority = p->rt_priority; lp.sched_priority = p->rt_priority; rcu_read_unlock(); rcu_read_unlock(); /* /* * This one might sleep, we cannot do it with a spinl * This one might sleep, we cannot do it with a spinl */ */ retval = copy_to_user(param, &lp, sizeof(*param)) ? - retval = copy_to_user(param, &lp, sizeof(*param)) ? - return retval; return retval; out_unlock: out_unlock: rcu_read_unlock(); rcu_read_unlock(); return retval; return retval; } } /* /* * Copy the kernel size attribute structure (which might be l * Copy the kernel size attribute structure (which might be l * than what user-space knows about) to user-space. * than what user-space knows about) to user-space. * * * Note that all cases are valid: user-space buffer can be la * Note that all cases are valid: user-space buffer can be la * smaller than the kernel-space buffer. The usual case is th * smaller than the kernel-space buffer. The usual case is th * have the same size. * have the same size. */ */ static int static int sched_attr_copy_to_user(struct sched_attr __user *uattr, sched_attr_copy_to_user(struct sched_attr __user *uattr, struct sched_attr *kattr, struct sched_attr *kattr, unsigned int usize) unsigned int usize) { { unsigned int ksize = sizeof(*kattr); unsigned int ksize = sizeof(*kattr); if (!access_ok(uattr, usize)) if (!access_ok(uattr, usize)) return -EFAULT; return -EFAULT; /* /* * sched_getattr() ABI forwards and backwards compati * sched_getattr() ABI forwards and backwards compati * * * If usize == ksize then we just copy everything to * If usize == ksize then we just copy everything to * * * If usize < ksize then we only copy as much as user * If usize < ksize then we only copy as much as user * this keeps ABI compatibility as well. We skip the * this keeps ABI compatibility as well. We skip the * * * If usize > ksize then user-space is using a newer * If usize > ksize then user-space is using a newer * which part the kernel doesn't know about. Just ign * which part the kernel doesn't know about. Just ign * detect the kernel's knowledge of attributes from t * detect the kernel's knowledge of attributes from t * which is set to ksize in this case. * which is set to ksize in this case. */ */ kattr->size = min(usize, ksize); kattr->size = min(usize, ksize); if (copy_to_user(uattr, kattr, kattr->size)) if (copy_to_user(uattr, kattr, kattr->size)) return -EFAULT; return -EFAULT; return 0; return 0; } } /** /** * sys_sched_getattr - similar to sched_getparam, but with sc * sys_sched_getattr - similar to sched_getparam, but with sc * @pid: the pid in question. * @pid: the pid in question. * @uattr: structure containing the extended parameters. * @uattr: structure containing the extended parameters. * @usize: sizeof(attr) for fwd/bwd comp. 
* @usize: sizeof(attr) for fwd/bwd comp. * @flags: for future extension. * @flags: for future extension. */ */ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr unsigned int, usize, unsigned int, flags) unsigned int, usize, unsigned int, flags) { { struct sched_attr kattr = { }; struct sched_attr kattr = { }; struct task_struct *p; struct task_struct *p; int retval; int retval; if (!uattr || pid < 0 || usize > PAGE_SIZE || if (!uattr || pid < 0 || usize > PAGE_SIZE || usize < SCHED_ATTR_SIZE_VER0 || flags) usize < SCHED_ATTR_SIZE_VER0 || flags) return -EINVAL; return -EINVAL; rcu_read_lock(); rcu_read_lock(); p = find_process_by_pid(pid); p = find_process_by_pid(pid); retval = -ESRCH; retval = -ESRCH; if (!p) if (!p) goto out_unlock; goto out_unlock; retval = security_task_getscheduler(p); retval = security_task_getscheduler(p); if (retval) if (retval) goto out_unlock; goto out_unlock; kattr.sched_policy = p->policy; kattr.sched_policy = p->policy; if (p->sched_reset_on_fork) if (p->sched_reset_on_fork) kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK get_params(p, &kattr); get_params(p, &kattr); kattr.sched_flags &= SCHED_FLAG_ALL; kattr.sched_flags &= SCHED_FLAG_ALL; #ifdef CONFIG_UCLAMP_TASK #ifdef CONFIG_UCLAMP_TASK /* /* * This could race with another potential updater, bu * This could race with another potential updater, bu * because it'll correctly read the old or the new va * because it'll correctly read the old or the new va * to guarantee who wins the race as long as it doesn * to guarantee who wins the race as long as it doesn */ */ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].valu kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].valu kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].valu kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].valu #endif #endif rcu_read_unlock(); rcu_read_unlock(); return sched_attr_copy_to_user(uattr, &kattr, usize); return sched_attr_copy_to_user(uattr, &kattr, usize); out_unlock: out_unlock: rcu_read_unlock(); rcu_read_unlock(); return retval; return retval; } } #ifdef CONFIG_SMP #ifdef CONFIG_SMP int dl_task_check_affinity(struct task_struct *p, const struc int dl_task_check_affinity(struct task_struct *p, const struc { { int ret = 0; int ret = 0; /* /* * If the task isn't a deadline task or admission con * If the task isn't a deadline task or admission con * disabled then we don't care about affinity changes * disabled then we don't care about affinity changes */ */ if (!task_has_dl_policy(p) || !dl_bandwidth_enabled() if (!task_has_dl_policy(p) || !dl_bandwidth_enabled() return 0; return 0; /* /* * Since bandwidth control happens on root_domain bas * Since bandwidth control happens on root_domain bas * if admission test is enabled, we only admit -deadl * if admission test is enabled, we only admit -deadl * tasks allowed to run on all the CPUs in the task's * tasks allowed to run on all the CPUs in the task's * root_domain. * root_domain. 
*/ */ rcu_read_lock(); rcu_read_lock(); if (!cpumask_subset(task_rq(p)->rd->span, mask)) if (!cpumask_subset(task_rq(p)->rd->span, mask)) ret = -EBUSY; ret = -EBUSY; rcu_read_unlock(); rcu_read_unlock(); return ret; return ret; } } #endif #endif static int static int __sched_setaffinity(struct task_struct *p, struct affinity_co __sched_setaffinity(struct task_struct *p, struct affinity_co { { int retval; int retval; cpumask_var_t cpus_allowed, new_mask; cpumask_var_t cpus_allowed, new_mask; if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) return -ENOMEM; return -ENOMEM; if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { retval = -ENOMEM; retval = -ENOMEM; goto out_free_cpus_allowed; goto out_free_cpus_allowed; } } cpuset_cpus_allowed(p, cpus_allowed); cpuset_cpus_allowed(p, cpus_allowed); cpumask_and(new_mask, ctx->new_mask, cpus_allowed); cpumask_and(new_mask, ctx->new_mask, cpus_allowed); ctx->new_mask = new_mask; ctx->new_mask = new_mask; ctx->flags |= SCA_CHECK; ctx->flags |= SCA_CHECK; retval = dl_task_check_affinity(p, new_mask); retval = dl_task_check_affinity(p, new_mask); if (retval) if (retval) goto out_free_new_mask; goto out_free_new_mask; retval = __set_cpus_allowed_ptr(p, ctx); retval = __set_cpus_allowed_ptr(p, ctx); if (retval) if (retval) goto out_free_new_mask; goto out_free_new_mask; cpuset_cpus_allowed(p, cpus_allowed); cpuset_cpus_allowed(p, cpus_allowed); if (!cpumask_subset(new_mask, cpus_allowed)) { if (!cpumask_subset(new_mask, cpus_allowed)) { /* /* * We must have raced with a concurrent cpuse * We must have raced with a concurrent cpuse * Just reset the cpumask to the cpuset's cpu * Just reset the cpumask to the cpuset's cpu */ */ cpumask_copy(new_mask, cpus_allowed); cpumask_copy(new_mask, cpus_allowed); /* /* * If SCA_USER is set, a 2nd call to __set_cp * If SCA_USER is set, a 2nd call to __set_cp * will restore the previous user_cpus_ptr va * will restore the previous user_cpus_ptr va * * * In the unlikely event a previous user_cpus * In the unlikely event a previous user_cpus * we need to further restrict the mask to wh * we need to further restrict the mask to wh * by that old user_cpus_ptr. * by that old user_cpus_ptr. 
*/ */ if (unlikely((ctx->flags & SCA_USER) && ctx-> if (unlikely((ctx->flags & SCA_USER) && ctx-> bool empty = !cpumask_and(new_mask, n bool empty = !cpumask_and(new_mask, n ctx->user_m ctx->user_m if (WARN_ON_ONCE(empty)) if (WARN_ON_ONCE(empty)) cpumask_copy(new_mask, cpus_a cpumask_copy(new_mask, cpus_a } } __set_cpus_allowed_ptr(p, ctx); __set_cpus_allowed_ptr(p, ctx); retval = -EINVAL; retval = -EINVAL; } } out_free_new_mask: out_free_new_mask: free_cpumask_var(new_mask); free_cpumask_var(new_mask); out_free_cpus_allowed: out_free_cpus_allowed: free_cpumask_var(cpus_allowed); free_cpumask_var(cpus_allowed); return retval; return retval; } } long sched_setaffinity(pid_t pid, const struct cpumask *in_ma long sched_setaffinity(pid_t pid, const struct cpumask *in_ma { { struct affinity_context ac; struct affinity_context ac; struct cpumask *user_mask; struct cpumask *user_mask; struct task_struct *p; struct task_struct *p; int retval; int retval; rcu_read_lock(); rcu_read_lock(); p = find_process_by_pid(pid); p = find_process_by_pid(pid); if (!p) { if (!p) { rcu_read_unlock(); rcu_read_unlock(); return -ESRCH; return -ESRCH; } } /* Prevent p going away */ /* Prevent p going away */ get_task_struct(p); get_task_struct(p); rcu_read_unlock(); rcu_read_unlock(); if (p->flags & PF_NO_SETAFFINITY) { if (p->flags & PF_NO_SETAFFINITY) { retval = -EINVAL; retval = -EINVAL; goto out_put_task; goto out_put_task; } } if (!check_same_owner(p)) { if (!check_same_owner(p)) { rcu_read_lock(); rcu_read_lock(); if (!ns_capable(__task_cred(p)->user_ns, CAP_ if (!ns_capable(__task_cred(p)->user_ns, CAP_ rcu_read_unlock(); rcu_read_unlock(); retval = -EPERM; retval = -EPERM; goto out_put_task; goto out_put_task; } } rcu_read_unlock(); rcu_read_unlock(); } } retval = security_task_setscheduler(p); retval = security_task_setscheduler(p); if (retval) if (retval) goto out_put_task; goto out_put_task; /* /* * With non-SMP configs, user_cpus_ptr/user_mask isn' * With non-SMP configs, user_cpus_ptr/user_mask isn' * alloc_user_cpus_ptr() returns NULL. * alloc_user_cpus_ptr() returns NULL. */ */ user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE); user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE); if (user_mask) { if (user_mask) { cpumask_copy(user_mask, in_mask); cpumask_copy(user_mask, in_mask); } else if (IS_ENABLED(CONFIG_SMP)) { } else if (IS_ENABLED(CONFIG_SMP)) { retval = -ENOMEM; retval = -ENOMEM; goto out_put_task; goto out_put_task; } } ac = (struct affinity_context){ ac = (struct affinity_context){ .new_mask = in_mask, .new_mask = in_mask, .user_mask = user_mask, .user_mask = user_mask, .flags = SCA_USER, .flags = SCA_USER, }; }; retval = __sched_setaffinity(p, &ac); retval = __sched_setaffinity(p, &ac); kfree(ac.user_mask); kfree(ac.user_mask); out_put_task: out_put_task: put_task_struct(p); put_task_struct(p); return retval; return retval; } } static int get_user_cpu_mask(unsigned long __user *user_mask_ static int get_user_cpu_mask(unsigned long __user *user_mask_ struct cpumask *new_mask) struct cpumask *new_mask) { { if (len < cpumask_size()) if (len < cpumask_size()) cpumask_clear(new_mask); cpumask_clear(new_mask); else if (len > cpumask_size()) else if (len > cpumask_size()) len = cpumask_size(); len = cpumask_size(); return copy_from_user(new_mask, user_mask_ptr, len) ? return copy_from_user(new_mask, user_mask_ptr, len) ? 
}

/**
 * sys_sched_setaffinity - set the CPU affinity of a process
 * @pid: pid of the process
 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
 * @user_mask_ptr: user-space pointer to the new CPU mask
 *
 * Return: 0 on success. An error code otherwise.
 */
SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
		unsigned long __user *, user_mask_ptr)
{
	cpumask_var_t new_mask;
	int retval;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
	if (retval == 0)
		retval = sched_setaffinity(pid, new_mask);
	free_cpumask_var(new_mask);
	return retval;
}

long sched_getaffinity(pid_t pid, struct cpumask *mask)
{
	struct task_struct *p;
	unsigned long flags;
	int retval;

	rcu_read_lock();

	retval = -ESRCH;
	p = find_process_by_pid(pid);
	if (!p)
		goto out_unlock;

	retval = security_task_getscheduler(p);
	if (retval)
		goto out_unlock;

	raw_spin_lock_irqsave(&p->pi_lock, flags);
	cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

out_unlock:
	rcu_read_unlock();

	return retval;
}
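/*
 * Illustrative sketch (not part of this file): pinning the calling thread to
 * CPU 0 from userspace with the glibc wrappers around these syscalls.
 *
 *	#define _GNU_SOURCE
 *	#include <sched.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		cpu_set_t set;
 *
 *		CPU_ZERO(&set);
 *		CPU_SET(0, &set);
 *		if (sched_setaffinity(0, sizeof(set), &set))
 *			perror("sched_setaffinity");
 *
 *		CPU_ZERO(&set);
 *		if (sched_getaffinity(0, sizeof(set), &set) == 0)
 *			printf("CPU0 allowed: %d\n", CPU_ISSET(0, &set));
 *		return 0;
 *	}
 *
 * Note that the glibc wrapper hides the "return value is the copied mask
 * size" detail of the raw sys_sched_getaffinity() below; the wrapper simply
 * returns 0 on success.
 */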
/**
 * sys_sched_getaffinity - get the CPU affinity of a process
 * @pid: pid of the process
 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
 * @user_mask_ptr: user-space pointer to hold the current CPU mask
 *
 * Return: size of CPU mask copied to user_mask_ptr on success. An
 * error code otherwise.
 */
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
		unsigned long __user *, user_mask_ptr)
{
	int ret;
	cpumask_var_t mask;

	if ((len * BITS_PER_BYTE) < nr_cpu_ids)
		return -EINVAL;
	if (len & (sizeof(unsigned long)-1))
		return -EINVAL;

	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	ret = sched_getaffinity(pid, mask);
	if (ret == 0) {
		unsigned int retlen = min(len, cpumask_size());

		if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen))
			ret = -EFAULT;
		else
			ret = retlen;
	}
	free_cpumask_var(mask);

	return ret;
}

static void do_sched_yield(void)
{
	struct rq_flags rf;
	struct rq *rq;

	rq = this_rq_lock_irq(&rf);

	schedstat_inc(rq->yld_count);
	current->sched_class->yield_task(rq);

	preempt_disable();
	rq_unlock_irq(rq, &rf);
	sched_preempt_enable_no_resched();

	schedule();
}

/**
 * sys_sched_yield - yield the current processor to other threads.
 *
 * This function yields the current CPU to other tasks. If there are no
 * other threads running on this CPU then this function will return.
 *
 * Return: 0.
 */
SYSCALL_DEFINE0(sched_yield)
{
	do_sched_yield();
	return 0;
}
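/*
 * Illustrative sketch (not part of this file): the in-kernel counterpart to
 * sched_yield() is not yield() but cond_resched(), used to break up long
 * kernel-side loops on configurations that don't preempt kernel code. The
 * example_scan() helper, struct item and process_item() are made up for
 * illustration.
 *
 *	static void example_scan(struct item *items, unsigned long nr_items)
 *	{
 *		unsigned long i;
 *
 *		for (i = 0; i < nr_items; i++) {
 *			process_item(&items[i]);
 *			cond_resched();		// may end up in __cond_resched()
 *		}
 *	}
 *
 * Under PREEMPT_DYNAMIC with preempt=full, cond_resched() is patched to a
 * RET0 stub (see the table further down), because the kernel already
 * preempts wherever it is safe to do so.
 */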
#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
int __sched __cond_resched(void)
{
	if (should_resched(0)) {
		preempt_schedule_common();
		return 1;
	}
	/*
	 * In preemptible kernels, ->rcu_read_lock_nesting tells the tick
	 * whether the current CPU is in an RCU read-side critical section,
	 * so the tick can report quiescent states even for CPUs looping
	 * in kernel context.  In contrast, in non-preemptible kernels,
	 * RCU readers leave no in-memory hints, which means that CPU-bound
	 * processes executing in kernel context might never report an
	 * RCU quiescent state.  Therefore, the following code causes
	 * cond_resched() to report a quiescent state, but only when RCU
	 * is in urgent need of one.
	 */
#ifndef CONFIG_PREEMPT_RCU
	rcu_all_qs();
#endif
	return 0;
}
EXPORT_SYMBOL(__cond_resched);
#endif

#ifdef CONFIG_PREEMPT_DYNAMIC
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
#define cond_resched_dynamic_enabled	__cond_resched
#define cond_resched_dynamic_disabled	((void *)&__static_call_return0)
DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
EXPORT_STATIC_CALL_TRAMP(cond_resched);

#define might_resched_dynamic_enabled	__cond_resched
#define might_resched_dynamic_disabled	((void *)&__static_call_return0)
DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
EXPORT_STATIC_CALL_TRAMP(might_resched);
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched);
int __sched dynamic_cond_resched(void)
{
	klp_sched_try_switch();
	if (!static_branch_unlikely(&sk_dynamic_cond_resched))
		return 0;
	return __cond_resched();
}
EXPORT_SYMBOL(dynamic_cond_resched);

static DEFINE_STATIC_KEY_FALSE(sk_dynamic_might_resched);
int __sched dynamic_might_resched(void)
{
	if (!static_branch_unlikely(&sk_dynamic_might_resched))
		return 0;
	return __cond_resched();
}
EXPORT_SYMBOL(dynamic_might_resched);
#endif
#endif

/*
 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
 * call schedule, and on return reacquire the lock.
 *
 * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
 * operations here to prevent schedule() from being called twice (once via
 * spin_unlock(), once by hand).
*/ */ int __cond_resched_lock(spinlock_t *lock) int __cond_resched_lock(spinlock_t *lock) { { int resched = should_resched(PREEMPT_LOCK_OFFSET); int resched = should_resched(PREEMPT_LOCK_OFFSET); int ret = 0; int ret = 0; lockdep_assert_held(lock); lockdep_assert_held(lock); if (spin_needbreak(lock) || resched) { if (spin_needbreak(lock) || resched) { spin_unlock(lock); spin_unlock(lock); if (!_cond_resched()) if (!_cond_resched()) cpu_relax(); cpu_relax(); ret = 1; ret = 1; spin_lock(lock); spin_lock(lock); } } return ret; return ret; } } EXPORT_SYMBOL(__cond_resched_lock); EXPORT_SYMBOL(__cond_resched_lock); int __cond_resched_rwlock_read(rwlock_t *lock) int __cond_resched_rwlock_read(rwlock_t *lock) { { int resched = should_resched(PREEMPT_LOCK_OFFSET); int resched = should_resched(PREEMPT_LOCK_OFFSET); int ret = 0; int ret = 0; lockdep_assert_held_read(lock); lockdep_assert_held_read(lock); if (rwlock_needbreak(lock) || resched) { if (rwlock_needbreak(lock) || resched) { read_unlock(lock); read_unlock(lock); if (!_cond_resched()) if (!_cond_resched()) cpu_relax(); cpu_relax(); ret = 1; ret = 1; read_lock(lock); read_lock(lock); } } return ret; return ret; } } EXPORT_SYMBOL(__cond_resched_rwlock_read); EXPORT_SYMBOL(__cond_resched_rwlock_read); int __cond_resched_rwlock_write(rwlock_t *lock) int __cond_resched_rwlock_write(rwlock_t *lock) { { int resched = should_resched(PREEMPT_LOCK_OFFSET); int resched = should_resched(PREEMPT_LOCK_OFFSET); int ret = 0; int ret = 0; lockdep_assert_held_write(lock); lockdep_assert_held_write(lock); if (rwlock_needbreak(lock) || resched) { if (rwlock_needbreak(lock) || resched) { write_unlock(lock); write_unlock(lock); if (!_cond_resched()) if (!_cond_resched()) cpu_relax(); cpu_relax(); ret = 1; ret = 1; write_lock(lock); write_lock(lock); } } return ret; return ret; } } EXPORT_SYMBOL(__cond_resched_rwlock_write); EXPORT_SYMBOL(__cond_resched_rwlock_write); #ifdef CONFIG_PREEMPT_DYNAMIC #ifdef CONFIG_PREEMPT_DYNAMIC #ifdef CONFIG_GENERIC_ENTRY #ifdef CONFIG_GENERIC_ENTRY #include #include #endif #endif /* /* * SC:cond_resched * SC:cond_resched * SC:might_resched * SC:might_resched * SC:preempt_schedule * SC:preempt_schedule * SC:preempt_schedule_notrace * SC:preempt_schedule_notrace * SC:irqentry_exit_cond_resched * SC:irqentry_exit_cond_resched * * * * * NONE: * NONE: * cond_resched <- __cond_resched * cond_resched <- __cond_resched * might_resched <- RET0 * might_resched <- RET0 * preempt_schedule <- NOP * preempt_schedule <- NOP * preempt_schedule_notrace <- NOP * preempt_schedule_notrace <- NOP * irqentry_exit_cond_resched <- NOP * irqentry_exit_cond_resched <- NOP * * * VOLUNTARY: * VOLUNTARY: * cond_resched <- __cond_resched * cond_resched <- __cond_resched * might_resched <- __cond_resched * might_resched <- __cond_resched * preempt_schedule <- NOP * preempt_schedule <- NOP * preempt_schedule_notrace <- NOP * preempt_schedule_notrace <- NOP * irqentry_exit_cond_resched <- NOP * irqentry_exit_cond_resched <- NOP * * * FULL: * FULL: * cond_resched <- RET0 * cond_resched <- RET0 * might_resched <- RET0 * might_resched <- RET0 * preempt_schedule <- preempt_schedule * preempt_schedule <- preempt_schedule * preempt_schedule_notrace <- preempt_schedule_notrace * preempt_schedule_notrace <- preempt_schedule_notrace * irqentry_exit_cond_resched <- irqentry_exit_cond_resched * irqentry_exit_cond_resched <- irqentry_exit_cond_resched */ */ enum { enum { preempt_dynamic_undefined = -1, preempt_dynamic_undefined = -1, preempt_dynamic_none, 
preempt_dynamic_none, preempt_dynamic_voluntary, preempt_dynamic_voluntary, preempt_dynamic_full, preempt_dynamic_full, }; }; int preempt_dynamic_mode = preempt_dynamic_undefined; int preempt_dynamic_mode = preempt_dynamic_undefined; int sched_dynamic_mode(const char *str) int sched_dynamic_mode(const char *str) { { if (!strcmp(str, "none")) if (!strcmp(str, "none")) return preempt_dynamic_none; return preempt_dynamic_none; if (!strcmp(str, "voluntary")) if (!strcmp(str, "voluntary")) return preempt_dynamic_voluntary; return preempt_dynamic_voluntary; if (!strcmp(str, "full")) if (!strcmp(str, "full")) return preempt_dynamic_full; return preempt_dynamic_full; return -EINVAL; return -EINVAL; } } #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) #define preempt_dynamic_enable(f) static_call_update(f, #define preempt_dynamic_enable(f) static_call_update(f, #define preempt_dynamic_disable(f) static_call_update(f, #define preempt_dynamic_disable(f) static_call_update(f, #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) #define preempt_dynamic_enable(f) static_key_enable(&sk #define preempt_dynamic_enable(f) static_key_enable(&sk #define preempt_dynamic_disable(f) static_key_disable(&s #define preempt_dynamic_disable(f) static_key_disable(&s #else #else #error "Unsupported PREEMPT_DYNAMIC mechanism" #error "Unsupported PREEMPT_DYNAMIC mechanism" #endif #endif static DEFINE_MUTEX(sched_dynamic_mutex); static DEFINE_MUTEX(sched_dynamic_mutex); static bool klp_override; static bool klp_override; static void __sched_dynamic_update(int mode) static void __sched_dynamic_update(int mode) { { /* /* * Avoid {NONE,VOLUNTARY} -> FULL transitions from ev * Avoid {NONE,VOLUNTARY} -> FULL transitions from ev * the ZERO state, which is invalid. * the ZERO state, which is invalid. 
*/ */ if (!klp_override) if (!klp_override) preempt_dynamic_enable(cond_resched); preempt_dynamic_enable(cond_resched); preempt_dynamic_enable(might_resched); preempt_dynamic_enable(might_resched); preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notrace); preempt_dynamic_enable(preempt_schedule_notrace); preempt_dynamic_enable(irqentry_exit_cond_resched); preempt_dynamic_enable(irqentry_exit_cond_resched); switch (mode) { switch (mode) { case preempt_dynamic_none: case preempt_dynamic_none: if (!klp_override) if (!klp_override) preempt_dynamic_enable(cond_resched); preempt_dynamic_enable(cond_resched); preempt_dynamic_disable(might_resched); preempt_dynamic_disable(might_resched); preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule_notr preempt_dynamic_disable(preempt_schedule_notr preempt_dynamic_disable(irqentry_exit_cond_re preempt_dynamic_disable(irqentry_exit_cond_re if (mode != preempt_dynamic_mode) if (mode != preempt_dynamic_mode) pr_info("Dynamic Preempt: none\n"); pr_info("Dynamic Preempt: none\n"); break; break; case preempt_dynamic_voluntary: case preempt_dynamic_voluntary: if (!klp_override) if (!klp_override) preempt_dynamic_enable(cond_resched); preempt_dynamic_enable(cond_resched); preempt_dynamic_enable(might_resched); preempt_dynamic_enable(might_resched); preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule_notr preempt_dynamic_disable(preempt_schedule_notr preempt_dynamic_disable(irqentry_exit_cond_re preempt_dynamic_disable(irqentry_exit_cond_re if (mode != preempt_dynamic_mode) if (mode != preempt_dynamic_mode) pr_info("Dynamic Preempt: voluntary\n pr_info("Dynamic Preempt: voluntary\n break; break; case preempt_dynamic_full: case preempt_dynamic_full: if (!klp_override) if (!klp_override) preempt_dynamic_disable(cond_resched) preempt_dynamic_disable(cond_resched) preempt_dynamic_disable(might_resched); preempt_dynamic_disable(might_resched); preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notra preempt_dynamic_enable(preempt_schedule_notra preempt_dynamic_enable(irqentry_exit_cond_res preempt_dynamic_enable(irqentry_exit_cond_res if (mode != preempt_dynamic_mode) if (mode != preempt_dynamic_mode) pr_info("Dynamic Preempt: full\n"); pr_info("Dynamic Preempt: full\n"); break; break; } } preempt_dynamic_mode = mode; preempt_dynamic_mode = mode; } } void sched_dynamic_update(int mode) void sched_dynamic_update(int mode) { { mutex_lock(&sched_dynamic_mutex); mutex_lock(&sched_dynamic_mutex); __sched_dynamic_update(mode); __sched_dynamic_update(mode); mutex_unlock(&sched_dynamic_mutex); mutex_unlock(&sched_dynamic_mutex); } } #ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL #ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL static int klp_cond_resched(void) static int klp_cond_resched(void) { { __klp_sched_try_switch(); __klp_sched_try_switch(); return __cond_resched(); return __cond_resched(); } } void sched_dynamic_klp_enable(void) void sched_dynamic_klp_enable(void) { { mutex_lock(&sched_dynamic_mutex); mutex_lock(&sched_dynamic_mutex); klp_override = true; klp_override = true; static_call_update(cond_resched, klp_cond_resched); static_call_update(cond_resched, klp_cond_resched); mutex_unlock(&sched_dynamic_mutex); mutex_unlock(&sched_dynamic_mutex); } } void sched_dynamic_klp_disable(void) 
void sched_dynamic_klp_disable(void) { { mutex_lock(&sched_dynamic_mutex); mutex_lock(&sched_dynamic_mutex); klp_override = false; klp_override = false; __sched_dynamic_update(preempt_dynamic_mode); __sched_dynamic_update(preempt_dynamic_mode); mutex_unlock(&sched_dynamic_mutex); mutex_unlock(&sched_dynamic_mutex); } } #endif /* CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ #endif /* CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ static int __init setup_preempt_mode(char *str) static int __init setup_preempt_mode(char *str) { { int mode = sched_dynamic_mode(str); int mode = sched_dynamic_mode(str); if (mode < 0) { if (mode < 0) { pr_warn("Dynamic Preempt: unsupported mode: % pr_warn("Dynamic Preempt: unsupported mode: % return 0; return 0; } } sched_dynamic_update(mode); sched_dynamic_update(mode); return 1; return 1; } } __setup("preempt=", setup_preempt_mode); __setup("preempt=", setup_preempt_mode); static void __init preempt_dynamic_init(void) static void __init preempt_dynamic_init(void) { { if (preempt_dynamic_mode == preempt_dynamic_undefined if (preempt_dynamic_mode == preempt_dynamic_undefined if (IS_ENABLED(CONFIG_PREEMPT_NONE)) { if (IS_ENABLED(CONFIG_PREEMPT_NONE)) { sched_dynamic_update(preempt_dynamic_ sched_dynamic_update(preempt_dynamic_ } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTAR } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTAR sched_dynamic_update(preempt_dynamic_ sched_dynamic_update(preempt_dynamic_ } else { } else { /* Default static call setting, nothi /* Default static call setting, nothi WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEM WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEM preempt_dynamic_mode = preempt_dynami preempt_dynamic_mode = preempt_dynami pr_info("Dynamic Preempt: full\n"); pr_info("Dynamic Preempt: full\n"); } } } } } } #define PREEMPT_MODEL_ACCESSOR(mode) \ #define PREEMPT_MODEL_ACCESSOR(mode) \ bool preempt_model_##mode(void) bool preempt_model_##mode(void) { { WARN_ON_ONCE(preempt_dynamic_mode == preempt_ WARN_ON_ONCE(preempt_dynamic_mode == preempt_ return preempt_dynamic_mode == preempt_dynami return preempt_dynamic_mode == preempt_dynami } } EXPORT_SYMBOL_GPL(preempt_model_##mode) EXPORT_SYMBOL_GPL(preempt_model_##mode) PREEMPT_MODEL_ACCESSOR(none); PREEMPT_MODEL_ACCESSOR(none); PREEMPT_MODEL_ACCESSOR(voluntary); PREEMPT_MODEL_ACCESSOR(voluntary); PREEMPT_MODEL_ACCESSOR(full); PREEMPT_MODEL_ACCESSOR(full); #else /* !CONFIG_PREEMPT_DYNAMIC */ #else /* !CONFIG_PREEMPT_DYNAMIC */ static inline void preempt_dynamic_init(void) { } static inline void preempt_dynamic_init(void) { } #endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */ #endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */ /* < * task_is_pi_boosted - Check if task has been PI boosted. < * @p: Task to check. < * < * Return true if task is subject to priority inheritance. < */ < bool task_is_pi_boosted(const struct task_struct *p) < { < int prio = p->prio; < < if (!rt_prio(prio)) < return false; < return prio != p->normal_prio; < } < < /** /** * yield - yield the current processor to other threads. * yield - yield the current processor to other threads. * * * Do not ever use this function, there's a 99% chance you're * Do not ever use this function, there's a 99% chance you're * * * The scheduler is at all times free to pick the calling tas * The scheduler is at all times free to pick the calling tas * eligible task to run, if removing the yield() call from yo * eligible task to run, if removing the yield() call from yo * it, it's already broken. * it, it's already broken. 
* * * Typical broken usage is: * Typical broken usage is: * * * while (!event) * while (!event) * yield(); * yield(); * * * where one assumes that yield() will let 'the other' proces * where one assumes that yield() will let 'the other' proces * make event true. If the current task is a SCHED_FIFO task * make event true. If the current task is a SCHED_FIFO task * happen. Never use yield() as a progress guarantee!! * happen. Never use yield() as a progress guarantee!! * * * If you want to use yield() to wait for something, use wait * If you want to use yield() to wait for something, use wait * If you want to use yield() to be 'nice' for others, use co * If you want to use yield() to be 'nice' for others, use co * If you still want to use yield(), do not! * If you still want to use yield(), do not! */ */ void __sched yield(void) void __sched yield(void) { { set_current_state(TASK_RUNNING); set_current_state(TASK_RUNNING); do_sched_yield(); do_sched_yield(); } } EXPORT_SYMBOL(yield); EXPORT_SYMBOL(yield); /** /** * yield_to - yield the current processor to another thread i * yield_to - yield the current processor to another thread i * your thread group, or accelerate that thread toward the * your thread group, or accelerate that thread toward the * processor it's on. * processor it's on. * @p: target task * @p: target task * @preempt: whether task preemption is allowed or not * @preempt: whether task preemption is allowed or not * * * It's the caller's job to ensure that the target task struc * It's the caller's job to ensure that the target task struc * can't go away on us before we can do any checks. * can't go away on us before we can do any checks. * * * Return: * Return: * true (>0) if we indeed boosted the target task. * true (>0) if we indeed boosted the target task. * false (0) if we failed to boost the target. * false (0) if we failed to boost the target. * -ESRCH if there's no task to yield to. * -ESRCH if there's no task to yield to. 
*/ */ int __sched yield_to(struct task_struct *p, bool preempt) int __sched yield_to(struct task_struct *p, bool preempt) { { struct task_struct *curr = current; struct task_struct *curr = current; struct rq *rq, *p_rq; struct rq *rq, *p_rq; unsigned long flags; unsigned long flags; int yielded = 0; int yielded = 0; local_irq_save(flags); local_irq_save(flags); rq = this_rq(); rq = this_rq(); again: again: p_rq = task_rq(p); p_rq = task_rq(p); /* /* * If we're the only runnable task on the rq and targ * If we're the only runnable task on the rq and targ * has only one task, there's absolutely no point in * has only one task, there's absolutely no point in */ */ if (rq->nr_running == 1 && p_rq->nr_running == 1) { if (rq->nr_running == 1 && p_rq->nr_running == 1) { yielded = -ESRCH; yielded = -ESRCH; goto out_irq; goto out_irq; } } double_rq_lock(rq, p_rq); double_rq_lock(rq, p_rq); if (task_rq(p) != p_rq) { if (task_rq(p) != p_rq) { double_rq_unlock(rq, p_rq); double_rq_unlock(rq, p_rq); goto again; goto again; } } if (!curr->sched_class->yield_to_task) if (!curr->sched_class->yield_to_task) goto out_unlock; goto out_unlock; if (curr->sched_class != p->sched_class) if (curr->sched_class != p->sched_class) goto out_unlock; goto out_unlock; if (task_on_cpu(p_rq, p) || !task_is_running(p)) if (task_on_cpu(p_rq, p) || !task_is_running(p)) goto out_unlock; goto out_unlock; yielded = curr->sched_class->yield_to_task(rq, p); yielded = curr->sched_class->yield_to_task(rq, p); if (yielded) { if (yielded) { schedstat_inc(rq->yld_count); schedstat_inc(rq->yld_count); /* /* * Make p's CPU reschedule; pick_next_entity * Make p's CPU reschedule; pick_next_entity * fairness. * fairness. */ */ if (preempt && rq != p_rq) if (preempt && rq != p_rq) resched_curr(p_rq); resched_curr(p_rq); } } out_unlock: out_unlock: double_rq_unlock(rq, p_rq); double_rq_unlock(rq, p_rq); out_irq: out_irq: local_irq_restore(flags); local_irq_restore(flags); if (yielded > 0) if (yielded > 0) schedule(); schedule(); return yielded; return yielded; } } EXPORT_SYMBOL_GPL(yield_to); EXPORT_SYMBOL_GPL(yield_to); int io_schedule_prepare(void) int io_schedule_prepare(void) { { int old_iowait = current->in_iowait; int old_iowait = current->in_iowait; current->in_iowait = 1; current->in_iowait = 1; blk_flush_plug(current->plug, true); blk_flush_plug(current->plug, true); return old_iowait; return old_iowait; } } void io_schedule_finish(int token) void io_schedule_finish(int token) { { current->in_iowait = token; current->in_iowait = token; } } /* /* * This task is about to go to sleep on IO. Increment rq->nr_ * This task is about to go to sleep on IO. 
 * Increment rq->nr_iowait so that process accounting knows that
 * this is a task in IO wait state.
 */
long __sched io_schedule_timeout(long timeout)
{
	int token;
	long ret;

	token = io_schedule_prepare();
	ret = schedule_timeout(timeout);
	io_schedule_finish(token);

	return ret;
}
EXPORT_SYMBOL(io_schedule_timeout);

void __sched io_schedule(void)
{
	int token;

	token = io_schedule_prepare();
	schedule();
	io_schedule_finish(token);
}
EXPORT_SYMBOL(io_schedule);

/**
 * sys_sched_get_priority_max - return maximum RT priority.
 * @policy: scheduling class.
 *
 * Return: On success, this syscall returns the maximum
 * rt_priority that can be used by a given scheduling class.
 * On failure, a negative error code is returned.
 */
SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
{
	int ret = -EINVAL;

	switch (policy) {
	case SCHED_FIFO:
	case SCHED_RR:
		ret = MAX_RT_PRIO-1;
		break;
	case SCHED_DEADLINE:
	case SCHED_NORMAL:
	case SCHED_BATCH:
	case SCHED_IDLE:
		ret = 0;
		break;
	}
	return ret;
}
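/*
 * Illustrative sketch (not part of this file): userspace is expected to query
 * the valid priority range with this syscall pair instead of hard-coding
 * 1..99 before filling in struct sched_param.
 *
 *	#include <sched.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		printf("SCHED_FIFO:  %d..%d\n",
 *		       sched_get_priority_min(SCHED_FIFO),
 *		       sched_get_priority_max(SCHED_FIFO));
 *		printf("SCHED_OTHER: %d..%d\n",
 *		       sched_get_priority_min(SCHED_OTHER),
 *		       sched_get_priority_max(SCHED_OTHER));
 *		return 0;
 *	}
 *
 * For the RT classes this prints 1..99 (MAX_RT_PRIO-1); for the fair and
 * deadline classes both bounds are 0, since rt_priority is unused there.
 */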
/**
 * sys_sched_get_priority_min - return minimum RT priority.
 * @policy: scheduling class.
 *
 * Return: On success, this syscall returns the minimum
 * rt_priority that can be used by a given scheduling class.
 * On failure, a negative error code is returned.
 */
SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
{
	int ret = -EINVAL;

	switch (policy) {
	case SCHED_FIFO:
	case SCHED_RR:
		ret = 1;
		break;
	case SCHED_DEADLINE:
	case SCHED_NORMAL:
	case SCHED_BATCH:
	case SCHED_IDLE:
		ret = 0;
	}
	return ret;
}

static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
{
	struct task_struct *p;
	unsigned int time_slice;
	struct rq_flags rf;
	struct rq *rq;
	int retval;

	if (pid < 0)
		return -EINVAL;

	retval = -ESRCH;
	rcu_read_lock();
	p = find_process_by_pid(pid);
	if (!p)
		goto out_unlock;

	retval = security_task_getscheduler(p);
	if (retval)
		goto out_unlock;

	rq = task_rq_lock(p, &rf);
	time_slice = 0;
	if (p->sched_class->get_rr_interval)
		time_slice = p->sched_class->get_rr_interval(rq, p);
	task_rq_unlock(rq, p, &rf);

	rcu_read_unlock();
	jiffies_to_timespec64(time_slice, t);
	return 0;

out_unlock:
	rcu_read_unlock();
	return retval;
}
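/*
 * Illustrative userspace sketch, not part of the original file: reading the
 * round-robin timeslice that the syscall below exposes, through the libc
 * sched_rr_get_interval() wrapper. A zero timeslice means the task's policy
 * has no timeslice (e.g. SCHED_FIFO). The helper name is made up.
 */
#if 0
#include <sched.h>
#include <stdio.h>
#include <time.h>

int example_print_rr_interval(pid_t pid)
{
	struct timespec ts;

	if (sched_rr_get_interval(pid, &ts))
		return -1;

	printf("RR timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}
#endif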
/**
 * sys_sched_rr_get_interval - return the default timeslice of a process.
 * @pid: pid of the process.
 * @interval: userspace pointer to the timeslice value.
 *
 * this syscall writes the default timeslice value of a given process
 * into the user-space timespec buffer. A value of '0' means infinity.
 *
 * Return: On success, 0 and the timeslice is in @interval. Otherwise,
 * an error code.
 */
SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
		struct __kernel_timespec __user *, interval)
{
	struct timespec64 t;
	int retval = sched_rr_get_interval(pid, &t);

	if (retval == 0)
		retval = put_timespec64(&t, interval);

	return retval;
}

#ifdef CONFIG_COMPAT_32BIT_TIME
SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
		struct old_timespec32 __user *, interval)
{
	struct timespec64 t;
	int retval = sched_rr_get_interval(pid, &t);

	if (retval == 0)
		retval = put_old_timespec32(&t, interval);
	return retval;
}
#endif

void sched_show_task(struct task_struct *p)
{
	unsigned long free = 0;
	int ppid;

	if (!try_get_task_stack(p))
		return;

	pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));

	if (task_is_running(p))
		pr_cont("  running task    ");
#ifdef CONFIG_DEBUG_STACK_USAGE
	free = stack_not_used(p);
#endif
	ppid = 0;
	rcu_read_lock();
	if (pid_alive(p))
		ppid = task_pid_nr(rcu_dereference(p->real_parent));
	rcu_read_unlock();
	pr_cont(" stack:%-5lu pid:%-5d ppid:%-6d flags:0x%08lx\n",
		free, task_pid_nr(p), ppid,
		read_task_thread_flags(p));

	print_worker_info(KERN_INFO, p);
	print_stop_info(KERN_INFO, p);
	show_stack(p, NULL, KERN_INFO);
	put_task_stack(p);
}
EXPORT_SYMBOL_GPL(sched_show_task);
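/*
 * Illustrative sketch, not part of the original file: a hypothetical debug
 * hook (e.g. in a watchdog-style module) that dumps one specific task with
 * sched_show_task() after some timeout has expired. Reference counting of
 * the task is assumed to be handled by the caller.
 */
#if 0
static void example_report_stuck_task(struct task_struct *p)
{
	pr_warn("task %s/%d appears stuck:\n", p->comm, task_pid_nr(p));
	sched_show_task(p);	/* prints state, stack usage, pid/ppid and backtrace */
}
#endif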
static inline bool
state_filter_match(unsigned long state_filter, struct task_struct *p)
{
	unsigned int state = READ_ONCE(p->__state);

	/* no filter, everything matches */
	if (!state_filter)
		return true;

	/* filter, but doesn't match */
	if (!(state & state_filter))
		return false;

	/*
	 * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
	 * TASK_KILLABLE).
	 */
	if (state_filter == TASK_UNINTERRUPTIBLE && (state & TASK_NOLOAD))
		return false;

	return true;
}

void show_state_filter(unsigned int state_filter)
{
	struct task_struct *g, *p;

	rcu_read_lock();
	for_each_process_thread(g, p) {
		/*
		 * reset the NMI-timeout, listing all files on a slow
		 * console might take a lot of time:
		 * Also, reset softlockup watchdogs on all CPUs, because
		 * another CPU might be blocked waiting for us to process
		 * an IPI.
		 */
		touch_nmi_watchdog();
		touch_all_softlockup_watchdogs();
		if (state_filter_match(state_filter, p))
			sched_show_task(p);
	}

#ifdef CONFIG_SCHED_DEBUG
	if (!state_filter)
		sysrq_sched_debug_show();
#endif

	rcu_read_unlock();
	/*
	 * Only show locks if all tasks are dumped:
	 */
	if (!state_filter)
		debug_show_all_locks();
}

/**
 * init_idle - set up an idle thread for a given CPU
 * @idle: task in question
 * @cpu: CPU the idle task belongs to
 *
 * NOTE: this function does not set the idle thread's NEED_RESCHED
 * flag, to make booting more robust.
 */
void __init init_idle(struct task_struct *idle, int cpu)
{
#ifdef CONFIG_SMP
	struct affinity_context ac = (struct affinity_context) {
		.new_mask  = cpumask_of(cpu),
		.flags     = 0,
	};
#endif
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	__sched_fork(0, idle);

	raw_spin_lock_irqsave(&idle->pi_lock, flags);
	raw_spin_rq_lock(rq);

	idle->__state = TASK_RUNNING;
	idle->se.exec_start = sched_clock();
	/*
	 * PF_KTHREAD should already be set at this point; regardless, make it
	 * look like a proper per-CPU kthread.
*/ */ idle->flags |= PF_KTHREAD | PF_NO_SETAFFINITY; idle->flags |= PF_KTHREAD | PF_NO_SETAFFINITY; kthread_set_per_cpu(idle, cpu); kthread_set_per_cpu(idle, cpu); #ifdef CONFIG_SMP #ifdef CONFIG_SMP /* /* * It's possible that init_idle() gets called multipl * It's possible that init_idle() gets called multipl * in that case do_set_cpus_allowed() will not do the * in that case do_set_cpus_allowed() will not do the * * * And since this is boot we can forgo the serializat * And since this is boot we can forgo the serializat */ */ set_cpus_allowed_common(idle, &ac); set_cpus_allowed_common(idle, &ac); #endif #endif /* /* * We're having a chicken and egg problem, even thoug * We're having a chicken and egg problem, even thoug * holding rq->lock, the CPU isn't yet set to this CP * holding rq->lock, the CPU isn't yet set to this CP * lockdep check in task_group() will fail. * lockdep check in task_group() will fail. * * * Similar case to sched_fork(). / Alternatively we c * Similar case to sched_fork(). / Alternatively we c * use task_rq_lock() here and obtain the other rq->l * use task_rq_lock() here and obtain the other rq->l * * * Silence PROVE_RCU * Silence PROVE_RCU */ */ rcu_read_lock(); rcu_read_lock(); __set_task_cpu(idle, cpu); __set_task_cpu(idle, cpu); rcu_read_unlock(); rcu_read_unlock(); rq->idle = idle; rq->idle = idle; rcu_assign_pointer(rq->curr, idle); rcu_assign_pointer(rq->curr, idle); idle->on_rq = TASK_ON_RQ_QUEUED; idle->on_rq = TASK_ON_RQ_QUEUED; #ifdef CONFIG_SMP #ifdef CONFIG_SMP idle->on_cpu = 1; idle->on_cpu = 1; #endif #endif raw_spin_rq_unlock(rq); raw_spin_rq_unlock(rq); raw_spin_unlock_irqrestore(&idle->pi_lock, flags); raw_spin_unlock_irqrestore(&idle->pi_lock, flags); /* Set the preempt count _outside_ the spinlocks! */ /* Set the preempt count _outside_ the spinlocks! */ init_idle_preempt_count(idle, cpu); init_idle_preempt_count(idle, cpu); /* /* * The idle tasks have their own, simple scheduling c * The idle tasks have their own, simple scheduling c */ */ idle->sched_class = &idle_sched_class; idle->sched_class = &idle_sched_class; ftrace_graph_init_idle_task(idle, cpu); ftrace_graph_init_idle_task(idle, cpu); vtime_init_idle(idle, cpu); vtime_init_idle(idle, cpu); #ifdef CONFIG_SMP #ifdef CONFIG_SMP sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); #endif #endif } } #ifdef CONFIG_SMP #ifdef CONFIG_SMP int cpuset_cpumask_can_shrink(const struct cpumask *cur, int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial) const struct cpumask *trial) { { int ret = 1; int ret = 1; if (cpumask_empty(cur)) if (cpumask_empty(cur)) return ret; return ret; ret = dl_cpuset_cpumask_can_shrink(cur, trial); ret = dl_cpuset_cpumask_can_shrink(cur, trial); return ret; return ret; } } int task_can_attach(struct task_struct *p) int task_can_attach(struct task_struct *p) { { int ret = 0; int ret = 0; /* /* * Kthreads which disallow setaffinity shouldn't be m * Kthreads which disallow setaffinity shouldn't be m * to a new cpuset; we don't want to change their CPU * to a new cpuset; we don't want to change their CPU * affinity and isolating such threads by their set o * affinity and isolating such threads by their set o * allowed nodes is unnecessary. Thus, cpusets are n * allowed nodes is unnecessary. Thus, cpusets are n * applicable for such threads. This prevents checki * applicable for such threads. 
This prevents checki * success of set_cpus_allowed_ptr() on all attached * success of set_cpus_allowed_ptr() on all attached * before cpus_mask may be changed. * before cpus_mask may be changed. */ */ if (p->flags & PF_NO_SETAFFINITY) if (p->flags & PF_NO_SETAFFINITY) ret = -EINVAL; ret = -EINVAL; return ret; return ret; } } bool sched_smp_initialized __read_mostly; bool sched_smp_initialized __read_mostly; #ifdef CONFIG_NUMA_BALANCING #ifdef CONFIG_NUMA_BALANCING /* Migrate current task p to target_cpu */ /* Migrate current task p to target_cpu */ int migrate_task_to(struct task_struct *p, int target_cpu) int migrate_task_to(struct task_struct *p, int target_cpu) { { struct migration_arg arg = { p, target_cpu }; struct migration_arg arg = { p, target_cpu }; int curr_cpu = task_cpu(p); int curr_cpu = task_cpu(p); if (curr_cpu == target_cpu) if (curr_cpu == target_cpu) return 0; return 0; if (!cpumask_test_cpu(target_cpu, p->cpus_ptr)) if (!cpumask_test_cpu(target_cpu, p->cpus_ptr)) return -EINVAL; return -EINVAL; /* TODO: This is not properly updating schedstats */ /* TODO: This is not properly updating schedstats */ trace_sched_move_numa(p, curr_cpu, target_cpu); trace_sched_move_numa(p, curr_cpu, target_cpu); return stop_one_cpu(curr_cpu, migration_cpu_stop, &ar return stop_one_cpu(curr_cpu, migration_cpu_stop, &ar } } /* /* * Requeue a task on a given node and accurately track the nu * Requeue a task on a given node and accurately track the nu * tasks on the runqueues * tasks on the runqueues */ */ void sched_setnuma(struct task_struct *p, int nid) void sched_setnuma(struct task_struct *p, int nid) { { bool queued, running; bool queued, running; struct rq_flags rf; struct rq_flags rf; struct rq *rq; struct rq *rq; rq = task_rq_lock(p, &rf); rq = task_rq_lock(p, &rf); queued = task_on_rq_queued(p); queued = task_on_rq_queued(p); running = task_current(rq, p); running = task_current(rq, p); if (queued) if (queued) dequeue_task(rq, p, DEQUEUE_SAVE); dequeue_task(rq, p, DEQUEUE_SAVE); if (running) if (running) put_prev_task(rq, p); put_prev_task(rq, p); p->numa_preferred_nid = nid; p->numa_preferred_nid = nid; if (queued) if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE if (running) if (running) set_next_task(rq, p); set_next_task(rq, p); task_rq_unlock(rq, p, &rf); task_rq_unlock(rq, p, &rf); } } #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_HOTPLUG_CPU #ifdef CONFIG_HOTPLUG_CPU /* /* * Ensure that the idle task is using init_mm right before it * Ensure that the idle task is using init_mm right before it * offline. * offline. 
*/ */ void idle_task_exit(void) void idle_task_exit(void) { { struct mm_struct *mm = current->active_mm; struct mm_struct *mm = current->active_mm; BUG_ON(cpu_online(smp_processor_id())); BUG_ON(cpu_online(smp_processor_id())); BUG_ON(current != this_rq()->idle); BUG_ON(current != this_rq()->idle); if (mm != &init_mm) { if (mm != &init_mm) { switch_mm(mm, &init_mm, current); switch_mm(mm, &init_mm, current); finish_arch_post_lock_switch(); finish_arch_post_lock_switch(); } } /* finish_cpu(), as ran on the BP, will clean up the /* finish_cpu(), as ran on the BP, will clean up the } } static int __balance_push_cpu_stop(void *arg) static int __balance_push_cpu_stop(void *arg) { { struct task_struct *p = arg; struct task_struct *p = arg; struct rq *rq = this_rq(); struct rq *rq = this_rq(); struct rq_flags rf; struct rq_flags rf; int cpu; int cpu; raw_spin_lock_irq(&p->pi_lock); raw_spin_lock_irq(&p->pi_lock); rq_lock(rq, &rf); rq_lock(rq, &rf); update_rq_clock(rq); update_rq_clock(rq); if (task_rq(p) == rq && task_on_rq_queued(p)) { if (task_rq(p) == rq && task_on_rq_queued(p)) { cpu = select_fallback_rq(rq->cpu, p); cpu = select_fallback_rq(rq->cpu, p); rq = __migrate_task(rq, &rf, p, cpu); rq = __migrate_task(rq, &rf, p, cpu); } } rq_unlock(rq, &rf); rq_unlock(rq, &rf); raw_spin_unlock_irq(&p->pi_lock); raw_spin_unlock_irq(&p->pi_lock); put_task_struct(p); put_task_struct(p); return 0; return 0; } } static DEFINE_PER_CPU(struct cpu_stop_work, push_work); static DEFINE_PER_CPU(struct cpu_stop_work, push_work); /* /* * Ensure we only run per-cpu kthreads once the CPU goes !act * Ensure we only run per-cpu kthreads once the CPU goes !act * * * This is enabled below SCHED_AP_ACTIVE; when !cpu_active(), * This is enabled below SCHED_AP_ACTIVE; when !cpu_active(), * effective when the hotplug motion is down. * effective when the hotplug motion is down. */ */ static void balance_push(struct rq *rq) static void balance_push(struct rq *rq) { { struct task_struct *push_task = rq->curr; struct task_struct *push_task = rq->curr; lockdep_assert_rq_held(rq); lockdep_assert_rq_held(rq); /* /* * Ensure the thing is persistent until balance_push_ * Ensure the thing is persistent until balance_push_ */ */ rq->balance_callback = &balance_push_callback; rq->balance_callback = &balance_push_callback; /* /* * Only active while going offline and when invoked o * Only active while going offline and when invoked o * CPU. * CPU. */ */ if (!cpu_dying(rq->cpu) || rq != this_rq()) if (!cpu_dying(rq->cpu) || rq != this_rq()) return; return; /* /* * Both the cpu-hotplug and stop task are in this cas * Both the cpu-hotplug and stop task are in this cas * required to complete the hotplug process. * required to complete the hotplug process. */ */ if (kthread_is_per_cpu(push_task) || if (kthread_is_per_cpu(push_task) || is_migration_disabled(push_task)) { is_migration_disabled(push_task)) { /* /* * If this is the idle task on the outgoing C * If this is the idle task on the outgoing C * up the hotplug control thread which might * up the hotplug control thread which might * last task to vanish. The rcuwait_active() * last task to vanish. 
The rcuwait_active() * accurate here because the waiter is pinned * accurate here because the waiter is pinned * and can't obviously be running in parallel * and can't obviously be running in parallel * * * On RT kernels this also has to check wheth * On RT kernels this also has to check wheth * pinned and scheduled out tasks on the runq * pinned and scheduled out tasks on the runq * need to leave the migrate disabled section * need to leave the migrate disabled section */ */ if (!rq->nr_running && !rq_has_pinned_tasks(r if (!rq->nr_running && !rq_has_pinned_tasks(r rcuwait_active(&rq->hotplug_wait)) { rcuwait_active(&rq->hotplug_wait)) { raw_spin_rq_unlock(rq); raw_spin_rq_unlock(rq); rcuwait_wake_up(&rq->hotplug_wait); rcuwait_wake_up(&rq->hotplug_wait); raw_spin_rq_lock(rq); raw_spin_rq_lock(rq); } } return; return; } } get_task_struct(push_task); get_task_struct(push_task); /* /* * Temporarily drop rq->lock such that we can wake-up * Temporarily drop rq->lock such that we can wake-up * Both preemption and IRQs are still disabled. * Both preemption and IRQs are still disabled. */ */ preempt_disable(); preempt_disable(); raw_spin_rq_unlock(rq); raw_spin_rq_unlock(rq); stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, this_cpu_ptr(&push_work)); this_cpu_ptr(&push_work)); preempt_enable(); preempt_enable(); /* /* * At this point need_resched() is true and we'll tak * At this point need_resched() is true and we'll tak * schedule(). The next pick is obviously going to be * schedule(). The next pick is obviously going to be * which kthread_is_per_cpu() and will push this task * which kthread_is_per_cpu() and will push this task */ */ raw_spin_rq_lock(rq); raw_spin_rq_lock(rq); } } static void balance_push_set(int cpu, bool on) static void balance_push_set(int cpu, bool on) { { struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu); struct rq_flags rf; struct rq_flags rf; rq_lock_irqsave(rq, &rf); rq_lock_irqsave(rq, &rf); if (on) { if (on) { WARN_ON_ONCE(rq->balance_callback); WARN_ON_ONCE(rq->balance_callback); rq->balance_callback = &balance_push_callback rq->balance_callback = &balance_push_callback } else if (rq->balance_callback == &balance_push_call } else if (rq->balance_callback == &balance_push_call rq->balance_callback = NULL; rq->balance_callback = NULL; } } rq_unlock_irqrestore(rq, &rf); rq_unlock_irqrestore(rq, &rf); } } /* /* * Invoked from a CPUs hotplug control thread after the CPU h * Invoked from a CPUs hotplug control thread after the CPU h * inactive. All tasks which are not per CPU kernel threads a * inactive. All tasks which are not per CPU kernel threads a * pushed off this CPU now via balance_push() or placed on a * pushed off this CPU now via balance_push() or placed on a * during wakeup. Wait until the CPU is quiescent. * during wakeup. Wait until the CPU is quiescent. 
*/ */ static void balance_hotplug_wait(void) static void balance_hotplug_wait(void) { { struct rq *rq = this_rq(); struct rq *rq = this_rq(); rcuwait_wait_event(&rq->hotplug_wait, rcuwait_wait_event(&rq->hotplug_wait, rq->nr_running == 1 && !rq_has_pin rq->nr_running == 1 && !rq_has_pin TASK_UNINTERRUPTIBLE); TASK_UNINTERRUPTIBLE); } } #else #else static inline void balance_push(struct rq *rq) static inline void balance_push(struct rq *rq) { { } } static inline void balance_push_set(int cpu, bool on) static inline void balance_push_set(int cpu, bool on) { { } } static inline void balance_hotplug_wait(void) static inline void balance_hotplug_wait(void) { { } } #endif /* CONFIG_HOTPLUG_CPU */ #endif /* CONFIG_HOTPLUG_CPU */ void set_rq_online(struct rq *rq) void set_rq_online(struct rq *rq) { { if (!rq->online) { if (!rq->online) { const struct sched_class *class; const struct sched_class *class; cpumask_set_cpu(rq->cpu, rq->rd->online); cpumask_set_cpu(rq->cpu, rq->rd->online); rq->online = 1; rq->online = 1; for_each_class(class) { for_each_class(class) { if (class->rq_online) if (class->rq_online) class->rq_online(rq); class->rq_online(rq); } } } } } } void set_rq_offline(struct rq *rq) void set_rq_offline(struct rq *rq) { { if (rq->online) { if (rq->online) { const struct sched_class *class; const struct sched_class *class; update_rq_clock(rq); update_rq_clock(rq); for_each_class(class) { for_each_class(class) { if (class->rq_offline) if (class->rq_offline) class->rq_offline(rq); class->rq_offline(rq); } } cpumask_clear_cpu(rq->cpu, rq->rd->online); cpumask_clear_cpu(rq->cpu, rq->rd->online); rq->online = 0; rq->online = 0; } } } } /* /* * used to mark begin/end of suspend/resume: * used to mark begin/end of suspend/resume: */ */ static int num_cpus_frozen; static int num_cpus_frozen; /* /* * Update cpusets according to cpu_active mask. If cpusets a * Update cpusets according to cpu_active mask. If cpusets a * disabled, cpuset_update_active_cpus() becomes a simple wra * disabled, cpuset_update_active_cpus() becomes a simple wra * around partition_sched_domains(). * around partition_sched_domains(). * * * If we come here as part of a suspend/resume, don't touch c * If we come here as part of a suspend/resume, don't touch c * want to restore it back to its original state upon resume * want to restore it back to its original state upon resume */ */ static void cpuset_cpu_active(void) static void cpuset_cpu_active(void) { { if (cpuhp_tasks_frozen) { if (cpuhp_tasks_frozen) { /* /* * num_cpus_frozen tracks how many CPUs are i * num_cpus_frozen tracks how many CPUs are i * resume sequence. As long as this is not th * resume sequence. As long as this is not th * operation in the resume sequence, just bui * operation in the resume sequence, just bui * domain, ignoring cpusets. * domain, ignoring cpusets. */ */ partition_sched_domains(1, NULL, NULL); partition_sched_domains(1, NULL, NULL); if (--num_cpus_frozen) if (--num_cpus_frozen) return; return; /* /* * This is the last CPU online operation. So * This is the last CPU online operation. So * restore the original sched domains by cons * restore the original sched domains by cons * cpuset configurations. * cpuset configurations. 
*/ */ cpuset_force_rebuild(); cpuset_force_rebuild(); } } cpuset_update_active_cpus(); cpuset_update_active_cpus(); } } static int cpuset_cpu_inactive(unsigned int cpu) static int cpuset_cpu_inactive(unsigned int cpu) { { if (!cpuhp_tasks_frozen) { if (!cpuhp_tasks_frozen) { int ret = dl_bw_check_overflow(cpu); int ret = dl_bw_check_overflow(cpu); if (ret) if (ret) return ret; return ret; cpuset_update_active_cpus(); cpuset_update_active_cpus(); } else { } else { num_cpus_frozen++; num_cpus_frozen++; partition_sched_domains(1, NULL, NULL); partition_sched_domains(1, NULL, NULL); } } return 0; return 0; } } int sched_cpu_activate(unsigned int cpu) int sched_cpu_activate(unsigned int cpu) { { struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu); struct rq_flags rf; struct rq_flags rf; /* /* * Clear the balance_push callback and prepare to sch * Clear the balance_push callback and prepare to sch * regular tasks. * regular tasks. */ */ balance_push_set(cpu, false); balance_push_set(cpu, false); #ifdef CONFIG_SCHED_SMT #ifdef CONFIG_SCHED_SMT /* /* * When going up, increment the number of cores with * When going up, increment the number of cores with */ */ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) if (cpumask_weight(cpu_smt_mask(cpu)) == 2) static_branch_inc_cpuslocked(&sched_smt_prese static_branch_inc_cpuslocked(&sched_smt_prese #endif #endif set_cpu_active(cpu, true); set_cpu_active(cpu, true); if (sched_smp_initialized) { if (sched_smp_initialized) { sched_update_numa(cpu, true); sched_update_numa(cpu, true); sched_domains_numa_masks_set(cpu); sched_domains_numa_masks_set(cpu); cpuset_cpu_active(); cpuset_cpu_active(); } } /* /* * Put the rq online, if not already. This happens: * Put the rq online, if not already. This happens: * * * 1) In the early boot process, because we build the * 1) In the early boot process, because we build the * after all CPUs have been brought up. * after all CPUs have been brought up. * * * 2) At runtime, if cpuset_cpu_active() fails to reb * 2) At runtime, if cpuset_cpu_active() fails to reb * domains. * domains. */ */ rq_lock_irqsave(rq, &rf); rq_lock_irqsave(rq, &rf); if (rq->rd) { if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_online(rq); set_rq_online(rq); } } rq_unlock_irqrestore(rq, &rf); rq_unlock_irqrestore(rq, &rf); return 0; return 0; } } int sched_cpu_deactivate(unsigned int cpu) int sched_cpu_deactivate(unsigned int cpu) { { struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu); struct rq_flags rf; struct rq_flags rf; int ret; int ret; /* /* * Remove CPU from nohz.idle_cpus_mask to prevent par * Remove CPU from nohz.idle_cpus_mask to prevent par * load balancing when not active * load balancing when not active */ */ nohz_balance_exit_idle(rq); nohz_balance_exit_idle(rq); set_cpu_active(cpu, false); set_cpu_active(cpu, false); /* /* * From this point forward, this CPU will refuse to r * From this point forward, this CPU will refuse to r * is not: migrate_disable() or KTHREAD_IS_PER_CPU, a * is not: migrate_disable() or KTHREAD_IS_PER_CPU, a * push those tasks away until this gets cleared, see * push those tasks away until this gets cleared, see * sched_cpu_dying(). * sched_cpu_dying(). 
*/ */ balance_push_set(cpu, true); balance_push_set(cpu, true); /* /* * We've cleared cpu_active_mask / set balance_push, * We've cleared cpu_active_mask / set balance_push, * preempt-disabled and RCU users of this state to go * preempt-disabled and RCU users of this state to go * all new such users will observe it. * all new such users will observe it. * * * Specifically, we rely on ttwu to no longer target * Specifically, we rely on ttwu to no longer target * ttwu_queue_cond() and is_cpu_allowed(). * ttwu_queue_cond() and is_cpu_allowed(). * * * Do sync before park smpboot threads to take care t * Do sync before park smpboot threads to take care t */ */ synchronize_rcu(); synchronize_rcu(); rq_lock_irqsave(rq, &rf); rq_lock_irqsave(rq, &rf); if (rq->rd) { if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); set_rq_offline(rq); } } rq_unlock_irqrestore(rq, &rf); rq_unlock_irqrestore(rq, &rf); #ifdef CONFIG_SCHED_SMT #ifdef CONFIG_SCHED_SMT /* /* * When going down, decrement the number of cores wit * When going down, decrement the number of cores wit */ */ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) if (cpumask_weight(cpu_smt_mask(cpu)) == 2) static_branch_dec_cpuslocked(&sched_smt_prese static_branch_dec_cpuslocked(&sched_smt_prese sched_core_cpu_deactivate(cpu); sched_core_cpu_deactivate(cpu); #endif #endif if (!sched_smp_initialized) if (!sched_smp_initialized) return 0; return 0; sched_update_numa(cpu, false); sched_update_numa(cpu, false); ret = cpuset_cpu_inactive(cpu); ret = cpuset_cpu_inactive(cpu); if (ret) { if (ret) { balance_push_set(cpu, false); balance_push_set(cpu, false); set_cpu_active(cpu, true); set_cpu_active(cpu, true); sched_update_numa(cpu, true); sched_update_numa(cpu, true); return ret; return ret; } } sched_domains_numa_masks_clear(cpu); sched_domains_numa_masks_clear(cpu); return 0; return 0; } } static void sched_rq_cpu_starting(unsigned int cpu) static void sched_rq_cpu_starting(unsigned int cpu) { { struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu); rq->calc_load_update = calc_load_update; rq->calc_load_update = calc_load_update; update_max_interval(); update_max_interval(); } } int sched_cpu_starting(unsigned int cpu) int sched_cpu_starting(unsigned int cpu) { { sched_core_cpu_starting(cpu); sched_core_cpu_starting(cpu); sched_rq_cpu_starting(cpu); sched_rq_cpu_starting(cpu); sched_tick_start(cpu); sched_tick_start(cpu); return 0; return 0; } } #ifdef CONFIG_HOTPLUG_CPU #ifdef CONFIG_HOTPLUG_CPU /* /* * Invoked immediately before the stopper thread is invoked t * Invoked immediately before the stopper thread is invoked t * CPU down completely. At this point all per CPU kthreads ex * CPU down completely. At this point all per CPU kthreads ex * hotplug thread (current) and the stopper thread (inactive) * hotplug thread (current) and the stopper thread (inactive) * either parked or have been unbound from the outgoing CPU. * either parked or have been unbound from the outgoing CPU. * any of those which might be on the way out are gone. * any of those which might be on the way out are gone. * * * If after this point a bound task is being woken on this CP * If after this point a bound task is being woken on this CP * responsible hotplug callback has failed to do it's job. * responsible hotplug callback has failed to do it's job. 
* sched_cpu_dying() will catch it with the appropriate firew * sched_cpu_dying() will catch it with the appropriate firew */ */ int sched_cpu_wait_empty(unsigned int cpu) int sched_cpu_wait_empty(unsigned int cpu) { { balance_hotplug_wait(); balance_hotplug_wait(); return 0; return 0; } } /* /* * Since this CPU is going 'away' for a while, fold any nr_ac * Since this CPU is going 'away' for a while, fold any nr_ac * might have. Called from the CPU stopper task after ensurin * might have. Called from the CPU stopper task after ensurin * stopper is the last running task on the CPU, so nr_active * stopper is the last running task on the CPU, so nr_active * stable. We need to take the teardown thread which is calli * stable. We need to take the teardown thread which is calli * account, so we hand in adjust = 1 to the load calculation. * account, so we hand in adjust = 1 to the load calculation. * * * Also see the comment "Global load-average calculations". * Also see the comment "Global load-average calculations". */ */ static void calc_load_migrate(struct rq *rq) static void calc_load_migrate(struct rq *rq) { { long delta = calc_load_fold_active(rq, 1); long delta = calc_load_fold_active(rq, 1); if (delta) if (delta) atomic_long_add(delta, &calc_load_tasks); atomic_long_add(delta, &calc_load_tasks); } } static void dump_rq_tasks(struct rq *rq, const char *loglvl) static void dump_rq_tasks(struct rq *rq, const char *loglvl) { { struct task_struct *g, *p; struct task_struct *g, *p; int cpu = cpu_of(rq); int cpu = cpu_of(rq); lockdep_assert_rq_held(rq); lockdep_assert_rq_held(rq); printk("%sCPU%d enqueued tasks (%u total):\n", loglvl printk("%sCPU%d enqueued tasks (%u total):\n", loglvl for_each_process_thread(g, p) { for_each_process_thread(g, p) { if (task_cpu(p) != cpu) if (task_cpu(p) != cpu) continue; continue; if (!task_on_rq_queued(p)) if (!task_on_rq_queued(p)) continue; continue; printk("%s\tpid: %d, name: %s\n", loglvl, p-> printk("%s\tpid: %d, name: %s\n", loglvl, p-> } } } } int sched_cpu_dying(unsigned int cpu) int sched_cpu_dying(unsigned int cpu) { { struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu); struct rq_flags rf; struct rq_flags rf; /* Handle pending wakeups and then migrate everything /* Handle pending wakeups and then migrate everything sched_tick_stop(cpu); sched_tick_stop(cpu); rq_lock_irqsave(rq, &rf); rq_lock_irqsave(rq, &rf); if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) { if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) { WARN(true, "Dying CPU not properly vacated!") WARN(true, "Dying CPU not properly vacated!") dump_rq_tasks(rq, KERN_WARNING); dump_rq_tasks(rq, KERN_WARNING); } } rq_unlock_irqrestore(rq, &rf); rq_unlock_irqrestore(rq, &rf); calc_load_migrate(rq); calc_load_migrate(rq); update_max_interval(); update_max_interval(); hrtick_clear(rq); hrtick_clear(rq); sched_core_cpu_dying(cpu); sched_core_cpu_dying(cpu); return 0; return 0; } } #endif #endif void __init sched_init_smp(void) void __init sched_init_smp(void) { { sched_init_numa(NUMA_NO_NODE); sched_init_numa(NUMA_NO_NODE); /* /* * There's no userspace yet to cause hotplug operatio * There's no userspace yet to cause hotplug operatio * CPU masks are stable and all blatant races in the * CPU masks are stable and all blatant races in the * happen. * happen. 
*/ */ mutex_lock(&sched_domains_mutex); mutex_lock(&sched_domains_mutex); sched_init_domains(cpu_active_mask); sched_init_domains(cpu_active_mask); mutex_unlock(&sched_domains_mutex); mutex_unlock(&sched_domains_mutex); /* Move init over to a non-isolated CPU */ /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, housekeeping_cpumas if (set_cpus_allowed_ptr(current, housekeeping_cpumas BUG(); BUG(); current->flags &= ~PF_NO_SETAFFINITY; current->flags &= ~PF_NO_SETAFFINITY; sched_init_granularity(); sched_init_granularity(); init_sched_rt_class(); init_sched_rt_class(); init_sched_dl_class(); init_sched_dl_class(); sched_smp_initialized = true; sched_smp_initialized = true; } } static int __init migration_init(void) static int __init migration_init(void) { { sched_cpu_starting(smp_processor_id()); sched_cpu_starting(smp_processor_id()); return 0; return 0; } } early_initcall(migration_init); early_initcall(migration_init); #else #else void __init sched_init_smp(void) void __init sched_init_smp(void) { { sched_init_granularity(); sched_init_granularity(); } } #endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */ int in_sched_functions(unsigned long addr) int in_sched_functions(unsigned long addr) { { return in_lock_functions(addr) || return in_lock_functions(addr) || (addr >= (unsigned long)__sched_text_start (addr >= (unsigned long)__sched_text_start && addr < (unsigned long)__sched_text_end); && addr < (unsigned long)__sched_text_end); } } #ifdef CONFIG_CGROUP_SCHED #ifdef CONFIG_CGROUP_SCHED /* /* * Default task group. * Default task group. * Every task in system belongs to this group at bootup. * Every task in system belongs to this group at bootup. */ */ struct task_group root_task_group; struct task_group root_task_group; LIST_HEAD(task_groups); LIST_HEAD(task_groups); /* Cacheline aligned slab cache for task_group */ /* Cacheline aligned slab cache for task_group */ static struct kmem_cache *task_group_cache __read_mostly; static struct kmem_cache *task_group_cache __read_mostly; #endif #endif void __init sched_init(void) void __init sched_init(void) { { unsigned long ptr = 0; unsigned long ptr = 0; int i; int i; /* Make sure the linker didn't screw up */ /* Make sure the linker didn't screw up */ BUG_ON(&idle_sched_class != &fair_sched_class + 1 || BUG_ON(&idle_sched_class != &fair_sched_class + 1 || &fair_sched_class != &rt_sched_class + 1 || &fair_sched_class != &rt_sched_class + 1 || &rt_sched_class != &dl_sched_class + 1); &rt_sched_class != &dl_sched_class + 1); #ifdef CONFIG_SMP #ifdef CONFIG_SMP BUG_ON(&dl_sched_class != &stop_sched_class + 1); BUG_ON(&dl_sched_class != &stop_sched_class + 1); #endif #endif wait_bit_init(); wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED ptr += 2 * nr_cpu_ids * sizeof(void **); ptr += 2 * nr_cpu_ids * sizeof(void **); #endif #endif #ifdef CONFIG_RT_GROUP_SCHED #ifdef CONFIG_RT_GROUP_SCHED ptr += 2 * nr_cpu_ids * sizeof(void **); ptr += 2 * nr_cpu_ids * sizeof(void **); #endif #endif if (ptr) { if (ptr) { ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT) ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT) #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.se = (struct sched_entity **) root_task_group.se = (struct sched_entity **) ptr += nr_cpu_ids * sizeof(void **); ptr += nr_cpu_ids * sizeof(void **); root_task_group.cfs_rq = (struct cfs_rq **)pt root_task_group.cfs_rq = (struct cfs_rq **)pt ptr += nr_cpu_ids * sizeof(void **); ptr += nr_cpu_ids * sizeof(void **); 
root_task_group.shares = ROOT_TASK_GROUP_LOAD root_task_group.shares = ROOT_TASK_GROUP_LOAD init_cfs_bandwidth(&root_task_group.cfs_bandw init_cfs_bandwidth(&root_task_group.cfs_bandw #endif /* CONFIG_FAIR_GROUP_SCHED */ #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED #ifdef CONFIG_RT_GROUP_SCHED root_task_group.rt_se = (struct sched_rt_enti root_task_group.rt_se = (struct sched_rt_enti ptr += nr_cpu_ids * sizeof(void **); ptr += nr_cpu_ids * sizeof(void **); root_task_group.rt_rq = (struct rt_rq **)ptr; root_task_group.rt_rq = (struct rt_rq **)ptr; ptr += nr_cpu_ids * sizeof(void **); ptr += nr_cpu_ids * sizeof(void **); #endif /* CONFIG_RT_GROUP_SCHED */ #endif /* CONFIG_RT_GROUP_SCHED */ } } init_rt_bandwidth(&def_rt_bandwidth, global_rt_period init_rt_bandwidth(&def_rt_bandwidth, global_rt_period #ifdef CONFIG_SMP #ifdef CONFIG_SMP init_defrootdomain(); init_defrootdomain(); #endif #endif #ifdef CONFIG_RT_GROUP_SCHED #ifdef CONFIG_RT_GROUP_SCHED init_rt_bandwidth(&root_task_group.rt_bandwidth, init_rt_bandwidth(&root_task_group.rt_bandwidth, global_rt_period(), global_rt_runtime global_rt_period(), global_rt_runtime #endif /* CONFIG_RT_GROUP_SCHED */ #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_CGROUP_SCHED #ifdef CONFIG_CGROUP_SCHED task_group_cache = KMEM_CACHE(task_group, 0); task_group_cache = KMEM_CACHE(task_group, 0); list_add(&root_task_group.list, &task_groups); list_add(&root_task_group.list, &task_groups); INIT_LIST_HEAD(&root_task_group.children); INIT_LIST_HEAD(&root_task_group.children); INIT_LIST_HEAD(&root_task_group.siblings); INIT_LIST_HEAD(&root_task_group.siblings); autogroup_init(&init_task); autogroup_init(&init_task); #endif /* CONFIG_CGROUP_SCHED */ #endif /* CONFIG_CGROUP_SCHED */ for_each_possible_cpu(i) { for_each_possible_cpu(i) { struct rq *rq; struct rq *rq; rq = cpu_rq(i); rq = cpu_rq(i); raw_spin_lock_init(&rq->__lock); raw_spin_lock_init(&rq->__lock); rq->nr_running = 0; rq->nr_running = 0; rq->calc_load_active = 0; rq->calc_load_active = 0; rq->calc_load_update = jiffies + LOAD_FREQ; rq->calc_load_update = jiffies + LOAD_FREQ; init_cfs_rq(&rq->cfs); init_cfs_rq(&rq->cfs); init_rt_rq(&rq->rt); init_rt_rq(&rq->rt); init_dl_rq(&rq->dl); init_dl_rq(&rq->dl); #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; /* /* * How much CPU bandwidth does root_task_grou * How much CPU bandwidth does root_task_grou * * * In case of task-groups formed thr' the cgr * In case of task-groups formed thr' the cgr * gets 100% of the CPU resources in the syst * gets 100% of the CPU resources in the syst * system CPU resource is divided among the t * system CPU resource is divided among the t * root_task_group and its child task-groups * root_task_group and its child task-groups * based on each entity's (task or task-group * based on each entity's (task or task-group * (se->load.weight). * (se->load.weight). 
* * * In other words, if root_task_group has 10 * In other words, if root_task_group has 10 * 1024) and two child groups A0 and A1 (of w * 1024) and two child groups A0 and A1 (of w * then A0's share of the CPU resource is: * then A0's share of the CPU resource is: * * * A0's bandwidth = 1024 / (10*1024 + 10 * A0's bandwidth = 1024 / (10*1024 + 10 * * * We achieve this by letting root_task_group * We achieve this by letting root_task_group * directly in rq->cfs (i.e root_task_group-> * directly in rq->cfs (i.e root_task_group-> */ */ init_tg_cfs_entry(&root_task_group, &rq->cfs, init_tg_cfs_entry(&root_task_group, &rq->cfs, #endif /* CONFIG_FAIR_GROUP_SCHED */ #endif /* CONFIG_FAIR_GROUP_SCHED */ rq->rt.rt_runtime = def_rt_bandwidth.rt_runti rq->rt.rt_runtime = def_rt_bandwidth.rt_runti #ifdef CONFIG_RT_GROUP_SCHED #ifdef CONFIG_RT_GROUP_SCHED init_tg_rt_entry(&root_task_group, &rq->rt, N init_tg_rt_entry(&root_task_group, &rq->rt, N #endif #endif #ifdef CONFIG_SMP #ifdef CONFIG_SMP rq->sd = NULL; rq->sd = NULL; rq->rd = NULL; rq->rd = NULL; rq->cpu_capacity = rq->cpu_capacity_orig = SC rq->cpu_capacity = rq->cpu_capacity_orig = SC rq->balance_callback = &balance_push_callback rq->balance_callback = &balance_push_callback rq->active_balance = 0; rq->active_balance = 0; rq->next_balance = jiffies; rq->next_balance = jiffies; rq->push_cpu = 0; rq->push_cpu = 0; rq->cpu = i; rq->cpu = i; rq->online = 0; rq->online = 0; rq->idle_stamp = 0; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; rq->avg_idle = 2*sysctl_sched_migration_cost; rq->wake_stamp = jiffies; rq->wake_stamp = jiffies; rq->wake_avg_idle = rq->avg_idle; rq->wake_avg_idle = rq->avg_idle; rq->max_idle_balance_cost = sysctl_sched_migr rq->max_idle_balance_cost = sysctl_sched_migr INIT_LIST_HEAD(&rq->cfs_tasks); INIT_LIST_HEAD(&rq->cfs_tasks); rq_attach_root(rq, &def_root_domain); rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ_COMMON #ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; rq->last_blocked_load_update_tick = jiffies; atomic_set(&rq->nohz_flags, 0); atomic_set(&rq->nohz_flags, 0); INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq); INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq); #endif #endif #ifdef CONFIG_HOTPLUG_CPU #ifdef CONFIG_HOTPLUG_CPU rcuwait_init(&rq->hotplug_wait); rcuwait_init(&rq->hotplug_wait); #endif #endif #endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */ hrtick_rq_init(rq); hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); atomic_set(&rq->nr_iowait, 0); #ifdef CONFIG_SCHED_CORE #ifdef CONFIG_SCHED_CORE rq->core = rq; rq->core = rq; rq->core_pick = NULL; rq->core_pick = NULL; rq->core_enabled = 0; rq->core_enabled = 0; rq->core_tree = RB_ROOT; rq->core_tree = RB_ROOT; rq->core_forceidle_count = 0; rq->core_forceidle_count = 0; rq->core_forceidle_occupation = 0; rq->core_forceidle_occupation = 0; rq->core_forceidle_start = 0; rq->core_forceidle_start = 0; rq->core_cookie = 0UL; rq->core_cookie = 0UL; #endif #endif zalloc_cpumask_var_node(&rq->scratch_mask, GF zalloc_cpumask_var_node(&rq->scratch_mask, GF } } set_load_weight(&init_task, false); set_load_weight(&init_task, false); /* /* * The boot idle thread does lazy MMU switching as we * The boot idle thread does lazy MMU switching as we */ */ mmgrab_lazy_tlb(&init_mm); mmgrab_lazy_tlb(&init_mm); enter_lazy_tlb(&init_mm, current); enter_lazy_tlb(&init_mm, current); /* /* * The idle task doesn't need the kthread struct to f * The idle task doesn't need the kthread struct to f * is dressed up as a per-CPU kthread and thus 
needs * is dressed up as a per-CPU kthread and thus needs * if we want to avoid special-casing it in code that * if we want to avoid special-casing it in code that * kthreads. * kthreads. */ */ WARN_ON(!set_kthread_struct(current)); WARN_ON(!set_kthread_struct(current)); /* /* * Make us the idle thread. Technically, schedule() s * Make us the idle thread. Technically, schedule() s * called from this thread, however somewhere below i * called from this thread, however somewhere below i * but because we are the idle thread, we just pick u * but because we are the idle thread, we just pick u * when this runqueue becomes "idle". * when this runqueue becomes "idle". */ */ init_idle(current, smp_processor_id()); init_idle(current, smp_processor_id()); calc_load_update = jiffies + LOAD_FREQ; calc_load_update = jiffies + LOAD_FREQ; #ifdef CONFIG_SMP #ifdef CONFIG_SMP idle_thread_set_boot_cpu(); idle_thread_set_boot_cpu(); balance_push_set(smp_processor_id(), false); balance_push_set(smp_processor_id(), false); #endif #endif init_sched_fair_class(); init_sched_fair_class(); psi_init(); psi_init(); init_uclamp(); init_uclamp(); preempt_dynamic_init(); preempt_dynamic_init(); scheduler_running = 1; scheduler_running = 1; } } #ifdef CONFIG_DEBUG_ATOMIC_SLEEP #ifdef CONFIG_DEBUG_ATOMIC_SLEEP void __might_sleep(const char *file, int line) void __might_sleep(const char *file, int line) { { unsigned int state = get_current_state(); unsigned int state = get_current_state(); /* /* * Blocking primitives will set (and therefore destro * Blocking primitives will set (and therefore destro * since we will exit with TASK_RUNNING make sure we * since we will exit with TASK_RUNNING make sure we * otherwise we will destroy state. * otherwise we will destroy state. */ */ WARN_ONCE(state != TASK_RUNNING && current->task_stat WARN_ONCE(state != TASK_RUNNING && current->task_stat "do not call blocking ops when !TASK_ "do not call blocking ops when !TASK_ "state=%x set at [<%p>] %pS\n", state "state=%x set at [<%p>] %pS\n", state (void *)current->task_state_change, (void *)current->task_state_change, (void *)current->task_state_change); (void *)current->task_state_change); __might_resched(file, line, 0); __might_resched(file, line, 0); } } EXPORT_SYMBOL(__might_sleep); EXPORT_SYMBOL(__might_sleep); static void print_preempt_disable_ip(int preempt_offset, unsi static void print_preempt_disable_ip(int preempt_offset, unsi { { if (!IS_ENABLED(CONFIG_DEBUG_PREEMPT)) if (!IS_ENABLED(CONFIG_DEBUG_PREEMPT)) return; return; if (preempt_count() == preempt_offset) if (preempt_count() == preempt_offset) return; return; pr_err("Preemption disabled at:"); pr_err("Preemption disabled at:"); print_ip_sym(KERN_ERR, ip); print_ip_sym(KERN_ERR, ip); } } static inline bool resched_offsets_ok(unsigned int offsets) static inline bool resched_offsets_ok(unsigned int offsets) { { unsigned int nested = preempt_count(); unsigned int nested = preempt_count(); nested += rcu_preempt_depth() << MIGHT_RESCHED_RCU_SH nested += rcu_preempt_depth() << MIGHT_RESCHED_RCU_SH return nested == offsets; return nested == offsets; } } void __might_resched(const char *file, int line, unsigned int void __might_resched(const char *file, int line, unsigned int { { /* Ratelimiting timestamp: */ /* Ratelimiting timestamp: */ static unsigned long prev_jiffy; static unsigned long prev_jiffy; unsigned long preempt_disable_ip; unsigned long preempt_disable_ip; /* WARN_ON_ONCE() by default, no rate limit required: /* WARN_ON_ONCE() by default, no rate limit required: 
rcu_sleep_check(); rcu_sleep_check(); if ((resched_offsets_ok(offsets) && !irqs_disabled() if ((resched_offsets_ok(offsets) && !irqs_disabled() !is_idle_task(current) && !current->non_block_co !is_idle_task(current) && !current->non_block_co system_state == SYSTEM_BOOTING || system_state > system_state == SYSTEM_BOOTING || system_state > oops_in_progress) oops_in_progress) return; return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jif if (time_before(jiffies, prev_jiffy + HZ) && prev_jif return; return; prev_jiffy = jiffies; prev_jiffy = jiffies; /* Save this before calling printk(), since that will /* Save this before calling printk(), since that will preempt_disable_ip = get_preempt_disable_ip(current); preempt_disable_ip = get_preempt_disable_ip(current); pr_err("BUG: sleeping function called from invalid co pr_err("BUG: sleeping function called from invalid co file, line); file, line); pr_err("in_atomic(): %d, irqs_disabled(): %d, non_blo pr_err("in_atomic(): %d, irqs_disabled(): %d, non_blo in_atomic(), irqs_disabled(), current->non_blo in_atomic(), irqs_disabled(), current->non_blo current->pid, current->comm); current->pid, current->comm); pr_err("preempt_count: %x, expected: %x\n", preempt_c pr_err("preempt_count: %x, expected: %x\n", preempt_c offsets & MIGHT_RESCHED_PREEMPT_MASK); offsets & MIGHT_RESCHED_PREEMPT_MASK); if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { pr_err("RCU nest depth: %d, expected: %u\n", pr_err("RCU nest depth: %d, expected: %u\n", rcu_preempt_depth(), offsets >> MIGHT_ rcu_preempt_depth(), offsets >> MIGHT_ } } if (task_stack_end_corrupted(current)) if (task_stack_end_corrupted(current)) pr_emerg("Thread overran stack, or stack corr pr_emerg("Thread overran stack, or stack corr debug_show_held_locks(current); debug_show_held_locks(current); if (irqs_disabled()) if (irqs_disabled()) print_irqtrace_events(current); print_irqtrace_events(current); print_preempt_disable_ip(offsets & MIGHT_RESCHED_PREE print_preempt_disable_ip(offsets & MIGHT_RESCHED_PREE preempt_disable_ip); preempt_disable_ip); dump_stack(); dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } } EXPORT_SYMBOL(__might_resched); EXPORT_SYMBOL(__might_resched); void __cant_sleep(const char *file, int line, int preempt_off void __cant_sleep(const char *file, int line, int preempt_off { { static unsigned long prev_jiffy; static unsigned long prev_jiffy; if (irqs_disabled()) if (irqs_disabled()) return; return; if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) return; return; if (preempt_count() > preempt_offset) if (preempt_count() > preempt_offset) return; return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jif if (time_before(jiffies, prev_jiffy + HZ) && prev_jif return; return; prev_jiffy = jiffies; prev_jiffy = jiffies; printk(KERN_ERR "BUG: assuming atomic context at %s:% printk(KERN_ERR "BUG: assuming atomic context at %s:% printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d in_atomic(), irqs_disabled(), in_atomic(), irqs_disabled(), current->pid, current->comm); current->pid, current->comm); debug_show_held_locks(current); debug_show_held_locks(current); dump_stack(); dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } } EXPORT_SYMBOL_GPL(__cant_sleep); EXPORT_SYMBOL_GPL(__cant_sleep); #ifdef CONFIG_SMP #ifdef CONFIG_SMP void __cant_migrate(const char *file, int line) void 
__cant_migrate(const char *file, int line) { { static unsigned long prev_jiffy; static unsigned long prev_jiffy; if (irqs_disabled()) if (irqs_disabled()) return; return; if (is_migration_disabled(current)) if (is_migration_disabled(current)) return; return; if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) return; return; if (preempt_count() > 0) if (preempt_count() > 0) return; return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jif if (time_before(jiffies, prev_jiffy + HZ) && prev_jif return; return; prev_jiffy = jiffies; prev_jiffy = jiffies; pr_err("BUG: assuming non migratable context at %s:%d pr_err("BUG: assuming non migratable context at %s:%d pr_err("in_atomic(): %d, irqs_disabled(): %d, migrati pr_err("in_atomic(): %d, irqs_disabled(): %d, migrati in_atomic(), irqs_disabled(), is_migration_dis in_atomic(), irqs_disabled(), is_migration_dis current->pid, current->comm); current->pid, current->comm); debug_show_held_locks(current); debug_show_held_locks(current); dump_stack(); dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } } EXPORT_SYMBOL_GPL(__cant_migrate); EXPORT_SYMBOL_GPL(__cant_migrate); #endif #endif #endif #endif #ifdef CONFIG_MAGIC_SYSRQ #ifdef CONFIG_MAGIC_SYSRQ void normalize_rt_tasks(void) void normalize_rt_tasks(void) { { struct task_struct *g, *p; struct task_struct *g, *p; struct sched_attr attr = { struct sched_attr attr = { .sched_policy = SCHED_NORMAL, .sched_policy = SCHED_NORMAL, }; }; read_lock(&tasklist_lock); read_lock(&tasklist_lock); for_each_process_thread(g, p) { for_each_process_thread(g, p) { /* /* * Only normalize user tasks: * Only normalize user tasks: */ */ if (p->flags & PF_KTHREAD) if (p->flags & PF_KTHREAD) continue; continue; p->se.exec_start = 0; p->se.exec_start = 0; schedstat_set(p->stats.wait_start, 0); schedstat_set(p->stats.wait_start, 0); schedstat_set(p->stats.sleep_start, 0); schedstat_set(p->stats.sleep_start, 0); schedstat_set(p->stats.block_start, 0); schedstat_set(p->stats.block_start, 0); if (!dl_task(p) && !rt_task(p)) { if (!dl_task(p) && !rt_task(p)) { /* /* * Renice negative nice level userspa * Renice negative nice level userspa * tasks back to 0: * tasks back to 0: */ */ if (task_nice(p) < 0) if (task_nice(p) < 0) set_user_nice(p, 0); set_user_nice(p, 0); continue; continue; } } __sched_setscheduler(p, &attr, false, false); __sched_setscheduler(p, &attr, false, false); } } read_unlock(&tasklist_lock); read_unlock(&tasklist_lock); } } #endif /* CONFIG_MAGIC_SYSRQ */ #endif /* CONFIG_MAGIC_SYSRQ */ #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) /* /* * These functions are only useful for the IA64 MCA handling, * These functions are only useful for the IA64 MCA handling, * * * They can only be called when the whole system has been * They can only be called when the whole system has been * stopped - every CPU needs to be quiescent, and no scheduli * stopped - every CPU needs to be quiescent, and no scheduli * activity can take place. Using them for anything else woul * activity can take place. Using them for anything else woul * be a serious bug, and as a result, they aren't even visibl * be a serious bug, and as a result, they aren't even visibl * under any other configuration. * under any other configuration. */ */ /** /** * curr_task - return the current task for a given CPU. * curr_task - return the current task for a given CPU. * @cpu: the processor in question. 
* @cpu: the processor in question. * * * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! * * * Return: The current task for @cpu. * Return: The current task for @cpu. */ */ struct task_struct *curr_task(int cpu) struct task_struct *curr_task(int cpu) { { return cpu_curr(cpu); return cpu_curr(cpu); } } #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ #ifdef CONFIG_IA64 #ifdef CONFIG_IA64 /** /** * ia64_set_curr_task - set the current task for a given CPU. * ia64_set_curr_task - set the current task for a given CPU. * @cpu: the processor in question. * @cpu: the processor in question. * @p: the task pointer to set. * @p: the task pointer to set. * * * Description: This function must only be used when non-mask * Description: This function must only be used when non-mask * are serviced on a separate stack. It allows the architectu * are serviced on a separate stack. It allows the architectu * notion of the current task on a CPU in a non-blocking mann * notion of the current task on a CPU in a non-blocking mann * must be called with all CPU's synchronized, and interrupts * must be called with all CPU's synchronized, and interrupts * and caller must save the original value of the current tas * and caller must save the original value of the current tas * curr_task() above) and restore that value before reenablin * curr_task() above) and restore that value before reenablin * re-starting the system. * re-starting the system. * * * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! */ */ void ia64_set_curr_task(int cpu, struct task_struct *p) void ia64_set_curr_task(int cpu, struct task_struct *p) { { cpu_curr(cpu) = p; cpu_curr(cpu) = p; } } #endif #endif #ifdef CONFIG_CGROUP_SCHED #ifdef CONFIG_CGROUP_SCHED /* task_group_lock serializes the addition/removal of task gr /* task_group_lock serializes the addition/removal of task gr static DEFINE_SPINLOCK(task_group_lock); static DEFINE_SPINLOCK(task_group_lock); static inline void alloc_uclamp_sched_group(struct task_group static inline void alloc_uclamp_sched_group(struct task_group struct task_group struct task_group { { #ifdef CONFIG_UCLAMP_TASK_GROUP #ifdef CONFIG_UCLAMP_TASK_GROUP enum uclamp_id clamp_id; enum uclamp_id clamp_id; for_each_clamp_id(clamp_id) { for_each_clamp_id(clamp_id) { uclamp_se_set(&tg->uclamp_req[clamp_id], uclamp_se_set(&tg->uclamp_req[clamp_id], uclamp_none(clamp_id), false); uclamp_none(clamp_id), false); tg->uclamp[clamp_id] = parent->uclamp[clamp_i tg->uclamp[clamp_id] = parent->uclamp[clamp_i } } #endif #endif } } static void sched_free_group(struct task_group *tg) static void sched_free_group(struct task_group *tg) { { free_fair_sched_group(tg); free_fair_sched_group(tg); free_rt_sched_group(tg); free_rt_sched_group(tg); autogroup_free(tg); autogroup_free(tg); kmem_cache_free(task_group_cache, tg); kmem_cache_free(task_group_cache, tg); } } static void sched_free_group_rcu(struct rcu_head *rcu) static void sched_free_group_rcu(struct rcu_head *rcu) { { sched_free_group(container_of(rcu, struct task_group, sched_free_group(container_of(rcu, struct task_group, } } static void sched_unregister_group(struct task_group *tg) static void sched_unregister_group(struct task_group *tg) { { unregister_fair_sched_group(tg); unregister_fair_sched_group(tg); unregister_rt_sched_group(tg); unregister_rt_sched_group(tg); /* /* * We have to wait for yet another RCU grace 
period t * We have to wait for yet another RCU grace period t * print_cfs_stats() might run concurrently. * print_cfs_stats() might run concurrently. */ */ call_rcu(&tg->rcu, sched_free_group_rcu); call_rcu(&tg->rcu, sched_free_group_rcu); } } /* allocate runqueue etc for a new task group */ /* allocate runqueue etc for a new task group */ struct task_group *sched_create_group(struct task_group *pare struct task_group *sched_create_group(struct task_group *pare { { struct task_group *tg; struct task_group *tg; tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | if (!tg) if (!tg) return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM); if (!alloc_fair_sched_group(tg, parent)) if (!alloc_fair_sched_group(tg, parent)) goto err; goto err; if (!alloc_rt_sched_group(tg, parent)) if (!alloc_rt_sched_group(tg, parent)) goto err; goto err; alloc_uclamp_sched_group(tg, parent); alloc_uclamp_sched_group(tg, parent); return tg; return tg; err: err: sched_free_group(tg); sched_free_group(tg); return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM); } } void sched_online_group(struct task_group *tg, struct task_gr void sched_online_group(struct task_group *tg, struct task_gr { { unsigned long flags; unsigned long flags; spin_lock_irqsave(&task_group_lock, flags); spin_lock_irqsave(&task_group_lock, flags); list_add_rcu(&tg->list, &task_groups); list_add_rcu(&tg->list, &task_groups); /* Root should already exist: */ /* Root should already exist: */ WARN_ON(!parent); WARN_ON(!parent); tg->parent = parent; tg->parent = parent; INIT_LIST_HEAD(&tg->children); INIT_LIST_HEAD(&tg->children); list_add_rcu(&tg->siblings, &parent->children); list_add_rcu(&tg->siblings, &parent->children); spin_unlock_irqrestore(&task_group_lock, flags); spin_unlock_irqrestore(&task_group_lock, flags); online_fair_sched_group(tg); online_fair_sched_group(tg); } } /* rcu callback to free various structures associated with a /* rcu callback to free various structures associated with a static void sched_unregister_group_rcu(struct rcu_head *rhp) static void sched_unregister_group_rcu(struct rcu_head *rhp) { { /* Now it should be safe to free those cfs_rqs: */ /* Now it should be safe to free those cfs_rqs: */ sched_unregister_group(container_of(rhp, struct task_ sched_unregister_group(container_of(rhp, struct task_ } } void sched_destroy_group(struct task_group *tg) void sched_destroy_group(struct task_group *tg) { { /* Wait for possible concurrent references to cfs_rqs /* Wait for possible concurrent references to cfs_rqs call_rcu(&tg->rcu, sched_unregister_group_rcu); call_rcu(&tg->rcu, sched_unregister_group_rcu); } } void sched_release_group(struct task_group *tg) void sched_release_group(struct task_group *tg) { { unsigned long flags; unsigned long flags; /* /* * Unlink first, to avoid walk_tg_tree_from() from fi * Unlink first, to avoid walk_tg_tree_from() from fi * sched_cfs_period_timer()). * sched_cfs_period_timer()). * * * For this to be effective, we have to wait for all * For this to be effective, we have to wait for all * this task group to leave their RCU critical sectio * this task group to leave their RCU critical sectio * user will see our dying task group any more. Speci * user will see our dying task group any more. 
Speci * that tg_unthrottle_up() won't add decayed cfs_rq's * that tg_unthrottle_up() won't add decayed cfs_rq's * * * We therefore defer calling unregister_fair_sched_g * We therefore defer calling unregister_fair_sched_g * sched_unregister_group() which is guarantied to ge * sched_unregister_group() which is guarantied to ge * current RCU grace period has expired. * current RCU grace period has expired. */ */ spin_lock_irqsave(&task_group_lock, flags); spin_lock_irqsave(&task_group_lock, flags); list_del_rcu(&tg->list); list_del_rcu(&tg->list); list_del_rcu(&tg->siblings); list_del_rcu(&tg->siblings); spin_unlock_irqrestore(&task_group_lock, flags); spin_unlock_irqrestore(&task_group_lock, flags); } } static struct task_group *sched_get_task_group(struct task_st static struct task_group *sched_get_task_group(struct task_st { { struct task_group *tg; struct task_group *tg; /* /* * All callers are synchronized by task_rq_lock(); we * All callers are synchronized by task_rq_lock(); we * which is pointless here. Thus, we pass "true" to t * which is pointless here. Thus, we pass "true" to t * to prevent lockdep warnings. * to prevent lockdep warnings. */ */ tg = container_of(task_css_check(tsk, cpu_cgrp_id, tr tg = container_of(task_css_check(tsk, cpu_cgrp_id, tr struct task_group, css); struct task_group, css); tg = autogroup_task_group(tsk, tg); tg = autogroup_task_group(tsk, tg); return tg; return tg; } } static void sched_change_group(struct task_struct *tsk, struc static void sched_change_group(struct task_struct *tsk, struc { { tsk->sched_task_group = group; tsk->sched_task_group = group; #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_change_group) if (tsk->sched_class->task_change_group) tsk->sched_class->task_change_group(tsk); tsk->sched_class->task_change_group(tsk); else else #endif #endif set_task_rq(tsk, task_cpu(tsk)); set_task_rq(tsk, task_cpu(tsk)); } } /* /* * Change task's runqueue when it moves between groups. * Change task's runqueue when it moves between groups. * * * The caller of this function should have put the task in it * The caller of this function should have put the task in it * now. This function just updates tsk->se.cfs_rq and tsk->se * now. This function just updates tsk->se.cfs_rq and tsk->se * its new group. * its new group. */ */ void sched_move_task(struct task_struct *tsk) void sched_move_task(struct task_struct *tsk) { { int queued, running, queue_flags = int queued, running, queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK struct task_group *group; struct task_group *group; struct rq_flags rf; struct rq_flags rf; struct rq *rq; struct rq *rq; rq = task_rq_lock(tsk, &rf); rq = task_rq_lock(tsk, &rf); /* /* * Esp. with SCHED_AUTOGROUP enabled it is possible t * Esp. with SCHED_AUTOGROUP enabled it is possible t * group changes. * group changes. 
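	 *
	 * (Illustrative note: sched_get_task_group() folds autogroup into
	 * the lookup, so a move reported by the cgroup core may still
	 * resolve to the task's current effective task_group; the early-out
	 * below then avoids a pointless dequeue/re-enqueue cycle.)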
*/ */ group = sched_get_task_group(tsk); group = sched_get_task_group(tsk); if (group == tsk->sched_task_group) if (group == tsk->sched_task_group) goto unlock; goto unlock; update_rq_clock(rq); update_rq_clock(rq); running = task_current(rq, tsk); running = task_current(rq, tsk); queued = task_on_rq_queued(tsk); queued = task_on_rq_queued(tsk); if (queued) if (queued) dequeue_task(rq, tsk, queue_flags); dequeue_task(rq, tsk, queue_flags); if (running) if (running) put_prev_task(rq, tsk); put_prev_task(rq, tsk); sched_change_group(tsk, group); sched_change_group(tsk, group); if (queued) if (queued) enqueue_task(rq, tsk, queue_flags); enqueue_task(rq, tsk, queue_flags); if (running) { if (running) { set_next_task(rq, tsk); set_next_task(rq, tsk); /* /* * After changing group, the running task may * After changing group, the running task may * throttled one but it's still the running t * throttled one but it's still the running t * resched to make sure that task can still r * resched to make sure that task can still r */ */ resched_curr(rq); resched_curr(rq); } } unlock: unlock: task_rq_unlock(rq, tsk, &rf); task_rq_unlock(rq, tsk, &rf); } } static inline struct task_group *css_tg(struct cgroup_subsys_ static inline struct task_group *css_tg(struct cgroup_subsys_ { { return css ? container_of(css, struct task_group, css return css ? container_of(css, struct task_group, css } } static struct cgroup_subsys_state * static struct cgroup_subsys_state * cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { { struct task_group *parent = css_tg(parent_css); struct task_group *parent = css_tg(parent_css); struct task_group *tg; struct task_group *tg; if (!parent) { if (!parent) { /* This is early initialization for the top c /* This is early initialization for the top c return &root_task_group.css; return &root_task_group.css; } } tg = sched_create_group(parent); tg = sched_create_group(parent); if (IS_ERR(tg)) if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM); return &tg->css; return &tg->css; } } /* Expose task group only after completing cgroup initializat /* Expose task group only after completing cgroup initializat static int cpu_cgroup_css_online(struct cgroup_subsys_state * static int cpu_cgroup_css_online(struct cgroup_subsys_state * { { struct task_group *tg = css_tg(css); struct task_group *tg = css_tg(css); struct task_group *parent = css_tg(css->parent); struct task_group *parent = css_tg(css->parent); if (parent) if (parent) sched_online_group(tg, parent); sched_online_group(tg, parent); #ifdef CONFIG_UCLAMP_TASK_GROUP #ifdef CONFIG_UCLAMP_TASK_GROUP /* Propagate the effective uclamp value for the new g /* Propagate the effective uclamp value for the new g mutex_lock(&uclamp_mutex); mutex_lock(&uclamp_mutex); rcu_read_lock(); rcu_read_lock(); cpu_util_update_eff(css); cpu_util_update_eff(css); rcu_read_unlock(); rcu_read_unlock(); mutex_unlock(&uclamp_mutex); mutex_unlock(&uclamp_mutex); #endif #endif return 0; return 0; } } static void cpu_cgroup_css_released(struct cgroup_subsys_stat static void cpu_cgroup_css_released(struct cgroup_subsys_stat { { struct task_group *tg = css_tg(css); struct task_group *tg = css_tg(css); sched_release_group(tg); sched_release_group(tg); } } static void cpu_cgroup_css_free(struct cgroup_subsys_state *c static void cpu_cgroup_css_free(struct cgroup_subsys_state *c { { struct task_group *tg = css_tg(css); struct task_group *tg = css_tg(css); /* /* * Relies on the RCU 
grace period between css_release * Relies on the RCU grace period between css_release */ */ sched_unregister_group(tg); sched_unregister_group(tg); } } #ifdef CONFIG_RT_GROUP_SCHED #ifdef CONFIG_RT_GROUP_SCHED static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { { struct task_struct *task; struct task_struct *task; struct cgroup_subsys_state *css; struct cgroup_subsys_state *css; cgroup_taskset_for_each(task, css, tset) { cgroup_taskset_for_each(task, css, tset) { if (!sched_rt_can_attach(css_tg(css), task)) if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; return -EINVAL; } } return 0; return 0; } } #endif #endif static void cpu_cgroup_attach(struct cgroup_taskset *tset) static void cpu_cgroup_attach(struct cgroup_taskset *tset) { { struct task_struct *task; struct task_struct *task; struct cgroup_subsys_state *css; struct cgroup_subsys_state *css; cgroup_taskset_for_each(task, css, tset) cgroup_taskset_for_each(task, css, tset) sched_move_task(task); sched_move_task(task); } } #ifdef CONFIG_UCLAMP_TASK_GROUP #ifdef CONFIG_UCLAMP_TASK_GROUP static void cpu_util_update_eff(struct cgroup_subsys_state *c static void cpu_util_update_eff(struct cgroup_subsys_state *c { { struct cgroup_subsys_state *top_css = css; struct cgroup_subsys_state *top_css = css; struct uclamp_se *uc_parent = NULL; struct uclamp_se *uc_parent = NULL; struct uclamp_se *uc_se = NULL; struct uclamp_se *uc_se = NULL; unsigned int eff[UCLAMP_CNT]; unsigned int eff[UCLAMP_CNT]; enum uclamp_id clamp_id; enum uclamp_id clamp_id; unsigned int clamps; unsigned int clamps; lockdep_assert_held(&uclamp_mutex); lockdep_assert_held(&uclamp_mutex); SCHED_WARN_ON(!rcu_read_lock_held()); SCHED_WARN_ON(!rcu_read_lock_held()); css_for_each_descendant_pre(css, top_css) { css_for_each_descendant_pre(css, top_css) { uc_parent = css_tg(css)->parent uc_parent = css_tg(css)->parent ? css_tg(css)->parent->uclamp : NULL; ? css_tg(css)->parent->uclamp : NULL; for_each_clamp_id(clamp_id) { for_each_clamp_id(clamp_id) { /* Assume effective clamps matches re /* Assume effective clamps matches re eff[clamp_id] = css_tg(css)->uclamp_r eff[clamp_id] = css_tg(css)->uclamp_r /* Cap effective clamps with parent's /* Cap effective clamps with parent's if (uc_parent && if (uc_parent && eff[clamp_id] > uc_parent[clamp_i eff[clamp_id] > uc_parent[clamp_i eff[clamp_id] = uc_parent[cla eff[clamp_id] = uc_parent[cla } } } } /* Ensure protection is always capped by limi /* Ensure protection is always capped by limi eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UC eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UC /* Propagate most restrictive effective clamp /* Propagate most restrictive effective clamp clamps = 0x0; clamps = 0x0; uc_se = css_tg(css)->uclamp; uc_se = css_tg(css)->uclamp; for_each_clamp_id(clamp_id) { for_each_clamp_id(clamp_id) { if (eff[clamp_id] == uc_se[clamp_id]. if (eff[clamp_id] == uc_se[clamp_id]. 
continue; continue; uc_se[clamp_id].value = eff[clamp_id] uc_se[clamp_id].value = eff[clamp_id] uc_se[clamp_id].bucket_id = uclamp_bu uc_se[clamp_id].bucket_id = uclamp_bu clamps |= (0x1 << clamp_id); clamps |= (0x1 << clamp_id); } } if (!clamps) { if (!clamps) { css = css_rightmost_descendant(css); css = css_rightmost_descendant(css); continue; continue; } } /* Immediately update descendants RUNNABLE ta /* Immediately update descendants RUNNABLE ta uclamp_update_active_tasks(css); uclamp_update_active_tasks(css); } } } } /* /* * Integer 10^N with a given N exponent by casting to integer * Integer 10^N with a given N exponent by casting to integer * C expression. Since there is no way to convert a macro arg * C expression. Since there is no way to convert a macro arg * character constant, use two levels of macros. * character constant, use two levels of macros. */ */ #define _POW10(exp) ((unsigned int)1e##exp) #define _POW10(exp) ((unsigned int)1e##exp) #define POW10(exp) _POW10(exp) #define POW10(exp) _POW10(exp) struct uclamp_request { struct uclamp_request { #define UCLAMP_PERCENT_SHIFT 2 #define UCLAMP_PERCENT_SHIFT 2 #define UCLAMP_PERCENT_SCALE (100 * POW10(UCLAMP_PERCENT_S #define UCLAMP_PERCENT_SCALE (100 * POW10(UCLAMP_PERCENT_S s64 percent; s64 percent; u64 util; u64 util; int ret; int ret; }; }; static inline struct uclamp_request static inline struct uclamp_request capacity_from_percent(char *buf) capacity_from_percent(char *buf) { { struct uclamp_request req = { struct uclamp_request req = { .percent = UCLAMP_PERCENT_SCALE, .percent = UCLAMP_PERCENT_SCALE, .util = SCHED_CAPACITY_SCALE, .util = SCHED_CAPACITY_SCALE, .ret = 0, .ret = 0, }; }; buf = strim(buf); buf = strim(buf); if (strcmp(buf, "max")) { if (strcmp(buf, "max")) { req.ret = cgroup_parse_float(buf, UCLAMP_PERC req.ret = cgroup_parse_float(buf, UCLAMP_PERC &req.percent); &req.percent); if (req.ret) if (req.ret) return req; return req; if ((u64)req.percent > UCLAMP_PERCENT_SCALE) if ((u64)req.percent > UCLAMP_PERCENT_SCALE) req.ret = -ERANGE; req.ret = -ERANGE; return req; return req; } } req.util = req.percent << SCHED_CAPACITY_SHIF req.util = req.percent << SCHED_CAPACITY_SHIF req.util = DIV_ROUND_CLOSEST_ULL(req.util, UC req.util = DIV_ROUND_CLOSEST_ULL(req.util, UC } } return req; return req; } } static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, size_t nbytes, loff_t off, size_t nbytes, loff_t off, enum uclamp_id clamp_id) enum uclamp_id clamp_id) { { struct uclamp_request req; struct uclamp_request req; struct task_group *tg; struct task_group *tg; req = capacity_from_percent(buf); req = capacity_from_percent(buf); if (req.ret) if (req.ret) return req.ret; return req.ret; static_branch_enable(&sched_uclamp_used); static_branch_enable(&sched_uclamp_used); mutex_lock(&uclamp_mutex); mutex_lock(&uclamp_mutex); rcu_read_lock(); rcu_read_lock(); tg = css_tg(of_css(of)); tg = css_tg(of_css(of)); if (tg->uclamp_req[clamp_id].value != req.util) if (tg->uclamp_req[clamp_id].value != req.util) uclamp_se_set(&tg->uclamp_req[clamp_id], req. uclamp_se_set(&tg->uclamp_req[clamp_id], req. 
/* /* * Because of not recoverable conversion rounding we * Because of not recoverable conversion rounding we * exact requested value * exact requested value */ */ tg->uclamp_pct[clamp_id] = req.percent; tg->uclamp_pct[clamp_id] = req.percent; /* Update effective clamps to track the most restrict /* Update effective clamps to track the most restrict cpu_util_update_eff(of_css(of)); cpu_util_update_eff(of_css(of)); rcu_read_unlock(); rcu_read_unlock(); mutex_unlock(&uclamp_mutex); mutex_unlock(&uclamp_mutex); return nbytes; return nbytes; } } static ssize_t cpu_uclamp_min_write(struct kernfs_open_file * static ssize_t cpu_uclamp_min_write(struct kernfs_open_file * char *buf, size_t nbytes, char *buf, size_t nbytes, loff_t off) loff_t off) { { return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_ return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_ } } static ssize_t cpu_uclamp_max_write(struct kernfs_open_file * static ssize_t cpu_uclamp_max_write(struct kernfs_open_file * char *buf, size_t nbytes, char *buf, size_t nbytes, loff_t off) loff_t off) { { return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_ return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_ } } static inline void cpu_uclamp_print(struct seq_file *sf, static inline void cpu_uclamp_print(struct seq_file *sf, enum uclamp_id clamp_id) enum uclamp_id clamp_id) { { struct task_group *tg; struct task_group *tg; u64 util_clamp; u64 util_clamp; u64 percent; u64 percent; u32 rem; u32 rem; rcu_read_lock(); rcu_read_lock(); tg = css_tg(seq_css(sf)); tg = css_tg(seq_css(sf)); util_clamp = tg->uclamp_req[clamp_id].value; util_clamp = tg->uclamp_req[clamp_id].value; rcu_read_unlock(); rcu_read_unlock(); if (util_clamp == SCHED_CAPACITY_SCALE) { if (util_clamp == SCHED_CAPACITY_SCALE) { seq_puts(sf, "max\n"); seq_puts(sf, "max\n"); return; return; } } percent = tg->uclamp_pct[clamp_id]; percent = tg->uclamp_pct[clamp_id]; percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_S percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_S seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT } } static int cpu_uclamp_min_show(struct seq_file *sf, void *v) static int cpu_uclamp_min_show(struct seq_file *sf, void *v) { { cpu_uclamp_print(sf, UCLAMP_MIN); cpu_uclamp_print(sf, UCLAMP_MIN); return 0; return 0; } } static int cpu_uclamp_max_show(struct seq_file *sf, void *v) static int cpu_uclamp_max_show(struct seq_file *sf, void *v) { { cpu_uclamp_print(sf, UCLAMP_MAX); cpu_uclamp_print(sf, UCLAMP_MAX); return 0; return 0; } } #endif /* CONFIG_UCLAMP_TASK_GROUP */ #endif /* CONFIG_UCLAMP_TASK_GROUP */ #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED static int cpu_shares_write_u64(struct cgroup_subsys_state *c static int cpu_shares_write_u64(struct cgroup_subsys_state *c struct cftype *cftype, u64 sh struct cftype *cftype, u64 sh { { if (shareval > scale_load_down(ULONG_MAX)) if (shareval > scale_load_down(ULONG_MAX)) shareval = MAX_SHARES; shareval = MAX_SHARES; return sched_group_set_shares(css_tg(css), scale_load return sched_group_set_shares(css_tg(css), scale_load } } static u64 cpu_shares_read_u64(struct cgroup_subsys_state *cs static u64 cpu_shares_read_u64(struct cgroup_subsys_state *cs struct cftype *cft) struct cftype *cft) { { struct task_group *tg = css_tg(css); struct task_group *tg = css_tg(css); return (u64) scale_load_down(tg->shares); return (u64) scale_load_down(tg->shares); } } #ifdef CONFIG_CFS_BANDWIDTH #ifdef CONFIG_CFS_BANDWIDTH static 
DEFINE_MUTEX(cfs_constraints_mutex);

const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
/* More than 203 days if BW_SHIFT equals 20. */
static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;

static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);

static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
				u64 burst)
{
	int i, ret = 0, runtime_enabled, runtime_was_enabled;
	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;

	if (tg == &root_task_group)
		return -EINVAL;

	/*
	 * Ensure we have at some amount of bandwidth every period.  This is
	 * to prevent reaching a state of large arrears when throttled via
	 * entity_tick() resulting in prolonged exit starvation.
	 */
	if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
		return -EINVAL;

	/*
	 * Likewise, bound things on the other side by preventing excessive
	 * periods.  This also allows us to normalize in computing quota
	 * feasibility.
	 */
	if (period > max_cfs_quota_period)
		return -EINVAL;

	/*
	 * Bound quota to defend quota against overflow during bandwidth shift.
	 */
	if (quota != RUNTIME_INF && quota > max_cfs_runtime)
		return -EINVAL;

	if (quota != RUNTIME_INF && (burst > quota ||
				     burst + quota > max_cfs_runtime))
		return -EINVAL;

	/*
	 * Prevent race between setting of cfs_rq->runtime_enabled and
	 * unthrottle_offline_cfs_rqs().
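	 *
	 * (Worked example, illustrative: with the checks above satisfied,
	 * quota = 200000us and period = 100000us let this group consume up
	 * to two CPUs worth of runtime per period, while quota = 50000us
	 * with the same period caps it at half a CPU; burst may temporarily
	 * add unused quota from earlier periods on top of that.)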
	 */
	guard(cpus_read_lock)();
	guard(mutex)(&cfs_constraints_mutex);

	ret = __cfs_schedulable(tg, period, quota);
	if (ret)
		return ret;

	runtime_enabled = quota != RUNTIME_INF;
	runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
	/*
	 * If we need to toggle cfs_bandwidth_used, off->on must occur
	 * before making related changes, and on->off must occur afterwards
	 */
	if (runtime_enabled && !runtime_was_enabled)
		cfs_bandwidth_usage_inc();

	scoped_guard (raw_spinlock_irq, &cfs_b->lock) {
		cfs_b->period = ns_to_ktime(period);
		cfs_b->quota = quota;
		cfs_b->burst = burst;

		__refill_cfs_bandwidth_runtime(cfs_b);

		/*
		 * Restart the period timer (if active) to handle new
		 * period expiry:
		 */
		if (runtime_enabled)
			start_cfs_bandwidth(cfs_b);
	}

	for_each_online_cpu(i) {
		struct cfs_rq *cfs_rq = tg->cfs_rq[i];
		struct rq *rq = cfs_rq->rq;

		guard(rq_lock_irq)(rq);
		cfs_rq->runtime_enabled = runtime_enabled;
		cfs_rq->runtime_remaining = 0;

		if (cfs_rq->throttled)
			unthrottle_cfs_rq(cfs_rq);
	}

	if (runtime_was_enabled && !runtime_enabled)
		cfs_bandwidth_usage_dec();

	return 0;
}

static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
{
	u64 quota, period, burst;

	period = ktime_to_ns(tg->cfs_bandwidth.period);
	burst = tg->cfs_bandwidth.burst;
	if (cfs_quota_us < 0)
		quota = RUNTIME_INF;
	else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
		quota = (u64)cfs_quota_us * NSEC_PER_USEC;
	else
		return -EINVAL;

	return tg_set_cfs_bandwidth(tg, period, quota, burst);
}

static long tg_get_cfs_quota(struct task_group *tg)
{
	u64 quota_us;

	if (tg->cfs_bandwidth.quota == RUNTIME_INF)
		return -1;

	quota_us = tg->cfs_bandwidth.quota;
	do_div(quota_us, NSEC_PER_USEC);

	return quota_us;
}

static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
{
	u64 quota, period, burst;

	if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
		return -EINVAL;

	period
= (u64)cfs_period_us * NSEC_PER_USEC; period = (u64)cfs_period_us * NSEC_PER_USEC; quota = tg->cfs_bandwidth.quota; quota = tg->cfs_bandwidth.quota; burst = tg->cfs_bandwidth.burst; burst = tg->cfs_bandwidth.burst; return tg_set_cfs_bandwidth(tg, period, quota, burst) return tg_set_cfs_bandwidth(tg, period, quota, burst) } } static long tg_get_cfs_period(struct task_group *tg) static long tg_get_cfs_period(struct task_group *tg) { { u64 cfs_period_us; u64 cfs_period_us; cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period) cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period) do_div(cfs_period_us, NSEC_PER_USEC); do_div(cfs_period_us, NSEC_PER_USEC); return cfs_period_us; return cfs_period_us; } } static int tg_set_cfs_burst(struct task_group *tg, long cfs_b static int tg_set_cfs_burst(struct task_group *tg, long cfs_b { { u64 quota, period, burst; u64 quota, period, burst; if ((u64)cfs_burst_us > U64_MAX / NSEC_PER_USEC) if ((u64)cfs_burst_us > U64_MAX / NSEC_PER_USEC) return -EINVAL; return -EINVAL; burst = (u64)cfs_burst_us * NSEC_PER_USEC; burst = (u64)cfs_burst_us * NSEC_PER_USEC; period = ktime_to_ns(tg->cfs_bandwidth.period); period = ktime_to_ns(tg->cfs_bandwidth.period); quota = tg->cfs_bandwidth.quota; quota = tg->cfs_bandwidth.quota; return tg_set_cfs_bandwidth(tg, period, quota, burst) return tg_set_cfs_bandwidth(tg, period, quota, burst) } } static long tg_get_cfs_burst(struct task_group *tg) static long tg_get_cfs_burst(struct task_group *tg) { { u64 burst_us; u64 burst_us; burst_us = tg->cfs_bandwidth.burst; burst_us = tg->cfs_bandwidth.burst; do_div(burst_us, NSEC_PER_USEC); do_div(burst_us, NSEC_PER_USEC); return burst_us; return burst_us; } } static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state struct cftype *cft) struct cftype *cft) { { return tg_get_cfs_quota(css_tg(css)); return tg_get_cfs_quota(css_tg(css)); } } static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state struct cftype *cftype, s64 struct cftype *cftype, s64 { { return tg_set_cfs_quota(css_tg(css), cfs_quota_us); return tg_set_cfs_quota(css_tg(css), cfs_quota_us); } } static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state struct cftype *cft) struct cftype *cft) { { return tg_get_cfs_period(css_tg(css)); return tg_get_cfs_period(css_tg(css)); } } static int cpu_cfs_period_write_u64(struct cgroup_subsys_stat static int cpu_cfs_period_write_u64(struct cgroup_subsys_stat struct cftype *cftype, u6 struct cftype *cftype, u6 { { return tg_set_cfs_period(css_tg(css), cfs_period_us); return tg_set_cfs_period(css_tg(css), cfs_period_us); } } static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state struct cftype *cft) struct cftype *cft) { { return tg_get_cfs_burst(css_tg(css)); return tg_get_cfs_burst(css_tg(css)); } } static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state struct cftype *cftype, u64 struct cftype *cftype, u64 { { return tg_set_cfs_burst(css_tg(css), cfs_burst_us); return tg_set_cfs_burst(css_tg(css), cfs_burst_us); } } struct cfs_schedulable_data { struct cfs_schedulable_data { struct task_group *tg; struct task_group *tg; u64 period, quota; u64 period, quota; }; }; /* /* * normalize group quota/period to be quota/max_period * normalize group quota/period to be 
quota/max_period
 * note: units are usecs
 */
static u64 normalize_cfs_quota(struct task_group *tg,
			       struct cfs_schedulable_data *d)
{
	u64 quota, period;

	if (tg == d->tg) {
		period = d->period;
		quota = d->quota;
	} else {
		period = tg_get_cfs_period(tg);
		quota = tg_get_cfs_quota(tg);
	}

	/* note: these should typically be equivalent */
	if (quota == RUNTIME_INF || quota == -1)
		return RUNTIME_INF;

	return to_ratio(period, quota);
}

static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
{
	struct cfs_schedulable_data *d = data;
	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
	s64 quota = 0, parent_quota = -1;

	if (!tg->parent) {
		quota = RUNTIME_INF;
	} else {
		struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;

		quota = normalize_cfs_quota(tg, d);
		parent_quota = parent_b->hierarchical_quota;

		/*
		 * Ensure max(child_quota) <= parent_quota.  On cgroup2,
		 * always take the non-RUNTIME_INF min.  On cgroup1, only
		 * inherit when no limit is set. In both cases this is used
		 * by the scheduler to determine if a given CFS task has a
		 * bandwidth constraint at some higher level.
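		 *
		 * (Illustrative example: on cgroup2 a child asking for the
		 * equivalent of four CPUs under a parent limited to two gets
		 * its hierarchical_quota clamped to the parent's value,
		 * whereas on cgroup1 the same over-commit is rejected with
		 * -EINVAL below.)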
*/ */ if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) { if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) { if (quota == RUNTIME_INF) if (quota == RUNTIME_INF) quota = parent_quota; quota = parent_quota; else if (parent_quota != RUNTIME_INF) else if (parent_quota != RUNTIME_INF) quota = min(quota, parent_quo quota = min(quota, parent_quo } else { } else { if (quota == RUNTIME_INF) if (quota == RUNTIME_INF) quota = parent_quota; quota = parent_quota; else if (parent_quota != RUNTIME_INF else if (parent_quota != RUNTIME_INF return -EINVAL; return -EINVAL; } } } } cfs_b->hierarchical_quota = quota; cfs_b->hierarchical_quota = quota; return 0; return 0; } } static int __cfs_schedulable(struct task_group *tg, u64 perio static int __cfs_schedulable(struct task_group *tg, u64 perio { { int ret; int ret; struct cfs_schedulable_data data = { struct cfs_schedulable_data data = { .tg = tg, .tg = tg, .period = period, .period = period, .quota = quota, .quota = quota, }; }; if (quota != RUNTIME_INF) { if (quota != RUNTIME_INF) { do_div(data.period, NSEC_PER_USEC); do_div(data.period, NSEC_PER_USEC); do_div(data.quota, NSEC_PER_USEC); do_div(data.quota, NSEC_PER_USEC); } } rcu_read_lock(); rcu_read_lock(); ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, & ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, & rcu_read_unlock(); rcu_read_unlock(); return ret; return ret; } } static int cpu_cfs_stat_show(struct seq_file *sf, void *v) static int cpu_cfs_stat_show(struct seq_file *sf, void *v) { { struct task_group *tg = css_tg(seq_css(sf)); struct task_group *tg = css_tg(seq_css(sf)); struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods); seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods); seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttl seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttl seq_printf(sf, "throttled_time %llu\n", cfs_b->thrott seq_printf(sf, "throttled_time %llu\n", cfs_b->thrott if (schedstat_enabled() && tg != &root_task_group) { if (schedstat_enabled() && tg != &root_task_group) { struct sched_statistics *stats; struct sched_statistics *stats; u64 ws = 0; u64 ws = 0; int i; int i; for_each_possible_cpu(i) { for_each_possible_cpu(i) { stats = __schedstats_from_se(tg->se[i stats = __schedstats_from_se(tg->se[i ws += schedstat_val(stats->wait_sum); ws += schedstat_val(stats->wait_sum); } } seq_printf(sf, "wait_sum %llu\n", ws); seq_printf(sf, "wait_sum %llu\n", ws); } } seq_printf(sf, "nr_bursts %d\n", cfs_b->nr_burst); seq_printf(sf, "nr_bursts %d\n", cfs_b->nr_burst); seq_printf(sf, "burst_time %llu\n", cfs_b->burst_time seq_printf(sf, "burst_time %llu\n", cfs_b->burst_time return 0; return 0; } } static u64 throttled_time_self(struct task_group *tg) static u64 throttled_time_self(struct task_group *tg) { { int i; int i; u64 total = 0; u64 total = 0; for_each_possible_cpu(i) { for_each_possible_cpu(i) { total += READ_ONCE(tg->cfs_rq[i]->throttled_c total += READ_ONCE(tg->cfs_rq[i]->throttled_c } } return total; return total; } } static int cpu_cfs_local_stat_show(struct seq_file *sf, void static int cpu_cfs_local_stat_show(struct seq_file *sf, void { { struct task_group *tg = css_tg(seq_css(sf)); struct task_group *tg = css_tg(seq_css(sf)); seq_printf(sf, "throttled_time %llu\n", throttled_tim seq_printf(sf, "throttled_time %llu\n", throttled_tim return 0; return 0; } } #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ #endif /* 
CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED #ifdef CONFIG_RT_GROUP_SCHED static int cpu_rt_runtime_write(struct cgroup_subsys_state *c static int cpu_rt_runtime_write(struct cgroup_subsys_state *c struct cftype *cft, s64 val) struct cftype *cft, s64 val) { { return sched_group_set_rt_runtime(css_tg(css), val); return sched_group_set_rt_runtime(css_tg(css), val); } } static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *cs static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *cs struct cftype *cft) struct cftype *cft) { { return sched_group_rt_runtime(css_tg(css)); return sched_group_rt_runtime(css_tg(css)); } } static int cpu_rt_period_write_uint(struct cgroup_subsys_stat static int cpu_rt_period_write_uint(struct cgroup_subsys_stat struct cftype *cftype, u6 struct cftype *cftype, u6 { { return sched_group_set_rt_period(css_tg(css), rt_peri return sched_group_set_rt_period(css_tg(css), rt_peri } } static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state struct cftype *cft) struct cftype *cft) { { return sched_group_rt_period(css_tg(css)); return sched_group_rt_period(css_tg(css)); } } #endif /* CONFIG_RT_GROUP_SCHED */ #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css, static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) struct cftype *cft) { { return css_tg(css)->idle; return css_tg(css)->idle; } } static int cpu_idle_write_s64(struct cgroup_subsys_state *css static int cpu_idle_write_s64(struct cgroup_subsys_state *css struct cftype *cft, s64 idle) struct cftype *cft, s64 idle) { { return sched_group_set_idle(css_tg(css), idle); return sched_group_set_idle(css_tg(css), idle); } } #endif #endif static struct cftype cpu_legacy_files[] = { static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED { { .name = "shares", .name = "shares", .read_u64 = cpu_shares_read_u64, .read_u64 = cpu_shares_read_u64, .write_u64 = cpu_shares_write_u64, .write_u64 = cpu_shares_write_u64, }, }, { { .name = "idle", .name = "idle", .read_s64 = cpu_idle_read_s64, .read_s64 = cpu_idle_read_s64, .write_s64 = cpu_idle_write_s64, .write_s64 = cpu_idle_write_s64, }, }, #endif #endif #ifdef CONFIG_CFS_BANDWIDTH #ifdef CONFIG_CFS_BANDWIDTH { { .name = "cfs_quota_us", .name = "cfs_quota_us", .read_s64 = cpu_cfs_quota_read_s64, .read_s64 = cpu_cfs_quota_read_s64, .write_s64 = cpu_cfs_quota_write_s64, .write_s64 = cpu_cfs_quota_write_s64, }, }, { { .name = "cfs_period_us", .name = "cfs_period_us", .read_u64 = cpu_cfs_period_read_u64, .read_u64 = cpu_cfs_period_read_u64, .write_u64 = cpu_cfs_period_write_u64, .write_u64 = cpu_cfs_period_write_u64, }, }, { { .name = "cfs_burst_us", .name = "cfs_burst_us", .read_u64 = cpu_cfs_burst_read_u64, .read_u64 = cpu_cfs_burst_read_u64, .write_u64 = cpu_cfs_burst_write_u64, .write_u64 = cpu_cfs_burst_write_u64, }, }, { { .name = "stat", .name = "stat", .seq_show = cpu_cfs_stat_show, .seq_show = cpu_cfs_stat_show, }, }, { { .name = "stat.local", .name = "stat.local", .seq_show = cpu_cfs_local_stat_show, .seq_show = cpu_cfs_local_stat_show, }, }, #endif #endif #ifdef CONFIG_RT_GROUP_SCHED #ifdef CONFIG_RT_GROUP_SCHED { { .name = "rt_runtime_us", .name = "rt_runtime_us", .read_s64 = cpu_rt_runtime_read, .read_s64 = cpu_rt_runtime_read, .write_s64 = cpu_rt_runtime_write, .write_s64 = cpu_rt_runtime_write, }, }, { { .name 
= "rt_period_us", .name = "rt_period_us", .read_u64 = cpu_rt_period_read_uint, .read_u64 = cpu_rt_period_read_uint, .write_u64 = cpu_rt_period_write_uint, .write_u64 = cpu_rt_period_write_uint, }, }, #endif #endif #ifdef CONFIG_UCLAMP_TASK_GROUP #ifdef CONFIG_UCLAMP_TASK_GROUP { { .name = "uclamp.min", .name = "uclamp.min", .flags = CFTYPE_NOT_ON_ROOT, .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_uclamp_min_show, .seq_show = cpu_uclamp_min_show, .write = cpu_uclamp_min_write, .write = cpu_uclamp_min_write, }, }, { { .name = "uclamp.max", .name = "uclamp.max", .flags = CFTYPE_NOT_ON_ROOT, .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_uclamp_max_show, .seq_show = cpu_uclamp_max_show, .write = cpu_uclamp_max_write, .write = cpu_uclamp_max_write, }, }, #endif #endif { } /* Terminate */ { } /* Terminate */ }; }; static int cpu_extra_stat_show(struct seq_file *sf, static int cpu_extra_stat_show(struct seq_file *sf, struct cgroup_subsys_state *cs struct cgroup_subsys_state *cs { { #ifdef CONFIG_CFS_BANDWIDTH #ifdef CONFIG_CFS_BANDWIDTH { { struct task_group *tg = css_tg(css); struct task_group *tg = css_tg(css); struct cfs_bandwidth *cfs_b = &tg->cfs_bandwi struct cfs_bandwidth *cfs_b = &tg->cfs_bandwi u64 throttled_usec, burst_usec; u64 throttled_usec, burst_usec; throttled_usec = cfs_b->throttled_time; throttled_usec = cfs_b->throttled_time; do_div(throttled_usec, NSEC_PER_USEC); do_div(throttled_usec, NSEC_PER_USEC); burst_usec = cfs_b->burst_time; burst_usec = cfs_b->burst_time; do_div(burst_usec, NSEC_PER_USEC); do_div(burst_usec, NSEC_PER_USEC); seq_printf(sf, "nr_periods %d\n" seq_printf(sf, "nr_periods %d\n" "nr_throttled %d\n" "nr_throttled %d\n" "throttled_usec %llu\n" "throttled_usec %llu\n" "nr_bursts %d\n" "nr_bursts %d\n" "burst_usec %llu\n", "burst_usec %llu\n", cfs_b->nr_periods, cfs_b->nr_throt cfs_b->nr_periods, cfs_b->nr_throt throttled_usec, cfs_b->nr_burst, b throttled_usec, cfs_b->nr_burst, b } } #endif #endif return 0; return 0; } } static int cpu_local_stat_show(struct seq_file *sf, static int cpu_local_stat_show(struct seq_file *sf, struct cgroup_subsys_state *cs struct cgroup_subsys_state *cs { { #ifdef CONFIG_CFS_BANDWIDTH #ifdef CONFIG_CFS_BANDWIDTH { { struct task_group *tg = css_tg(css); struct task_group *tg = css_tg(css); u64 throttled_self_usec; u64 throttled_self_usec; throttled_self_usec = throttled_time_self(tg) throttled_self_usec = throttled_time_self(tg) do_div(throttled_self_usec, NSEC_PER_USEC); do_div(throttled_self_usec, NSEC_PER_USEC); seq_printf(sf, "throttled_usec %llu\n", seq_printf(sf, "throttled_usec %llu\n", throttled_self_usec); throttled_self_usec); } } #endif #endif return 0; return 0; } } #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED static u64 cpu_weight_read_u64(struct cgroup_subsys_state *cs static u64 cpu_weight_read_u64(struct cgroup_subsys_state *cs struct cftype *cft) struct cftype *cft) { { struct task_group *tg = css_tg(css); struct task_group *tg = css_tg(css); u64 weight = scale_load_down(tg->shares); u64 weight = scale_load_down(tg->shares); return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_D return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_D } } static int cpu_weight_write_u64(struct cgroup_subsys_state *c static int cpu_weight_write_u64(struct cgroup_subsys_state *c struct cftype *cft, u64 weigh struct cftype *cft, u64 weigh { { /* /* * cgroup weight knobs should use the common MIN, DFL * cgroup weight knobs should use the common MIN, DFL * values which are 1, 100 and 10000 respectively. 
	 * While it loses a bit of range on both ends, it maps pretty well
	 * onto the shares value used by scheduler and the round-trip
	 * conversions preserve the original value over the entire range.
	 */
	if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
		return -ERANGE;

	weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);

	return sched_group_set_shares(css_tg(css), scale_load(weight));
}

static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
				    struct cftype *cft)
{
	unsigned long weight = scale_load_down(css_tg(css)->shares);
	int last_delta = INT_MAX;
	int prio, delta;

	/* find the closest nice value to the current weight */
	for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
		delta = abs(sched_prio_to_weight[prio] - weight);
		if (delta >= last_delta)
			break;
		last_delta = delta;
	}

	return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO);
}

static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
				     struct cftype *cft, s64 nice)
{
	unsigned long weight;
	int idx;

	if (nice < MIN_NICE || nice > MAX_NICE)
		return -ERANGE;

	idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO;
	idx = array_index_nospec(idx, 40);
	weight = sched_prio_to_weight[idx];

	return sched_group_set_shares(css_tg(css), scale_load(weight));
}
#endif

static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
						  long period, long quota)
{
	if (quota < 0)
		seq_puts(sf, "max");
	else
		seq_printf(sf, "%ld", quota);

	seq_printf(sf, " %ld\n", period);
}

/* caller should put the current value in *@periodp before calling */
static int __maybe_unused cpu_period_quota_parse(char *buf,
						 u64 *periodp, u64 *quotap)
{
	char tok[21];	/* U64_MAX */

	if (sscanf(buf, "%20s %llu", tok, periodp) < 1)
		return -EINVAL;

	*periodp *= NSEC_PER_USEC;

	if (sscanf(tok, "%llu", quotap))
		*quotap *= NSEC_PER_USEC;
	else if (!strcmp(tok, "max"))
		*quotap = RUNTIME_INF;
	else
		return -EINVAL;

	return 0;
}

#ifdef CONFIG_CFS_BANDWIDTH
static int cpu_max_show(struct seq_file
*sf, void *v) static int cpu_max_show(struct seq_file *sf, void *v) { { struct task_group *tg = css_tg(seq_css(sf)); struct task_group *tg = css_tg(seq_css(sf)); cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_ cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_ return 0; return 0; } } static ssize_t cpu_max_write(struct kernfs_open_file *of, static ssize_t cpu_max_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t char *buf, size_t nbytes, loff_t { { struct task_group *tg = css_tg(of_css(of)); struct task_group *tg = css_tg(of_css(of)); u64 period = tg_get_cfs_period(tg); u64 period = tg_get_cfs_period(tg); u64 burst = tg_get_cfs_burst(tg); u64 burst = tg_get_cfs_burst(tg); u64 quota; u64 quota; int ret; int ret; ret = cpu_period_quota_parse(buf, &period, "a); ret = cpu_period_quota_parse(buf, &period, "a); if (!ret) if (!ret) ret = tg_set_cfs_bandwidth(tg, period, quota, ret = tg_set_cfs_bandwidth(tg, period, quota, return ret ?: nbytes; return ret ?: nbytes; } } #endif #endif static struct cftype cpu_files[] = { static struct cftype cpu_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED { { .name = "weight", .name = "weight", .flags = CFTYPE_NOT_ON_ROOT, .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = cpu_weight_read_u64, .read_u64 = cpu_weight_read_u64, .write_u64 = cpu_weight_write_u64, .write_u64 = cpu_weight_write_u64, }, }, { { .name = "weight.nice", .name = "weight.nice", .flags = CFTYPE_NOT_ON_ROOT, .flags = CFTYPE_NOT_ON_ROOT, .read_s64 = cpu_weight_nice_read_s64, .read_s64 = cpu_weight_nice_read_s64, .write_s64 = cpu_weight_nice_write_s64, .write_s64 = cpu_weight_nice_write_s64, }, }, { { .name = "idle", .name = "idle", .flags = CFTYPE_NOT_ON_ROOT, .flags = CFTYPE_NOT_ON_ROOT, .read_s64 = cpu_idle_read_s64, .read_s64 = cpu_idle_read_s64, .write_s64 = cpu_idle_write_s64, .write_s64 = cpu_idle_write_s64, }, }, #endif #endif #ifdef CONFIG_CFS_BANDWIDTH #ifdef CONFIG_CFS_BANDWIDTH { { .name = "max", .name = "max", .flags = CFTYPE_NOT_ON_ROOT, .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_max_show, .seq_show = cpu_max_show, .write = cpu_max_write, .write = cpu_max_write, }, }, { { .name = "max.burst", .name = "max.burst", .flags = CFTYPE_NOT_ON_ROOT, .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = cpu_cfs_burst_read_u64, .read_u64 = cpu_cfs_burst_read_u64, .write_u64 = cpu_cfs_burst_write_u64, .write_u64 = cpu_cfs_burst_write_u64, }, }, #endif #endif #ifdef CONFIG_UCLAMP_TASK_GROUP #ifdef CONFIG_UCLAMP_TASK_GROUP { { .name = "uclamp.min", .name = "uclamp.min", .flags = CFTYPE_NOT_ON_ROOT, .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_uclamp_min_show, .seq_show = cpu_uclamp_min_show, .write = cpu_uclamp_min_write, .write = cpu_uclamp_min_write, }, }, { { .name = "uclamp.max", .name = "uclamp.max", .flags = CFTYPE_NOT_ON_ROOT, .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_uclamp_max_show, .seq_show = cpu_uclamp_max_show, .write = cpu_uclamp_max_write, .write = cpu_uclamp_max_write, }, }, #endif #endif { } /* terminate */ { } /* terminate */ }; }; struct cgroup_subsys cpu_cgrp_subsys = { struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, .css_alloc = cpu_cgroup_css_alloc, .css_online = cpu_cgroup_css_online, .css_online = cpu_cgroup_css_online, .css_released = cpu_cgroup_css_released, .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, .css_free = cpu_cgroup_css_free, .css_extra_stat_show = cpu_extra_stat_show, .css_extra_stat_show = cpu_extra_stat_show, .css_local_stat_show = cpu_local_stat_show, 
#ifdef CONFIG_RT_GROUP_SCHED
	.can_attach	= cpu_cgroup_can_attach,
#endif
	.attach		= cpu_cgroup_attach,
	.legacy_cftypes	= cpu_legacy_files,
	.dfl_cftypes	= cpu_files,
	.early_init	= true,
	.threaded	= true,
};

#endif	/* CONFIG_CGROUP_SCHED */

void dump_cpu_task(int cpu)
{
	if (cpu == smp_processor_id() && in_hardirq()) {
		struct pt_regs *regs;

		regs = get_irq_regs();
		if (regs) {
			show_regs(regs);
			return;
		}
	}

	if (trigger_single_cpu_backtrace(cpu))
		return;

	pr_info("Task dump for CPU %d:\n", cpu);
	sched_show_task(cpu_curr(cpu));
}

/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
 * If a task goes up by ~10% and another task goes down by ~10% then
 * the relative distance between them is ~25%.)
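 *
 * (Worked example: weight(nice 0) = 1024 and weight(nice 1) = 820, see the
 * table below. Two CPU-bound tasks sharing one CPU therefore split it
 * roughly 1024/(1024+820) ~= 55% vs 45%, and 1024/820 ~= 1.25 is exactly
 * the multiplier mentioned above.)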
 */
const int sched_prio_to_weight[40] = {
 /* -20 */     88761,     71755,     56483,     46273,     36291,
 /* -15 */     29154,     23254,     18705,     14949,     11916,
 /* -10 */      9548,      7620,      6100,      4904,      3906,
 /*  -5 */      3121,      2501,      1991,      1586,      1277,
 /*   0 */      1024,       820,       655,       526,       423,
 /*   5 */       335,       272,       215,       172,       137,
 /*  10 */       110,        87,        70,        56,        45,
 /*  15 */        36,        29,        23,        18,        15,
};

/*
 * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
 *
 * In cases where the weight does not change often, we can use the
 * precalculated inverse to speed up arithmetics by turning divisions
 * into multiplications:
 */
const u32 sched_prio_to_wmult[40] = {
 /* -20 */     48388,     59856,     76040,     92818,    118348,
 /* -15 */    147320,    184698,    229616,    287308,    360437,
 /* -10 */    449829,    563644,    704093,    875809,   1099582,
 /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
 /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
 /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
 /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
 /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};

void call_trace_sched_update_nr_running(struct rq *rq, int count)
{
	trace_sched_update_nr_running_tp(rq, count);
}

#ifdef CONFIG_SCHED_MM_CID

/*
 * @cid_lock: Guarantee forward-progress of cid allocation.
 *
 * Concurrency ID allocation within a bitmap is mostly lock-free. The spinlock
 * is only used when contention is detected by the lock-free allocation so
 * forward progress can be guaranteed.
 */
DEFINE_RAW_SPINLOCK(cid_lock);

/*
 * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock.
 *
 * When @use_cid_lock is 0, the cid allocation is lock-free. When contention is
 * detected, it is set to 1 to ensure that all newly coming allocations are
 * serialized by @cid_lock until the allocation which detected contention
 * completes and sets @use_cid_lock back to 0. This guarantees the progress
 * of a cid allocation.
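 *
 * (Illustrative sketch of this fallback pattern, not the actual allocator;
 * the two helpers named here are stand-ins for the real bitmap operations:
 *
 *	if (!READ_ONCE(use_cid_lock)) {
 *		cid = try_cid_alloc_lockfree(mm);
 *		if (cid >= 0)
 *			return cid;
 *		WRITE_ONCE(use_cid_lock, 1);	// contention detected
 *	}
 *	raw_spin_lock(&cid_lock);
 *	cid = cid_alloc_locked(mm);
 *	raw_spin_unlock(&cid_lock);
 *	WRITE_ONCE(use_cid_lock, 0);
 *	return cid;
 * )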
*/ */ int use_cid_lock; int use_cid_lock; /* /* * mm_cid remote-clear implements a lock-free algorithm to cl * mm_cid remote-clear implements a lock-free algorithm to cl * concurrently with respect to the execution of the source r * concurrently with respect to the execution of the source r * switch. * switch. * * * There is one basic properties we want to guarantee here: * There is one basic properties we want to guarantee here: * * * (1) Remote-clear should _never_ mark a per-cpu cid UNSET w * (1) Remote-clear should _never_ mark a per-cpu cid UNSET w * used by a task. That would lead to concurrent allocation o * used by a task. That would lead to concurrent allocation o * userspace corruption. * userspace corruption. * * * Provide this guarantee by introducing a Dekker memory orde * Provide this guarantee by introducing a Dekker memory orde * that a pair of loads observe at least one of a pair of sto * that a pair of loads observe at least one of a pair of sto * shown as: * shown as: * * * X = Y = 0 * X = Y = 0 * * * w[X]=1 w[Y]=1 * w[X]=1 w[Y]=1 * MB MB * MB MB * r[Y]=y r[X]=x * r[Y]=y r[X]=x * * * Which guarantees that x==0 && y==0 is impossible. But rath * Which guarantees that x==0 && y==0 is impossible. But rath * values 0 and 1, this algorithm cares about specific state * values 0 and 1, this algorithm cares about specific state * runqueue current task (as updated by the scheduler context * runqueue current task (as updated by the scheduler context * per-mm/cpu cid value. * per-mm/cpu cid value. * * * Let's introduce task (Y) which has task->mm == mm and task * Let's introduce task (Y) which has task->mm == mm and task * task->mm != mm for the rest of the discussion. There are t * task->mm != mm for the rest of the discussion. There are t * transitions on context switch we care about: * transitions on context switch we care about: * * * (TSA) Store to rq->curr with transition from (N) to (Y) * (TSA) Store to rq->curr with transition from (N) to (Y) * * * (TSB) Store to rq->curr with transition from (Y) to (N) * (TSB) Store to rq->curr with transition from (Y) to (N) * * * On the remote-clear side, there is one transition we care * On the remote-clear side, there is one transition we care * * * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag * * * There is also a transition to UNSET state which can be per * There is also a transition to UNSET state which can be per * sides (scheduler, remote-clear). It is always performed wi * sides (scheduler, remote-clear). It is always performed wi * guarantees that only a single thread will succeed: * guarantees that only a single thread will succeed: * * * (TMB) cmpxchg to *pcpu_cid to mark UNSET * (TMB) cmpxchg to *pcpu_cid to mark UNSET * * * Just to be clear, what we do _not_ want to happen is a tra * Just to be clear, what we do _not_ want to happen is a tra * when a thread is actively using the cid (property (1)). * when a thread is actively using the cid (property (1)). 
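 *
 * (Concretely for this algorithm, an illustrative mapping: the two stores
 * are the rq->curr transition (TSA/TSB) and the cmpxchg that sets the LAZY
 * flag (TMA); the two loads are the remote side's rcu_dereference(rq->curr)
 * and the scheduler side's READ_ONCE(*pcpu_cid). The barriers guarantee
 * that at least one side observes the other, as scenario A below works
 * through.)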
/*
 * mm_cid remote-clear implements a lock-free algorithm to clear per-mm/cpu cid
 * concurrently with respect to the execution of the source runqueue context
 * switch.
 *
 * There is one basic property we want to guarantee here:
 *
 * (1) Remote-clear should _never_ mark a per-cpu cid UNSET when it is actively
 * used by a task. That would lead to concurrent allocation of the cid and
 * userspace corruption.
 *
 * Provide this guarantee by introducing a Dekker memory ordering to guarantee
 * that a pair of loads observe at least one of a pair of stores, which can be
 * shown as:
 *
 *      X = Y = 0
 *
 *      w[X]=1          w[Y]=1
 *      MB              MB
 *      r[Y]=y          r[X]=x
 *
 * Which guarantees that x==0 && y==0 is impossible. But rather than using
 * values 0 and 1, this algorithm cares about specific state transitions of the
 * runqueue current task (as updated by the scheduler context switch), and the
 * per-mm/cpu cid value.
 *
 * Let's introduce task (Y) which has task->mm == mm and task (N) which has
 * task->mm != mm for the rest of the discussion. There are two scheduler state
 * transitions on context switch we care about:
 *
 * (TSA) Store to rq->curr with transition from (N) to (Y)
 *
 * (TSB) Store to rq->curr with transition from (Y) to (N)
 *
 * On the remote-clear side, there is one transition we care about:
 *
 * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag
 *
 * There is also a transition to UNSET state which can be performed from both
 * sides (scheduler, remote-clear). It is always performed with a cmpxchg which
 * guarantees that only a single thread will succeed:
 *
 * (TMB) cmpxchg to *pcpu_cid to mark UNSET
 *
 * Just to be clear, what we do _not_ want to happen is a transition to UNSET
 * when a thread is actively using the cid (property (1)).
 *
 * Let's look at the relevant combinations of TSA/TSB, and TMA transitions.
 *
 * Scenario A) (TSA)+(TMA) (from next task perspective)
 *
 * CPU0                                      CPU1
 *
 * Context switch CS-1                       Remote-clear
 *   - store to rq->curr: (N)->(Y) (TSA)     - cmpxchg to *pcpu_cid to LAZY (TMA)
 *                                             (implied barrier after cmpxchg)
 *   - switch_mm_cid()
 *     - memory barrier (see switch_mm_cid()
 *       comment explaining how this barrier
 *       is combined with other scheduler
 *       barriers)
 *     - mm_cid_get (next)
 *       - READ_ONCE(*pcpu_cid)              - rcu_dereference(src_rq->curr)
 *
 * This Dekker ensures that either task (Y) is observed by the
 * rcu_dereference() or the LAZY flag is observed by READ_ONCE(), or both are
 * observed.
 *
 * If task (Y) store is observed by rcu_dereference(), it means that there is
 * still an active task on the cpu. Remote-clear will therefore not transition
 * to UNSET, which fulfills property (1).
 *
 * If task (Y) is not observed, but the lazy flag is observed by READ_ONCE(),
 * it will move its state to UNSET, which clears the percpu cid perhaps
 * uselessly (which is not an issue for correctness). Because task (Y) is not
 * observed, CPU1 can move ahead to set the state to UNSET. Because moving
 * state to UNSET is done with a cmpxchg expecting that the old state has the
 * LAZY flag set, only one thread will successfully UNSET.
 *
 * If both states (LAZY flag and task (Y)) are observed, the scheduler side
 * will observe the LAZY flag and transition to UNSET (perhaps uselessly), and
 * CPU1 will observe task (Y) and do nothing more, which is fine.
 *
 * What we are effectively preventing with this Dekker is a scenario where
 * neither LAZY flag nor store (Y) are observed, which would fail property (1)
 * because this would UNSET a cid which is actively used.
 */
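
/*
 * Minimal sketch of the store-buffering (Dekker) pattern described above,
 * reduced to plain integer flags (the "example_" identifier is not part of
 * the mm_cid machinery). If two CPUs each run example_dekker_publish()
 * concurrently on a pair of flags that start at 0, with the arguments
 * swapped, the full barrier between the store and the load guarantees that
 * at least one of the two calls returns 1: both returning 0 is impossible.
 */
static inline int example_dekker_publish(int *mine, int *theirs)
{
	WRITE_ONCE(*mine, 1);		/* w[X]=1 (resp. w[Y]=1) */
	smp_mb();			/* MB */
	return READ_ONCE(*theirs);	/* r[Y]   (resp. r[X])   */
}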
void sched_mm_cid_migrate_from(struct task_struct *t)
{
	t->migrate_from_cpu = task_cpu(t);
}

static
int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq,
					  struct task_struct *t,
					  struct mm_cid *src_pcpu_cid)
{
	struct mm_struct *mm = t->mm;
	struct task_struct *src_task;
	int src_cid, last_mm_cid;

	if (!mm)
		return -1;

	last_mm_cid = t->last_mm_cid;
	/*
	 * If the migrated task has no last cid, or if the current
	 * task on src rq uses the cid, it means the source cid does not need
	 * to be moved to the destination cpu.
	 */
	if (last_mm_cid == -1)
		return -1;
	src_cid = READ_ONCE(src_pcpu_cid->cid);
	if (!mm_cid_is_valid(src_cid) || last_mm_cid != src_cid)
		return -1;

	/*
	 * If we observe an active task using the mm on this rq, it means we
	 * are not the last task to be migrated from this cpu for this mm, so
	 * there is no need to move src_cid to the destination cpu.
	 */
	rcu_read_lock();
	src_task = rcu_dereference(src_rq->curr);
	if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
		rcu_read_unlock();
		t->last_mm_cid = -1;
		return -1;
	}
	rcu_read_unlock();

	return src_cid;
}

static
int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq,
					      struct task_struct *t,
					      struct mm_cid *src_pcpu_cid,
					      int src_cid)
{
	struct task_struct *src_task;
	struct mm_struct *mm = t->mm;
	int lazy_cid;

	if (src_cid == -1)
		return -1;

	/*
	 * Attempt to clear the source cpu cid to move it to the destination
	 * cpu.
	 */
	lazy_cid = mm_cid_set_lazy_put(src_cid);
	if (!try_cmpxchg(&src_pcpu_cid->cid, &src_cid, lazy_cid))
		return -1;

	/*
	 * The implicit barrier after cmpxchg per-mm/cpu cid before loading
	 * rq->curr->mm matches the scheduler barrier in context_switch()
	 * between store to rq->curr and load of prev and next task's
	 * per-mm/cpu cid.
	 *
	 * The implicit barrier after cmpxchg per-mm/cpu cid before loading
	 * rq->curr->mm_cid_active matches the barrier in
	 * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve() and
	 * sched_mm_cid_after_execve() between store to t->mm_cid_active and
	 * load of per-mm/cpu cid.
	 */

	/*
	 * If we observe an active task using the mm on this rq after setting
	 * the lazy-put flag, this task will be responsible for transitioning
	 * from lazy-put flag set to MM_CID_UNSET.
	 */
	rcu_read_lock();
	src_task = rcu_dereference(src_rq->curr);
	if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
		rcu_read_unlock();
		/*
		 * We observed an active task for this mm, there is therefore
		 * no point in moving this cid to the destination cpu.
		 */
		t->last_mm_cid = -1;
		return -1;
	}
	rcu_read_unlock();

	/*
	 * The src_cid is unused, so it can be unset.
	 */
	if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
		return -1;
	return src_cid;
}
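
/*
 * Illustrative sketch of the two-step lazy-put protocol used by
 * __sched_mm_cid_migrate_from_try_steal_cid() above and by
 * sched_mm_cid_remote_clear() below. The EXAMPLE_* constants and the
 * "example_" identifier are hypothetical; the real encodings are the
 * MM_CID_* helpers used by those functions, and the real code checks
 * rq->curr in between the two steps and may skip step 2. The point shown
 * here: because step 2 is a cmpxchg that expects the lazy-tagged value,
 * at most one of the racing sides performs the final transition to UNSET.
 */
#define EXAMPLE_CID_LAZY	(1U << 31)
#define EXAMPLE_CID_UNSET	(~0U)

static inline bool example_lazy_put(unsigned int *slot, unsigned int cid)
{
	unsigned int lazy = cid | EXAMPLE_CID_LAZY;

	/* Step 1: publish the intent to clear; fails if the cid changed. */
	if (!try_cmpxchg(slot, &cid, lazy))
		return false;
	/* Step 2: only one racer can win this transition to UNSET. */
	return try_cmpxchg(slot, &lazy, EXAMPLE_CID_UNSET);
}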
/*
 * Migration to dst cpu. Called with dst_rq lock held.
 * Interrupts are disabled, which keeps the window of cid ownership without the
 * source rq lock held small.
 */
void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
{
	struct mm_cid *src_pcpu_cid, *dst_pcpu_cid;
	struct mm_struct *mm = t->mm;
	int src_cid, dst_cid, src_cpu;
	struct rq *src_rq;

	lockdep_assert_rq_held(dst_rq);

	if (!mm)
		return;
	src_cpu = t->migrate_from_cpu;
	if (src_cpu == -1) {
		t->last_mm_cid = -1;
		return;
	}
	/*
	 * Move the src cid if the dst cid is unset. This keeps id
	 * allocation closest to 0 in cases where few threads migrate around
	 * many cpus.
	 *
	 * If destination cid is already set, we may have to just clear
	 * the src cid to ensure compactness in frequent migrations
	 * scenarios.
	 *
	 * It is not useful to clear the src cid when the number of threads is
	 * greater or equal to the number of allowed cpus, because user-space
	 * can expect that the number of allowed cids can reach the number of
	 * allowed cpus.
	 */
	dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq));
	dst_cid = READ_ONCE(dst_pcpu_cid->cid);
	if (!mm_cid_is_unset(dst_cid) &&
	    atomic_read(&mm->mm_users) >= t->nr_cpus_allowed)
		return;
	src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu);
	src_rq = cpu_rq(src_cpu);
	src_cid = __sched_mm_cid_migrate_from_fetch_cid(src_rq, t, src_pcpu_cid);
	if (src_cid == -1)
		return;
	src_cid = __sched_mm_cid_migrate_from_try_steal_cid(src_rq, t, src_pcpu_cid,
							    src_cid);
	if (src_cid == -1)
		return;
	if (!mm_cid_is_unset(dst_cid)) {
		__mm_cid_put(mm, src_cid);
		return;
	}
	/* Move src_cid to dst cpu. */
	mm_cid_snapshot_time(dst_rq, mm);
	WRITE_ONCE(dst_pcpu_cid->cid, src_cid);
}
static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid,
				      int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct task_struct *t;
	unsigned long flags;
	int cid, lazy_cid;

	cid = READ_ONCE(pcpu_cid->cid);
	if (!mm_cid_is_valid(cid))
		return;

	/*
	 * Clear the cpu cid if it is set to keep cid allocation compact. If
	 * there happens to be other tasks left on the source cpu using this
	 * mm, the next task using this mm will reallocate its cid on context
	 * switch.
	 */
	lazy_cid = mm_cid_set_lazy_put(cid);
	if (!try_cmpxchg(&pcpu_cid->cid, &cid, lazy_cid))
		return;

	/*
	 * The implicit barrier after cmpxchg per-mm/cpu cid before loading
	 * rq->curr->mm matches the scheduler barrier in context_switch()
	 * between store to rq->curr and load of prev and next task's
	 * per-mm/cpu cid.
	 *
	 * The implicit barrier after cmpxchg per-mm/cpu cid before loading
	 * rq->curr->mm_cid_active matches the barrier in
	 * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve() and
	 * sched_mm_cid_after_execve() between store to t->mm_cid_active and
	 * load of per-mm/cpu cid.
	 */

	/*
	 * If we observe an active task using the mm on this rq after setting
	 * the lazy-put flag, that task will be responsible for transitioning
	 * from lazy-put flag set to MM_CID_UNSET.
	 */
	rcu_read_lock();
	t = rcu_dereference(rq->curr);
	if (READ_ONCE(t->mm_cid_active) && t->mm == mm) {
		rcu_read_unlock();
		return;
	}
	rcu_read_unlock();

	/*
	 * The cid is unused, so it can be unset.
	 * Disable interrupts to keep the window of cid ownership without rq
	 * lock small.
	 */
	local_irq_save(flags);
	if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
		__mm_cid_put(mm, cid);
	local_irq_restore(flags);
}
static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct mm_cid *pcpu_cid;
	struct task_struct *curr;
	u64 rq_clock;

	/*
	 * rq->clock load is racy on 32-bit but one spurious clear once in a
	 * while is irrelevant.
	 */
	rq_clock = READ_ONCE(rq->clock);
	pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);

	/*
	 * In order to take care of infrequently scheduled tasks, bump the time
	 * snapshot associated with this cid if an active task using the mm is
	 * observed on this rq.
	 */
	rcu_read_lock();
	curr = rcu_dereference(rq->curr);
	if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
		WRITE_ONCE(pcpu_cid->time, rq_clock);
		rcu_read_unlock();
		return;
	}
	rcu_read_unlock();

	if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS)
		return;
	sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
}

static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
					     int weight)
{
	struct mm_cid *pcpu_cid;
	int cid;

	pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
	cid = READ_ONCE(pcpu_cid->cid);
	if (!mm_cid_is_valid(cid) || cid < weight)
		return;
	sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
}
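
/*
 * Worked example for the weight-based compaction performed by
 * sched_mm_cid_remote_clear_weight() above and driven by task_mm_cid_work()
 * below (illustrative only; the "example_" identifier is not part of the
 * kernel): if the mm's cidmask weight is currently 3, only cids 0..2 are
 * needed going forward, so a per-cpu slot still caching cid 5 from an
 * earlier, busier period is eligible for clearing, while cid 1 is left alone.
 */
static inline bool example_cid_needs_compaction(int cid, int weight)
{
	return cid >= weight;	/* mirrors the "cid < weight" early return above */
}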
static void task_mm_cid_work(struct callback_head *work)
{
	unsigned long now = jiffies, old_scan, next_scan;
	struct task_struct *t = current;
	struct cpumask *cidmask;
	struct mm_struct *mm;
	int weight, cpu;

	SCHED_WARN_ON(t != container_of(work, struct task_struct, cid_work));

	work->next = work;	/* Prevent double-add */
	if (t->flags & PF_EXITING)
		return;
	mm = t->mm;
	if (!mm)
		return;
	old_scan = READ_ONCE(mm->mm_cid_next_scan);
	next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
	if (!old_scan) {
		unsigned long res;

		res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan);
		if (res != old_scan)
			old_scan = res;
		else
			old_scan = next_scan;
	}
	if (time_before(now, old_scan))
		return;
	if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan))
		return;
	cidmask = mm_cidmask(mm);
	/* Clear cids that were not recently used. */
	for_each_possible_cpu(cpu)
		sched_mm_cid_remote_clear_old(mm, cpu);
	weight = cpumask_weight(cidmask);
	/*
	 * Clear cids that are greater or equal to the cidmask weight to
	 * recompact it.
	 */
	for_each_possible_cpu(cpu)
		sched_mm_cid_remote_clear_weight(mm, cpu, weight);
}

void init_sched_mm_cid(struct task_struct *t)
{
	struct mm_struct *mm = t->mm;
	int mm_users = 0;

	if (mm) {
		mm_users = atomic_read(&mm->mm_users);
		if (mm_users == 1)
			mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY);
	}
	t->cid_work.next = &t->cid_work;	/* Protect against double add */
	init_task_work(&t->cid_work, task_mm_cid_work);
}

void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
{
	struct callback_head *work = &curr->cid_work;
	unsigned long now = jiffies;

	if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) ||
	    work->next != work)
		return;
	if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
		return;
	task_work_add(curr, work, TWA_RESUME);
}
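
/*
 * Sketch of the deferred-work pattern used by init_sched_mm_cid(),
 * task_tick_mm_cid() and task_mm_cid_work() above (illustrative only; the
 * "example_" identifiers are not part of the kernel). A callback_head
 * embedded in the task is marked idle by pointing its ->next field at
 * itself; the tick queues it with task_work_add(TWA_RESUME) so the callback
 * runs in task context on the way back to userspace, and the callback
 * re-marks the head idle before doing its periodic work.
 */
static void example_task_work_fn(struct callback_head *work)
{
	work->next = work;	/* Mark idle again so the tick can re-queue it. */
	/* ... periodic, task-context work would go here ... */
}

static inline void example_init_task_work(struct callback_head *work)
{
	work->next = work;			/* Idle: protects against double add. */
	init_task_work(work, example_task_work_fn);
}

static inline void example_queue_from_tick(struct task_struct *curr,
					   struct callback_head *work)
{
	if (work->next != work)
		return;				/* Already queued. */
	task_work_add(curr, work, TWA_RESUME);	/* Runs on return to userspace. */
}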
void sched_mm_cid_exit_signals(struct task_struct *t)
{
	struct mm_struct *mm = t->mm;
	struct rq_flags rf;
	struct rq *rq;

	if (!mm)
		return;

	preempt_disable();
	rq = this_rq();
	rq_lock_irqsave(rq, &rf);
	preempt_enable_no_resched();	/* holding spinlock */
	WRITE_ONCE(t->mm_cid_active, 0);
	/*
	 * Store t->mm_cid_active before loading per-mm/cpu cid.
	 * Matches barrier in sched_mm_cid_remote_clear_old().
	 */
	smp_mb();
	mm_cid_put(mm);
	t->last_mm_cid = t->mm_cid = -1;
	rq_unlock_irqrestore(rq, &rf);
}

void sched_mm_cid_before_execve(struct task_struct *t)
{
	struct mm_struct *mm = t->mm;
	struct rq_flags rf;
	struct rq *rq;

	if (!mm)
		return;

	preempt_disable();
	rq = this_rq();
	rq_lock_irqsave(rq, &rf);
	preempt_enable_no_resched();	/* holding spinlock */
	WRITE_ONCE(t->mm_cid_active, 0);
	/*
	 * Store t->mm_cid_active before loading per-mm/cpu cid.
	 * Matches barrier in sched_mm_cid_remote_clear_old().
	 */
	smp_mb();
	mm_cid_put(mm);
	t->last_mm_cid = t->mm_cid = -1;
	rq_unlock_irqrestore(rq, &rf);
}

void sched_mm_cid_after_execve(struct task_struct *t)
{
	struct mm_struct *mm = t->mm;
	struct rq_flags rf;
	struct rq *rq;

	if (!mm)
		return;

	preempt_disable();
	rq = this_rq();
	rq_lock_irqsave(rq, &rf);
	preempt_enable_no_resched();	/* holding spinlock */
	WRITE_ONCE(t->mm_cid_active, 1);
	/*
	 * Store t->mm_cid_active before loading per-mm/cpu cid.
	 * Matches barrier in sched_mm_cid_remote_clear_old().
	 */
	smp_mb();
	t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm);
	rq_unlock_irqrestore(rq, &rf);
	rseq_set_notify_resume(t);
}

void sched_mm_cid_fork(struct task_struct *t)
{
	WARN_ON_ONCE(!t->mm || t->mm_cid != -1);
	t->mm_cid_active = 1;
}
#endif /* CONFIG_SCHED_MM_CID */