// SPDX-License-Identifier: GPL-2.0-only
/*
 *  kernel/sched/core.c
 *
 *  Core kernel scheduler code and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 */

/* [<linux/...> system header #includes: names missing from this dump] */

#ifdef CONFIG_PREEMPT_DYNAMIC
# ifdef CONFIG_GENERIC_ENTRY
#  include <linux/entry-common.h>
# endif
#endif

/* [<uapi/...> and <asm/...> header #includes: names missing from this dump] */

#define CREATE_TRACE_POINTS
#include <linux/sched/rseq_api.h>
#include <trace/events/sched.h>
#include <trace/events/ipi.h>
#undef CREATE_TRACE_POINTS

#include "sched.h"
#include "stats.h"
#include "autogroup.h"

#include "autogroup.h"
#include "pelt.h"
#include "smp.h"
#include "stats.h"

#include "../workqueue_internal.h"
#include "../../io_uring/io-wq.h"

#include "../smpboot.h"

EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu);
EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask);

/*
 * Export tracepoints that act as a bare tracehook (ie: have no trace event
 * associated with them) to allow external modules to probe them.
 */
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);

DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

#ifdef CONFIG_SCHED_DEBUG
/*
 * Debugging: various feature bits
 *
 * If SCHED_DEBUG is disabled, each compilation unit has its own copy of
 * sysctl_sched_features, defined in sched.h, to allow constants propagation
 * at compile time and compiler optimization based on features default.
 */
#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |
const_debug unsigned int sysctl_sched_features =
#include "features.h"
	0;
#undef SCHED_FEAT
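/*
 * Illustration (not part of the original source): with features.h containing
 * e.g. SCHED_FEAT(PLACE_LAG, true) and SCHED_FEAT(HZ_BW, true), the construct
 * above expands roughly to:
 *
 *	const_debug unsigned int sysctl_sched_features =
 *		(1UL << __SCHED_FEAT_PLACE_LAG) * true |
 *		(1UL << __SCHED_FEAT_HZ_BW) * true |
 *		0;
 *
 * i.e. each SCHED_FEAT() line contributes one bit to the mask, and the
 * trailing 0 terminates the '|' chain left open by the last expansion.
 */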
/*
 * Print a warning if need_resched is set for the given duration (if
 * LATENCY_WARN is enabled).
 *
 * If sysctl_resched_latency_warn_once is set, only one warning will be shown
 * per boot.
 */
__read_mostly int sysctl_resched_latency_warn_ms = 100;
__read_mostly int sysctl_resched_latency_warn_once = 1;
#endif /* CONFIG_SCHED_DEBUG */

/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
const_debug unsigned int sysctl_sched_nr_migrate = SCHED_NR_MIGRATE_BREAK;

__read_mostly int scheduler_running;

#ifdef CONFIG_SCHED_CORE

DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);

/* kernel prio, less is more */
static inline int __task_prio(const struct task_struct *p)
{
	if (p->sched_class == &stop_sched_class) /* trumps deadline */
		return -2;

	if (rt_prio(p->prio)) /* includes deadline */
		return p->prio; /* [-1, 99] */

	if (p->sched_class == &idle_sched_class)
		return MAX_RT_PRIO + NICE_WIDTH; /* 140 */

	return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */
}

/*
 * l(a,b)
 * le(a,b) := !l(b,a)
 * g(a,b)  := l(b,a)
 * ge(a,b) := !l(a,b)
 */
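/*
 * Illustration (not part of the original source): the resulting kernel-prio
 * scale, where a smaller number means a more important task:
 *
 *	stop_sched_class		-> -2
 *	deadline tasks			-> -1		(p->prio, via rt_prio())
 *	RT tasks			-> 0..99	(p->prio)
 *	fair (SCHED_OTHER/BATCH)	-> MAX_RT_PRIO + MAX_NICE
 *	idle_sched_class		-> MAX_RT_PRIO + NICE_WIDTH (140)
 *
 * prio_less() below compares the *negated* values, so this "less is more"
 * scale turns back into a conventional "higher priority wins" comparison.
 */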
/* real prio, less is less */
static inline bool prio_less(const struct task_struct *a,
			     const struct task_struct *b, bool in_fi)
{
	int pa = __task_prio(a), pb = __task_prio(b);

	if (-pa < -pb)
		return true;

	if (-pb < -pa)
		return false;

	if (pa == -1) /* dl_prio() doesn't work because of stop_class above */
		return !dl_time_before(a->dl.deadline, b->dl.deadline);

	if (pa == MAX_RT_PRIO + MAX_NICE)	/* fair */
		return cfs_prio_less(a, b, in_fi);

	return false;
}

static inline bool __sched_core_less(const struct task_struct *a,
				     const struct task_struct *b)
{
	if (a->core_cookie < b->core_cookie)
		return true;

	if (a->core_cookie > b->core_cookie)
		return false;

	/* flip prio, so high prio is leftmost */
	if (prio_less(b, a, !!task_rq(a)->core->core_forceidle_count))
		return true;

	return false;
}

#define __node_2_sc(node) rb_entry((node), struct task_struct, core_node)

static inline bool rb_sched_core_less(struct rb_node *a, const struct rb_node *b)
{
	return __sched_core_less(__node_2_sc(a), __node_2_sc(b));
}

static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node)
{
	const struct task_struct *p = __node_2_sc(node);
	unsigned long cookie = (unsigned long)key;

	if (cookie < p->core_cookie)
		return -1;

	if (cookie > p->core_cookie)
		return 1;

	return 0;
}

void sched_core_enqueue(struct rq *rq, struct task_struct *p)
{
	rq->core->core_task_seq++;

	if (!p->core_cookie)
		return;

	rb_add(&p->core_node, &rq->core_tree, rb_sched_core_less);
}

void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
{
	rq->core->core_task_seq++;

	if (sched_core_enqueued(p)) {
		rb_erase(&p->core_node, &rq->core_tree);
		RB_CLEAR_NODE(&p->core_node);
	}

	/*
	 * Migrating the last task off the cpu, with the cpu in forced idle
	 * state. Reschedule to create an accounting edge for forced idle,
	 * and re-examine whether the core is still in forced idle state.
	 */
	if (!(flags & DEQUEUE_SAVE) && rq->nr_running == 1 &&
	    rq->core->core_forceidle_count && rq->curr == rq->idle)
		resched_curr(rq);
}
static int sched_task_is_throttled(struct task_struct *p, int cpu)
{
	if (p->sched_class->task_is_throttled)
		return p->sched_class->task_is_throttled(p, cpu);

	return 0;
}

static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie)
{
	struct rb_node *node = &p->core_node;
	int cpu = task_cpu(p);

	do {
		node = rb_next(node);
		if (!node)
			return NULL;

		p = __node_2_sc(node);
		if (p->core_cookie != cookie)
			return NULL;

	} while (sched_task_is_throttled(p, cpu));

	return p;
}

/*
 * Find left-most (aka, highest priority) and unthrottled task matching @cookie.
 * If no suitable task is found, NULL will be returned.
 */
static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
{
	struct task_struct *p;
	struct rb_node *node;

	node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
	if (!node)
		return NULL;

	p = __node_2_sc(node);
	if (!sched_task_is_throttled(p, rq->cpu))
		return p;

	return sched_core_next(p, cookie);
}
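/*
 * Usage sketch (illustrative only, not part of the original source): during
 * core-wide task selection each SMT sibling's runqueue is asked for the best
 * runnable task carrying the cookie chosen for the core, e.g.:
 *
 *	struct task_struct *p = sched_core_find(rq, cookie);
 *
 * which yields the left-most (highest priority, per __sched_core_less())
 * unthrottled task with ->core_cookie == cookie, or NULL if none exists.
 */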
/*
 * Magic required such that:
 *
 *	raw_spin_rq_lock(rq);
 *	...
 *	raw_spin_rq_unlock(rq);
 *
 * ends up locking and unlocking the _same_ lock, and all CPUs
 * always agree on what rq has what lock.
 *
 * XXX entirely possible to selectively enable cores, don't bother for now.
 */

static DEFINE_MUTEX(sched_core_mutex);
static atomic_t sched_core_count;
static struct cpumask sched_core_mask;

static void sched_core_lock(int cpu, unsigned long *flags)
{
	const struct cpumask *smt_mask = cpu_smt_mask(cpu);
	int t, i = 0;

	local_irq_save(*flags);
	for_each_cpu(t, smt_mask)
		raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
}

static void sched_core_unlock(int cpu, unsigned long *flags)
{
	const struct cpumask *smt_mask = cpu_smt_mask(cpu);
	int t;

	for_each_cpu(t, smt_mask)
		raw_spin_unlock(&cpu_rq(t)->__lock);
	local_irq_restore(*flags);
}

static void __sched_core_flip(bool enabled)
{
	unsigned long flags;
	int cpu, t;

	cpus_read_lock();

	/*
	 * Toggle the online cores, one by one.
	 */
	cpumask_copy(&sched_core_mask, cpu_online_mask);
	for_each_cpu(cpu, &sched_core_mask) {
		const struct cpumask *smt_mask = cpu_smt_mask(cpu);

		sched_core_lock(cpu, &flags);

		for_each_cpu(t, smt_mask)
			cpu_rq(t)->core_enabled = enabled;

		cpu_rq(cpu)->core->core_forceidle_start = 0;

		sched_core_unlock(cpu, &flags);

		cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
	}

	/*
	 * Toggle the offline CPUs.
	 */
	for_each_cpu_andnot(cpu, cpu_possible_mask, cpu_online_mask)
		cpu_rq(cpu)->core_enabled = enabled;

	cpus_read_unlock();
}

static void sched_core_assert_empty(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		WARN_ON_ONCE(!RB_EMPTY_ROOT(&cpu_rq(cpu)->core_tree));
}

static void __sched_core_enable(void)
{
	static_branch_enable(&__sched_core_enabled);
	/*
	 * Ensure all previous instances of raw_spin_rq_*lock() have finished
	 * and future ones will observe !sched_core_disabled().
	 */
	synchronize_rcu();
	__sched_core_flip(true);
	sched_core_assert_empty();
}

static void __sched_core_disable(void)
{
	sched_core_assert_empty();
	__sched_core_flip(false);
	static_branch_disable(&__sched_core_enabled);
}

void sched_core_get(void)
{
	if (atomic_inc_not_zero(&sched_core_count))
		return;

	mutex_lock(&sched_core_mutex);
	if (!atomic_read(&sched_core_count))
		__sched_core_enable();

	smp_mb__before_atomic();
	atomic_inc(&sched_core_count);
	mutex_unlock(&sched_core_mutex);
}

static void __sched_core_put(struct work_struct *work)
{
	if (atomic_dec_and_mutex_lock(&sched_core_count, &sched_core_mutex)) {
		__sched_core_disable();
		mutex_unlock(&sched_core_mutex);
	}
}

void sched_core_put(void)
{
	static DECLARE_WORK(_work, __sched_core_put);

	/*
	 * "There can be only one"
	 *
	 * Either this is the last one, or we don't actually need to do any
	 * 'work'. If it is the last *again*, we rely on
	 * WORK_STRUCT_PENDING_BIT.
	 */
	if (!atomic_add_unless(&sched_core_count, -1, 1))
		schedule_work(&_work);
}
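/*
 * Usage sketch (illustrative only, not part of the original source): users of
 * core scheduling, e.g. the prctl(PR_SCHED_CORE) cookie code, pair these as a
 * reference count on the core-scheduling machinery:
 *
 *	sched_core_get();	// first user flips the static key on
 *	...			// tasks now carry non-zero core cookies
 *	sched_core_put();	// last user queues the disable via workqueue
 */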
#else /* !CONFIG_SCHED_CORE */

static inline void sched_core_enqueue(struct rq *rq, struct task_struct *p) { }
static inline void
sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) { }

#endif /* CONFIG_SCHED_CORE */

/*
 * Serialization rules:
 *
 * Lock order:
 *
 *   p->pi_lock
 *     rq->lock
 *       hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
 *
 *  rq1->lock
 *    rq2->lock  where: rq1 < rq2
 *
 * Regular state:
 *
 * Normal scheduling state is serialized by rq->lock. __schedule() takes the
 * local CPU's rq->lock, it optionally removes the task from the runqueue and
 * always looks at the local rq data structures to find the most eligible task
 * to run next.
 *
 * Task enqueue is also under rq->lock, possibly taken from another CPU.
 * Wakeups from another LLC domain might use an IPI to transfer the enqueue to
 * the local CPU to avoid bouncing the runqueue state around [ see
 * ttwu_queue_wakelist() ]
 *
 * Task wakeup, specifically wakeups that involve migration, are horribly
 * complicated to avoid having to take two rq->locks.
 *
 * Special state:
 *
 * System-calls and anything external will use task_rq_lock() which acquires
 * both p->pi_lock and rq->lock. As a consequence the state they change is
 * stable while holding either lock:
 *
 *  - sched_setaffinity()/
 *    set_cpus_allowed_ptr():	p->cpus_ptr, p->nr_cpus_allowed
 *  - set_user_nice():		p->se.load, p->*prio
 *  - __sched_setscheduler():	p->sched_class, p->policy, p->*prio,
 *				p->se.load, p->rt_priority,
 *				p->dl.dl_{runtime, deadline, period, flags, bw, density}
 *  - sched_setnuma():		p->numa_preferred_nid
 *  - sched_move_task():	p->sched_task_group
 *  - uclamp_update_active()	p->uclamp*
 *
 * p->state <- TASK_*:
 *
 *   is changed locklessly using set_current_state(), __set_current_state() or
 *   set_special_state(), see their respective comments, or by
 *   try_to_wake_up(). This latter uses p->pi_lock to serialize against
 *   concurrent self.
 *
 * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
 *
 *   is set by activate_task() and cleared by deactivate_task(), under
 *   rq->lock. Non-zero indicates the task is runnable, the special
 *   ON_RQ_MIGRATING state is used for migration without holding both
 *   rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
 *
 * p->on_cpu <- { 0, 1 }:
 *
 *   is set by prepare_task() and cleared by finish_task() such that it will be
 *   set before p is scheduled-in and cleared after p is scheduled-out, both
 *   under rq->lock. Non-zero indicates the task is running on its CPU.
 *
 *   [ The astute reader will observe that it is possible for two tasks on one
 *     CPU to have ->on_cpu = 1 at the same time. ]
 *
 * task_cpu(p): is changed by set_task_cpu(), the rules are:
 *
 *  - Don't call set_task_cpu() on a blocked task:
 *
 *    We don't care what CPU we're not running on, this simplifies hotplug,
 *    the CPU assignment of blocked tasks isn't required to be valid.
 *
 *  - for try_to_wake_up(), called under p->pi_lock:
 *
 *    This allows try_to_wake_up() to only take one rq->lock, see its comment.
 *
 *  - for migration called under rq->lock:
 *    [ see task_on_rq_migrating() in task_rq_lock() ]
 *
 *    o move_queued_task()
 *    o detach_task()
 *
 *  - for migration called under double_rq_lock():
 *
 *    o __migrate_swap_task()
 *    o push_rt_task() / pull_rt_task()
 *    o push_dl_task() / pull_dl_task()
 *    o dl_task_offline_migration()
 *
 */
void raw_spin_rq_lock_nested(struct rq *rq, int subclass)
{
	raw_spinlock_t *lock;

	/* Matches synchronize_rcu() in __sched_core_enable() */
	preempt_disable();
	if (sched_core_disabled()) {
		raw_spin_lock_nested(&rq->__lock, subclass);
		/* preempt_count *MUST* be > 1 */
		preempt_enable_no_resched();
		return;
	}

	for (;;) {
		lock = __rq_lockp(rq);
		raw_spin_lock_nested(lock, subclass);
		if (likely(lock == __rq_lockp(rq))) {
			/* preempt_count *MUST* be > 1 */
			preempt_enable_no_resched();
			return;
		}
		raw_spin_unlock(lock);
	}
}

bool raw_spin_rq_trylock(struct rq *rq)
{
	raw_spinlock_t *lock;
	bool ret;

	/* Matches synchronize_rcu() in __sched_core_enable() */
	preempt_disable();
	if (sched_core_disabled()) {
		ret = raw_spin_trylock(&rq->__lock);
		preempt_enable();
		return ret;
	}

	for (;;) {
		lock = __rq_lockp(rq);
		ret = raw_spin_trylock(lock);
		if (!ret || (likely(lock == __rq_lockp(rq)))) {
			preempt_enable();
			return ret;
		}
		raw_spin_unlock(lock);
	}
}

void raw_spin_rq_unlock(struct rq *rq)
{
	raw_spin_unlock(rq_lockp(rq));
}

#ifdef CONFIG_SMP
/*
 * double_rq_lock - safely lock two runqueues
 */
void double_rq_lock(struct rq *rq1, struct rq *rq2)
{
	lockdep_assert_irqs_disabled();

	if (rq_order_less(rq2, rq1))
		swap(rq1, rq2);

	raw_spin_rq_lock(rq1);
	if (__rq_lockp(rq1) != __rq_lockp(rq2))
		raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);

	double_rq_clock_clear_update(rq1, rq2);
}
#endif

/*
 * __task_rq_lock - lock the rq @p resides on.
 */
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
	__acquires(rq->lock)
{
	struct rq *rq;

	lockdep_assert_held(&p->pi_lock);

	for (;;) {
		rq = task_rq(p);
		raw_spin_rq_lock(rq);
		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
			rq_pin_lock(rq, rf);
			return rq;
		}
		raw_spin_rq_unlock(rq);

		while (unlikely(task_on_rq_migrating(p)))
			cpu_relax();
	}
}

/*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 */
struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
	__acquires(p->pi_lock)
	__acquires(rq->lock)
{
	struct rq *rq;

	for (;;) {
		raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
		rq = task_rq(p);
		raw_spin_rq_lock(rq);
		/*
		 *	move_queued_task()		task_rq_lock()
		 *
		 *	ACQUIRE (rq->lock)
		 *	[S] ->on_rq = MIGRATING		[L] rq = task_rq()
		 *	WMB (__set_task_cpu())		ACQUIRE (rq->lock);
		 *	[S] ->cpu = new_cpu		[L] task_rq()
		 *					[L] ->on_rq
		 *	RELEASE (rq->lock)
		 *
		 * If we observe the old CPU in task_rq_lock(), the acquire of
		 * the old rq->lock will fully serialize against the stores.
		 *
		 * If we observe the new CPU in task_rq_lock(), the address
		 * dependency headed by '[L] rq = task_rq()' and the acquire
		 * will pair with the WMB to ensure we then also see migrating.
		 */
		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
			rq_pin_lock(rq, rf);
			return rq;
		}
		raw_spin_rq_unlock(rq);
		raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);

		while (unlikely(task_on_rq_migrating(p)))
			cpu_relax();
	}
}
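/*
 * Usage sketch (illustrative only, not part of the original source): external
 * callers typically bracket state changes with task_rq_lock()/task_rq_unlock()
 * so that both p->pi_lock and the (possibly changing) rq->lock are held:
 *
 *	struct rq_flags rf;
 *	struct rq *rq;
 *
 *	rq = task_rq_lock(p, &rf);
 *	update_rq_clock(rq);
 *	...			// p's scheduling state is stable here
 *	task_rq_unlock(rq, p, &rf);
 */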
/*
 * RQ-clock updating methods:
 */

static void update_rq_clock_task(struct rq *rq, s64 delta)
{
/*
 * In theory, the compile should just see 0 here, and optimize out the call
 * to sched_rt_avg_update. But I don't trust it...
 */
	s64 __maybe_unused steal = 0, irq_delta = 0;

#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

	/*
	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
	 * this case when a previous update_rq_clock() happened inside a
	 * {soft,}irq region.
	 *
	 * When this happens, we stop ->clock_task and only update the
	 * prev_irq_time stamp to account for the part that fit, so that a next
	 * update will consume the rest. This ensures ->clock_task is
	 * monotonic.
	 *
	 * It does however cause some slight miss-attribution of {soft,}irq
	 * time, a more accurate solution would be to update the irq_time using
	 * the current rq->clock timestamp, except that would require using
	 * atomic ops.
	 */
	if (irq_delta > delta)
		irq_delta = delta;

	rq->prev_irq_time += irq_delta;
	delta -= irq_delta;
	psi_account_irqtime(rq->curr, irq_delta);
	delayacct_irq(rq->curr, irq_delta);
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
		steal = paravirt_steal_clock(cpu_of(rq));
		steal -= rq->prev_steal_time_rq;

		if (unlikely(steal > delta))
			steal = delta;

		rq->prev_steal_time_rq += steal;
		delta -= steal;
	}
#endif

	rq->clock_task += delta;

#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
		update_irq_load_avg(rq, irq_delta + steal);
#endif
	update_rq_clock_pelt(rq, delta);
}

void update_rq_clock(struct rq *rq)
{
	s64 delta;

	lockdep_assert_rq_held(rq);

	if (rq->clock_update_flags & RQCF_ACT_SKIP)
		return;

#ifdef CONFIG_SCHED_DEBUG
	if (sched_feat(WARN_DOUBLE_CLOCK))
		SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
	rq->clock_update_flags |= RQCF_UPDATED;
#endif

	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
	if (delta < 0)
		return;
	rq->clock += delta;
	update_rq_clock_task(rq, delta);
}

#ifdef CONFIG_SCHED_HRTICK
/*
 * Use HR-timers to deliver accurate preemption points.
 */

static void hrtick_clear(struct rq *rq)
{
	if (hrtimer_active(&rq->hrtick_timer))
		hrtimer_cancel(&rq->hrtick_timer);
}

/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 */
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
	struct rq *rq = container_of(timer, struct rq, hrtick_timer);
	struct rq_flags rf;

	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

	rq_lock(rq, &rf);
	update_rq_clock(rq);
	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
	rq_unlock(rq, &rf);

	return HRTIMER_NORESTART;
}

#ifdef CONFIG_SMP

static void __hrtick_restart(struct rq *rq)
{
	struct hrtimer *timer = &rq->hrtick_timer;
	ktime_t time = rq->hrtick_time;

	hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
}

/*
 * called from hardirq (IPI) context
 */
static void __hrtick_start(void *arg)
{
	struct rq *rq = arg;
	struct rq_flags rf;

	rq_lock(rq, &rf);
	__hrtick_restart(rq);
	rq_unlock(rq, &rf);
}

/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	struct hrtimer *timer = &rq->hrtick_timer;
	s64 delta;

	/*
	 * Don't schedule slices shorter than 10000ns, that just
	 * doesn't make sense and can cause timer DoS.
	 */
	delta = max_t(s64, delay, 10000LL);
	rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);

	if (rq == this_rq())
		__hrtick_restart(rq);
	else
		smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
}

#else
/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	/*
	 * Don't schedule slices shorter than 10000ns, that just
	 * doesn't make sense. Rely on vruntime for fairness.
	 */
	delay = max_t(u64, delay, 10000LL);
	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
		      HRTIMER_MODE_REL_PINNED_HARD);
}

#endif /* CONFIG_SMP */

static void hrtick_rq_init(struct rq *rq)
{
#ifdef CONFIG_SMP
	INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
#endif
	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
	rq->hrtick_timer.function = hrtick;
}
#else	/* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
}

static inline void hrtick_rq_init(struct rq *rq)
{
}
#endif	/* CONFIG_SCHED_HRTICK */

/*
 * cmpxchg based fetch_or, macro so it works for different integer types
 */
#define fetch_or(ptr, mask)						\
	({								\
		typeof(ptr) _ptr = (ptr);				\
		typeof(mask) _mask = (mask);				\
		typeof(*_ptr) _val = *_ptr;				\
									\
		do {							\
		} while (!try_cmpxchg(_ptr, &_val, _val | _mask));	\
	_val;								\
})

#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
/*
 * Atomically set TIF_NEED_RESCHED[_LAZY] and test for TIF_POLLING_NRFLAG,
 * this avoids any races wrt polling state changes and thereby avoids
 * spurious IPIs.
 */
static inline bool set_nr_and_not_polling(struct task_struct *p, int tif_bit)
{
	struct thread_info *ti = task_thread_info(p);

	return !(fetch_or(&ti->flags, 1 << tif_bit) & _TIF_POLLING_NRFLAG);
}
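/*
 * Illustration (not part of the original source): fetch_or() behaves like an
 * atomic "return the old value, then OR in the mask" on the thread flags, so
 * the helper above both sets the resched bit and learns whether the remote
 * CPU was polling in a single atomic operation:
 *
 *	old = fetch_or(&ti->flags, 1 << tif_bit);
 *	if (old & _TIF_POLLING_NRFLAG)
 *		// the polling idle loop will notice the new bit, no IPI needed
 */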
/*
 * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
 *
 * If this returns true, then the idle task promises to call
 * sched_ttwu_pending() and reschedule soon.
 */
static bool set_nr_if_polling(struct task_struct *p)
{
	struct thread_info *ti = task_thread_info(p);
	typeof(ti->flags) val = READ_ONCE(ti->flags);

	for (;;) {
		if (!(val & _TIF_POLLING_NRFLAG))
			return false;
		if (val & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
			return true;
		if (try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED))
			break;
	}
	return true;
}

#else
static inline bool set_nr_and_not_polling(struct task_struct *p, int tif_bit)
{
	set_tsk_thread_flag(p, tif_bit);
	return true;
}

#ifdef CONFIG_SMP
static inline bool set_nr_if_polling(struct task_struct *p)
{
	return false;
}
#endif
#endif

static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
	struct wake_q_node *node = &task->wake_q;

	/*
	 * Atomically grab the task, if ->wake_q is !nil already it means
	 * it's already queued (either by us or someone else) and will get the
	 * wakeup due to that.
	 *
	 * In order to ensure that a pending wakeup will observe our pending
	 * state, even in the failed case, an explicit smp_mb() must be used.
	 */
	smp_mb__before_atomic();
	if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL)))
		return false;

	/*
	 * The head is context local, there can be no concurrency.
	 */
	*head->lastp = node;
	head->lastp = &node->next;
	return true;
}

/**
 * wake_q_add() - queue a wakeup for 'later' waking.
 * @head: the wake_q_head to add @task to
 * @task: the task to queue for 'later' wakeup
 *
 * Queue a task for later wakeup, most likely by the wake_up_q() call in the
 * same context, _HOWEVER_ this is not guaranteed, the wakeup may come
 * instantly.
 *
 * This function must be used as-if it were wake_up_process(); IOW the task
 * must be ready to be woken at this location.
 */
void wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
	if (__wake_q_add(head, task))
		get_task_struct(task);
}

/**
 * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
 * @head: the wake_q_head to add @task to
 * @task: the task to queue for 'later' wakeup
 *
 * Queue a task for later wakeup, most likely by the wake_up_q() call in the
 * same context, _HOWEVER_ this is not guaranteed, the wakeup may come
 * instantly.
 *
 * This function must be used as-if it were wake_up_process(); IOW the task
 * must be ready to be woken at this location.
 *
 * This function is essentially a task-safe equivalent to wake_q_add(). Callers
 * that already hold reference to @task can call the 'safe' version and trust
 * wake_q to do the right thing depending whether or not the @task is already
 * queued for wakeup.
 */
void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
{
	if (!__wake_q_add(head, task))
		put_task_struct(task);
}

void wake_up_q(struct wake_q_head *head)
{
	struct wake_q_node *node = head->first;

	while (node != WAKE_Q_TAIL) {
		struct task_struct *task;

		task = container_of(node, struct task_struct, wake_q);
		/* Task can safely be re-inserted now: */
		node = node->next;
		task->wake_q.next = NULL;

		/*
		 * wake_up_process() executes a full barrier, which pairs with
		 * the queueing in wake_q_add() so as not to miss wakeups.
		 */
		wake_up_process(task);
		put_task_struct(task);
	}
}
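/*
 * Usage sketch (illustrative only, not part of the original source): callers
 * batch wakeups while holding a lock and issue them after dropping it:
 *
 *	DEFINE_WAKE_Q(wake_q);
 *
 *	raw_spin_lock(&some_lock);
 *	list_for_each_entry(waiter, &waiters, list)
 *		wake_q_add(&wake_q, waiter->task);
 *	raw_spin_unlock(&some_lock);
 *
 *	wake_up_q(&wake_q);
 *
 * 'some_lock', 'waiters' and 'waiter->task' are made-up names; the pattern is
 * what matters: wake_q_add() takes a task reference, wake_up_q() drops it.
 */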
/*
 * resched_curr - mark rq's current task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */
static void __resched_curr(struct rq *rq, int lazy)
{
	int cpu, tif_bit = TIF_NEED_RESCHED + lazy;
	struct task_struct *curr = rq->curr;

	lockdep_assert_rq_held(rq);

	if (unlikely(test_tsk_thread_flag(curr, tif_bit)))
		return;

	cpu = cpu_of(rq);

	if (cpu == smp_processor_id()) {
		set_tsk_thread_flag(curr, tif_bit);
		if (!lazy)
			set_preempt_need_resched();
		return;
	}

	if (set_nr_and_not_polling(curr, tif_bit)) {
		if (!lazy)
			smp_send_reschedule(cpu);
	} else {
		trace_sched_wake_idle_without_ipi(cpu);
	}
}

void resched_curr(struct rq *rq)
{
	__resched_curr(rq, 0);
}

void resched_curr_lazy(struct rq *rq)
{
	int lazy = IS_ENABLED(CONFIG_PREEMPT_BUILD_AUTO) && !sched_feat(FORCE_NEED_RESCHED) ?
		TIF_NEED_RESCHED_LAZY_OFFSET : 0;

	if (lazy && unlikely(test_tsk_thread_flag(rq->curr, TIF_NEED_RESCHED)))
		return;

	__resched_curr(rq, lazy);
}
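/*
 * Note on the 'lazy' arithmetic above (an assumption based on the lazy-preempt
 * thread-flag layout used here): TIF_NEED_RESCHED_LAZY is defined as
 * TIF_NEED_RESCHED + TIF_NEED_RESCHED_LAZY_OFFSET, so lazy == 0 selects the
 * immediate TIF_NEED_RESCHED bit while lazy == TIF_NEED_RESCHED_LAZY_OFFSET
 * selects the lazy bit. Only the non-lazy case sets the preempt-need-resched
 * folding or sends an IPI; the lazy bit is expected to be acted upon at the
 * next natural scheduling point instead of forcing an immediate preemption.
 */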
void resched_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	raw_spin_rq_lock_irqsave(rq, flags);
	if (cpu_online(cpu) || cpu == smp_processor_id())
		resched_curr(rq);
	raw_spin_rq_unlock_irqrestore(rq, flags);
}

#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ_COMMON
/*
 * In the semi idle case, use the nearest busy CPU for migrating timers
 * from an idle CPU. This is good for power-savings.
 *
 * We don't do similar optimization for completely idle system, as
 * selecting an idle CPU will add more delays to the timers than intended
 * (as that CPU's timer base may not be uptodate wrt jiffies etc).
 */
int get_nohz_timer_target(void)
{
	int i, cpu = smp_processor_id(), default_cpu = -1;
	struct sched_domain *sd;
	const struct cpumask *hk_mask;

	if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) {
		if (!idle_cpu(cpu))
			return cpu;
		default_cpu = cpu;
	}

	hk_mask = housekeeping_cpumask(HK_TYPE_TIMER);

	guard(rcu)();

	for_each_domain(cpu, sd) {
		for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {
			if (cpu == i)
				continue;

			if (!idle_cpu(i))
				return i;
		}
	}

	if (default_cpu == -1)
		default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER);

	return default_cpu;
}

/*
 * When add_timer_on() enqueues a timer into the timer wheel of an
 * idle CPU then this timer might expire before the next timer event
 * which is scheduled to wake up that CPU. In case of a completely
 * idle system the next event might even be infinite time into the
 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
 * leaves the inner idle loop so the newly added timer is taken into
 * account when the CPU goes back to idle and evaluates the timer
 * wheel for the next timer event.
 */
static void wake_up_idle_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	if (cpu == smp_processor_id())
		return;

	if (set_nr_and_not_polling(rq->idle, TIF_NEED_RESCHED))
		smp_send_reschedule(cpu);
	else
		trace_sched_wake_idle_without_ipi(cpu);
}

static bool wake_up_full_nohz_cpu(int cpu)
{
	/*
	 * We just need the target to call irq_exit() and re-evaluate
	 * the next tick. The nohz full kick at least implies that.
	 * If needed we can still optimize that later with an
	 * empty IRQ.
	 */
	if (cpu_is_offline(cpu))
		return true;  /* Don't try to wake offline CPUs. */
	if (tick_nohz_full_cpu(cpu)) {
		if (cpu != smp_processor_id() ||
		    tick_nohz_tick_stopped())
			tick_nohz_full_kick_cpu(cpu);
		return true;
	}

	return false;
}

/*
 * Wake up the specified CPU.  If the CPU is going offline, it is the
 * caller's responsibility to deal with the lost wakeup, for example,
 * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
 */
void wake_up_nohz_cpu(int cpu)
{
	if (!wake_up_full_nohz_cpu(cpu))
		wake_up_idle_cpu(cpu);
}

static void nohz_csd_func(void *info)
{
	struct rq *rq = info;
	int cpu = cpu_of(rq);
	unsigned int flags;

	/*
	 * Release the rq::nohz_csd.
	 */
	flags = atomic_fetch_andnot(NOHZ_KICK_MASK | NOHZ_NEWILB_KICK, nohz_flags(cpu));
	WARN_ON(!(flags & NOHZ_KICK_MASK));

	rq->idle_balance = idle_cpu(cpu);
	if (rq->idle_balance && !need_resched()) {
		rq->nohz_idle_balance = flags;
		raise_softirq_irqoff(SCHED_SOFTIRQ);
	}
}

#endif /* CONFIG_NO_HZ_COMMON */

#ifdef CONFIG_NO_HZ_FULL
static inline bool __need_bw_check(struct rq *rq, struct task_struct *p)
{
	if (rq->nr_running != 1)
		return false;

	if (p->sched_class != &fair_sched_class)
		return false;

	if (!task_on_rq_queued(p))
		return false;

	return true;
}

bool sched_can_stop_tick(struct rq *rq)
{
	int fifo_nr_running;

	/* Deadline tasks, even if single, need the tick */
	if (rq->dl.dl_nr_running)
		return false;

	/*
	 * If there are more than one RR tasks, we need the tick to affect the
	 * actual RR behaviour.
	 */
	if (rq->rt.rr_nr_running) {
		if (rq->rt.rr_nr_running == 1)
			return true;
		else
			return false;
	}

	/*
	 * If there's no RR tasks, but FIFO tasks, we can skip the tick, no
	 * forced preemption between FIFO tasks.
	 */
	fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
	if (fifo_nr_running)
		return true;

	/*
	 * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
	 * if there's more than one we need the tick for involuntary
	 * preemption.
	 */
	if (rq->nr_running > 1)
		return false;

	/*
	 * If there is one task and it has CFS runtime bandwidth constraints
	 * and it's on the cpu now we don't want to stop the tick.
	 * This check prevents clearing the bit if a newly enqueued task here is
	 * dequeued by migrating while the constrained task continues to run.
	 * E.g. going from 2->1 without going through pick_next_task().
	 */
	if (sched_feat(HZ_BW) && __need_bw_check(rq, rq->curr)) {
		if (cfs_task_bw_constrained(rq->curr))
			return false;
	}

	return true;
}
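/*
 * Worked example (illustrative, not part of the original source) of what
 * sched_can_stop_tick() returns for a CPU whose runqueue holds:
 *
 *	one SCHED_DEADLINE task		-> false (DL always needs the tick)
 *	two SCHED_RR tasks		-> false (tick drives RR time slicing)
 *	one SCHED_FIFO task		-> true  (fifo_nr_running != 0)
 *	two SCHED_OTHER tasks		-> false (nr_running > 1)
 *	one bandwidth-limited CFS task	-> false when sched_feat(HZ_BW) is set
 */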
#endif /* CONFIG_NO_HZ_FULL */
#endif /* CONFIG_SMP */

#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
/*
 * Iterate task_group tree rooted at *from, calling @down when first entering a
 * node and @up when leaving it for the final time.
 *
 * Caller must hold rcu_lock or sufficient equivalent.
 */
int walk_tg_tree_from(struct task_group *from,
		      tg_visitor down, tg_visitor up, void *data)
{
	struct task_group *parent, *child;
	int ret;

	parent = from;

down:
	ret = (*down)(parent, data);
	if (ret)
		goto out;
	list_for_each_entry_rcu(child, &parent->children, siblings) {
		parent = child;
		goto down;

up:
		continue;
	}
	ret = (*up)(parent, data);
	if (ret || parent == from)
		goto out;

	child = parent;
	parent = parent->parent;
	if (parent)
		goto up;
out:
	return ret;
}

int tg_nop(struct task_group *tg, void *data)
{
	return 0;
}
#endif

static void set_load_weight(struct task_struct *p, bool update_load)
{
	int prio = p->static_prio - MAX_RT_PRIO;
	struct load_weight *load = &p->se.load;

	/*
	 * SCHED_IDLE tasks get minimal weight:
	 */
	if (task_has_idle_policy(p)) {
		load->weight = scale_load(WEIGHT_IDLEPRIO);
		load->inv_weight = WMULT_IDLEPRIO;
		return;
	}

	/*
	 * SCHED_OTHER tasks have to update their load when changing their
	 * weight
	 */
	if (update_load && p->sched_class == &fair_sched_class) {
		reweight_task(p, prio);
	} else {
		load->weight = scale_load(sched_prio_to_weight[prio]);
		load->inv_weight = sched_prio_to_wmult[prio];
	}
}
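/*
 * Worked example (illustrative, not part of the original source): for
 * SCHED_OTHER, prio indexes sched_prio_to_weight[], where nice 0 maps to 1024
 * (NICE_0_LOAD) and each nice step scales the weight by roughly 1.25x, e.g.:
 *
 *	nice -1 -> 1277
 *	nice  0 -> 1024
 *	nice  1 ->  820
 *
 * chosen so that one nice level shifts the CPU share between two competing
 * tasks by about 10% each way.
 */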

static void set_load_weight(struct task_struct *p, bool update_load)
{
	int prio = p->static_prio - MAX_RT_PRIO;
	struct load_weight *load = &p->se.load;

	/*
	 * SCHED_IDLE tasks get minimal weight:
	 */
	if (task_has_idle_policy(p)) {
		load->weight = scale_load(WEIGHT_IDLEPRIO);
		load->inv_weight = WMULT_IDLEPRIO;
		return;
	}

	/*
	 * SCHED_OTHER tasks have to update their load when changing their
	 * weight
	 */
	if (update_load && p->sched_class == &fair_sched_class) {
		reweight_task(p, prio);
	} else {
		load->weight = scale_load(sched_prio_to_weight[prio]);
		load->inv_weight = sched_prio_to_wmult[prio];
	}
}
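
/*
 * Worked example (assuming the usual sched_prio_to_weight[] table, where a
 * nice-0 task has weight 1024 and adjacent nice levels differ by roughly a
 * 1.25x ratio): a SCHED_OTHER task at nice 0 has static_prio 120, so
 * prio = 120 - MAX_RT_PRIO = 20 and load->weight = scale_load(1024); at
 * nice 1 the index becomes 21 and the weight drops to 820, which is roughly
 * a 10% CPU share difference when competing with a nice-0 task.
 */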

#ifdef CONFIG_UCLAMP_TASK
/*
 * Serializes updates of utilization clamp values
 *
 * The (slow-path) user-space triggers utilization clamp value updates which
 * can require updates on (fast-path) scheduler's data structures used to
 * support enqueue/dequeue operations.
 * While the per-CPU rq lock protects fast-path update operations, user-space
 * requests are serialized using a mutex to reduce the risk of conflicting
 * updates or API abuses.
 */
static DEFINE_MUTEX(uclamp_mutex);

/* Max allowed minimum utilization */
static unsigned int __maybe_unused sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;

/* Max allowed maximum utilization */
static unsigned int __maybe_unused sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;

/*
 * By default RT tasks run at the maximum performance point/capacity of the
 * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
 * SCHED_CAPACITY_SCALE.
 *
 * This knob allows admins to change the default behavior when uclamp is being
 * used. In battery powered devices, particularly, running at the maximum
 * capacity and frequency will increase energy consumption and shorten the
 * battery life.
 *
 * This knob only affects RT tasks whose uclamp_se->user_defined flag is false.
 *
 * This knob will not override the system default sched_util_clamp_min defined
 * above.
 */
static unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;

/* All clamps are required to be less than or equal to these values */
static struct uclamp_se uclamp_default[UCLAMP_CNT];

/*
 * This static key is used to reduce the uclamp overhead in the fast path. It
 * primarily disables the call to uclamp_rq_{inc, dec}() in
 * enqueue/dequeue_task().
 *
 * This allows users to continue to enable uclamp in their kernel config with
 * minimum uclamp overhead in the fast path.
 *
 * As soon as userspace modifies any of the uclamp knobs, the static key is
 * enabled, since there are actual users that make use of uclamp
 * functionality.
 *
 * The knobs that would enable this static key are:
 *
 *   * A task modifying its uclamp value with sched_setattr().
 *   * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs.
 *   * An admin modifying the cgroup cpu.uclamp.{min, max}
 */
DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);

/* Integer rounded range for each bucket */
#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)

#define for_each_clamp_id(clamp_id) \
	for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)

static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
{
	return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA, UCLAMP_BUCKETS - 1);
}
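
/*
 * Worked example (assuming SCHED_CAPACITY_SCALE == 1024 and the default
 * CONFIG_UCLAMP_BUCKETS_COUNT of 5): UCLAMP_BUCKET_DELTA is
 * DIV_ROUND_CLOSEST(1024, 5) == 205, so a clamp value of 512 maps to
 * bucket_id 512 / 205 == 2, while 1024 maps to 1024 / 205 == 4, which is
 * already the last bucket (UCLAMP_BUCKETS - 1).
 */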

static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
{
	if (clamp_id == UCLAMP_MIN)
		return 0;
	return SCHED_CAPACITY_SCALE;
}

static inline void uclamp_se_set(struct uclamp_se *uc_se,
				 unsigned int value, bool user_defined)
{
	uc_se->value = value;
	uc_se->bucket_id = uclamp_bucket_id(value);
	uc_se->user_defined = user_defined;
}

static inline unsigned int
uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
		  unsigned int clamp_value)
{
	/*
	 * Avoid blocked utilization pushing up the frequency when we go
	 * idle (which drops the max-clamp) by retaining the last known
	 * max-clamp.
	 */
	if (clamp_id == UCLAMP_MAX) {
		rq->uclamp_flags |= UCLAMP_FLAG_IDLE;
		return clamp_value;
	}

	return uclamp_none(UCLAMP_MIN);
}

static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
				     unsigned int clamp_value)
{
	/* Reset max-clamp retention only on idle exit */
	if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
		return;

	uclamp_rq_set(rq, clamp_id, clamp_value);
}

static inline
unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
				 unsigned int clamp_value)
{
	struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
	int bucket_id = UCLAMP_BUCKETS - 1;

	/*
	 * Since both min and max clamps are max aggregated, find the
	 * top most bucket with tasks in.
	 */
	for ( ; bucket_id >= 0; bucket_id--) {
		if (!bucket[bucket_id].tasks)
			continue;
		return bucket[bucket_id].value;
	}

	/* No tasks -- default clamp values */
	return uclamp_idle_value(rq, clamp_id, clamp_value);
}
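
/*
 * Illustrative example: with UCLAMP_BUCKETS == 5, if only bucket 1 has
 * RUNNABLE tasks and its tracked value is 300, the scan above returns 300;
 * once every bucket is empty it falls back to uclamp_idle_value(), which for
 * UCLAMP_MAX also sets UCLAMP_FLAG_IDLE so the last max-clamp is retained
 * until the next enqueue resets it via uclamp_idle_reset().
 */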

static void __uclamp_update_util_min_rt_default(struct task_struct *p)
{
	unsigned int default_util_min;
	struct uclamp_se *uc_se;

	lockdep_assert_held(&p->pi_lock);

	uc_se = &p->uclamp_req[UCLAMP_MIN];

	/* Only sync if user didn't override the default */
	if (uc_se->user_defined)
		return;

	default_util_min = sysctl_sched_uclamp_util_min_rt_default;
	uclamp_se_set(uc_se, default_util_min, false);
}

static void uclamp_update_util_min_rt_default(struct task_struct *p)
{
	struct rq_flags rf;
	struct rq *rq;

	if (!rt_task(p))
		return;

	/* Protect updates to p->uclamp_* */
	rq = task_rq_lock(p, &rf);
	__uclamp_update_util_min_rt_default(p);
	task_rq_unlock(rq, p, &rf);
}

static inline struct uclamp_se
uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
{
	/* Copy by value as we could modify it */
	struct uclamp_se uc_req = p->uclamp_req[clamp_id];
#ifdef CONFIG_UCLAMP_TASK_GROUP
	unsigned int tg_min, tg_max, value;

	/*
	 * Tasks in autogroups or in the root task group will be
	 * restricted by system defaults.
	 */
	if (task_group_is_autogroup(task_group(p)))
		return uc_req;
	if (task_group(p) == &root_task_group)
		return uc_req;

	tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
	tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
	value = uc_req.value;
	value = clamp(value, tg_min, tg_max);
	uclamp_se_set(&uc_req, value, false);
#endif

	return uc_req;
}

/*
 * The effective clamp bucket index of a task depends on, by increasing
 * priority:
 * - the task specific clamp value, when explicitly requested from userspace
 * - the task group effective clamp value, for tasks neither in the root
 *   group nor in an autogroup
 * - the system default clamp value, defined by the sysadmin
 */
static inline struct uclamp_se
uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
{
	struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
	struct uclamp_se uc_max = uclamp_default[clamp_id];

	/* System default restrictions always apply */
	if (unlikely(uc_req.value > uc_max.value))
		return uc_max;

	return uc_req;
}

unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
{
	struct uclamp_se uc_eff;

	/* Task currently refcounted: use back-annotated (effective) value */
	if (p->uclamp[clamp_id].active)
		return (unsigned long)p->uclamp[clamp_id].value;

	uc_eff = uclamp_eff_get(p, clamp_id);

	return (unsigned long)uc_eff.value;
}
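
/*
 * Worked example of the precedence above (hypothetical values): a task in a
 * non-root cgroup requests UCLAMP_MAX == 800 while the group's effective
 * clamp, task_group(p)->uclamp[UCLAMP_MAX].value, is 512 and the system
 * default is 1024.  uclamp_tg_restrict() clamps the request into the group
 * range, giving 512, and uclamp_eff_get() keeps it since 512 <= 1024; had the
 * sysadmin lowered the system default to 400, the effective value would be
 * capped at 400 instead.
 */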

/*
 * When a task is enqueued on a rq, the clamp bucket currently defined by the
 * task's uclamp::bucket_id is refcounted on that rq. This also immediately
 * updates the rq's clamp value if required.
 *
 * Tasks can have a task-specific value requested from user-space; we track
 * within each bucket the maximum value for the tasks refcounted in it.
 * This "local max aggregation" allows to track the exact "requested" value
 * for each bucket when all its RUNNABLE tasks require the same clamp.
 */
static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
				    enum uclamp_id clamp_id)
{
	struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
	struct uclamp_se *uc_se = &p->uclamp[clamp_id];
	struct uclamp_bucket *bucket;

	lockdep_assert_rq_held(rq);

	/* Update task effective clamp */
	p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);

	bucket = &uc_rq->bucket[uc_se->bucket_id];
	bucket->tasks++;
	uc_se->active = true;

	uclamp_idle_reset(rq, clamp_id, uc_se->value);

	/*
	 * Local max aggregation: rq buckets always track the max
	 * "requested" clamp value of its RUNNABLE tasks.
	 */
	if (bucket->tasks == 1 || uc_se->value > bucket->value)
		bucket->value = uc_se->value;

	if (uc_se->value > uclamp_rq_get(rq, clamp_id))
		uclamp_rq_set(rq, clamp_id, uc_se->value);
}
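
/*
 * Illustrative example of "local max aggregation" (hypothetical values, with
 * UCLAMP_BUCKET_DELTA == 205): two RUNNABLE tasks requesting UCLAMP_MIN of
 * 300 and 380 both land in bucket 1; after both enqueues bucket->tasks == 2
 * and bucket->value == 380, and the rq-wide clamp is raised to 380 if it was
 * previously lower.
 */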

/*
 * When a task is dequeued from a rq, the clamp bucket refcounted by the task
 * is released. If this is the last task reference counting the rq's max
 * active clamp value, then the rq's clamp value is updated.
 *
 * Both refcounted tasks and rq's cached clamp values are expected to be
 * always valid. If it's detected they are not, as defensive programming,
 * enforce the expected state and warn.
 */
static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
				    enum uclamp_id clamp_id)
{
	struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
	struct uclamp_se *uc_se = &p->uclamp[clamp_id];
	struct uclamp_bucket *bucket;
	unsigned int bkt_clamp;
	unsigned int rq_clamp;

	lockdep_assert_rq_held(rq);

	/*
	 * If sched_uclamp_used was enabled after task @p was enqueued,
	 * we could end up with unbalanced call to uclamp_rq_dec_id().
	 *
	 * In this case the uc_se->active flag should be false since no uclamp
	 * accounting was performed at enqueue time and we can just return
	 * here.
	 *
	 * Need to be careful of the following enqueue/dequeue ordering
	 * problem too
	 *
	 *	enqueue(taskA)
	 *	// sched_uclamp_used gets enabled
	 *	enqueue(taskB)
	 *	dequeue(taskA)
	 *	// Must not decrement bucket->tasks here
	 *	dequeue(taskB)
	 *
	 * where we could end up with stale data in uc_se and
	 * bucket[uc_se->bucket_id].
	 *
	 * The following check here eliminates the possibility of such race.
	 */
	if (unlikely(!uc_se->active))
		return;

	bucket = &uc_rq->bucket[uc_se->bucket_id];

	SCHED_WARN_ON(!bucket->tasks);
	if (likely(bucket->tasks))
		bucket->tasks--;

	uc_se->active = false;

	/*
	 * Keep "local max aggregation" simple and accept to (possibly)
	 * overboost some RUNNABLE tasks in the same bucket.
	 * The rq clamp bucket value is reset to its base value whenever
	 * there are no more RUNNABLE tasks refcounting it.
	 */
	if (likely(bucket->tasks))
		return;

	rq_clamp = uclamp_rq_get(rq, clamp_id);
	/*
	 * Defensive programming: this should never happen. If it happens,
	 * e.g. due to future modification, warn and fixup the expected value.
	 */
	SCHED_WARN_ON(bucket->value > rq_clamp);
	if (bucket->value >= rq_clamp) {
		bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
		uclamp_rq_set(rq, clamp_id, bkt_clamp);
	}
}
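
/*
 * Illustrative example of the accepted "overboost" (hypothetical values):
 * with tasks requesting 300 and 200 in the same bucket, bucket->value is 300;
 * if the 300 task is dequeued first the bucket still holds the 200 task, so
 * the early return above keeps bucket->value at 300 and the remaining task
 * may be boosted slightly above its request until the bucket empties.
 */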

static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
{
	enum uclamp_id clamp_id;

	/*
	 * Avoid any overhead until uclamp is actually used by the userspace.
	 *
	 * The condition is constructed such that a NOP is generated when
	 * sched_uclamp_used is disabled.
	 */
	if (!static_branch_unlikely(&sched_uclamp_used))
		return;

	if (unlikely(!p->sched_class->uclamp_enabled))
		return;

	for_each_clamp_id(clamp_id)
		uclamp_rq_inc_id(rq, p, clamp_id);

	/* Reset clamp idle holding when there is one RUNNABLE task */
	if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
		rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
}

static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
{
	enum uclamp_id clamp_id;

	/*
	 * Avoid any overhead until uclamp is actually used by the userspace.
	 *
	 * The condition is constructed such that a NOP is generated when
	 * sched_uclamp_used is disabled.
	 */
	if (!static_branch_unlikely(&sched_uclamp_used))
		return;

	if (unlikely(!p->sched_class->uclamp_enabled))
		return;

	for_each_clamp_id(clamp_id)
		uclamp_rq_dec_id(rq, p, clamp_id);
}

static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
				      enum uclamp_id clamp_id)
{
	if (!p->uclamp[clamp_id].active)
		return;

	uclamp_rq_dec_id(rq, p, clamp_id);
	uclamp_rq_inc_id(rq, p, clamp_id);

	/*
	 * Make sure to clear the idle flag if we've transiently reached 0
	 * active tasks on rq.
*/ */ if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCL if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCL rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE; rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE; } } static inline void static inline void uclamp_update_active(struct task_struct *p) uclamp_update_active(struct task_struct *p) { { enum uclamp_id clamp_id; enum uclamp_id clamp_id; struct rq_flags rf; struct rq_flags rf; struct rq *rq; struct rq *rq; /* /* * Lock the task and the rq where the task is (or was * Lock the task and the rq where the task is (or was * * * We might lock the (previous) rq of a !RUNNABLE tas * We might lock the (previous) rq of a !RUNNABLE tas * price to pay to safely serialize util_{min,max} up * price to pay to safely serialize util_{min,max} up * enqueues, dequeues and migration operations. * enqueues, dequeues and migration operations. * This is the same locking schema used by __set_cpus * This is the same locking schema used by __set_cpus */ */ rq = task_rq_lock(p, &rf); rq = task_rq_lock(p, &rf); /* /* * Setting the clamp bucket is serialized by task_rq_ * Setting the clamp bucket is serialized by task_rq_ * If the task is not yet RUNNABLE and its task_struc * If the task is not yet RUNNABLE and its task_struc * affecting a valid clamp bucket, the next time it's * affecting a valid clamp bucket, the next time it's * it will already see the updated clamp bucket value * it will already see the updated clamp bucket value */ */ for_each_clamp_id(clamp_id) for_each_clamp_id(clamp_id) uclamp_rq_reinc_id(rq, p, clamp_id); uclamp_rq_reinc_id(rq, p, clamp_id); task_rq_unlock(rq, p, &rf); task_rq_unlock(rq, p, &rf); } } #ifdef CONFIG_UCLAMP_TASK_GROUP #ifdef CONFIG_UCLAMP_TASK_GROUP static inline void static inline void uclamp_update_active_tasks(struct cgroup_subsys_state *css) uclamp_update_active_tasks(struct cgroup_subsys_state *css) { { struct css_task_iter it; struct css_task_iter it; struct task_struct *p; struct task_struct *p; css_task_iter_start(css, 0, &it); css_task_iter_start(css, 0, &it); while ((p = css_task_iter_next(&it))) while ((p = css_task_iter_next(&it))) uclamp_update_active(p); uclamp_update_active(p); css_task_iter_end(&it); css_task_iter_end(&it); } } static void cpu_util_update_eff(struct cgroup_subsys_state *c static void cpu_util_update_eff(struct cgroup_subsys_state *c #endif #endif #ifdef CONFIG_SYSCTL #ifdef CONFIG_SYSCTL #ifdef CONFIG_UCLAMP_TASK #ifdef CONFIG_UCLAMP_TASK #ifdef CONFIG_UCLAMP_TASK_GROUP #ifdef CONFIG_UCLAMP_TASK_GROUP static void uclamp_update_root_tg(void) static void uclamp_update_root_tg(void) { { struct task_group *tg = &root_task_group; struct task_group *tg = &root_task_group; uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN], uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN], sysctl_sched_uclamp_util_min, false); sysctl_sched_uclamp_util_min, false); uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX], uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX], sysctl_sched_uclamp_util_max, false); sysctl_sched_uclamp_util_max, false); rcu_read_lock(); rcu_read_lock(); cpu_util_update_eff(&root_task_group.css); cpu_util_update_eff(&root_task_group.css); rcu_read_unlock(); rcu_read_unlock(); } } #else #else static void uclamp_update_root_tg(void) { } static void uclamp_update_root_tg(void) { } #endif #endif static void uclamp_sync_util_min_rt_default(void) static void uclamp_sync_util_min_rt_default(void) { { struct task_struct *g, *p; struct task_struct *g, *p; /* /* * copy_process() sysctl_uclamp * copy_process() sysctl_uclamp * uclamp_min_ * uclamp_min_ * 
write_lock(&tasklist_lock) read_lock(& * write_lock(&tasklist_lock) read_lock(& * // link thread smp_mb__aft * // link thread smp_mb__aft * write_unlock(&tasklist_lock) read_unlock * write_unlock(&tasklist_lock) read_unlock * sched_post_fork() for_each_pr * sched_post_fork() for_each_pr * __uclamp_sync_rt() __uclamp_ * __uclamp_sync_rt() __uclamp_ * * * Ensures that either sched_post_fork() will observe * Ensures that either sched_post_fork() will observe * uclamp_min_rt or for_each_process_thread() will ob * uclamp_min_rt or for_each_process_thread() will ob * task. * task. */ */ read_lock(&tasklist_lock); read_lock(&tasklist_lock); smp_mb__after_spinlock(); smp_mb__after_spinlock(); read_unlock(&tasklist_lock); read_unlock(&tasklist_lock); rcu_read_lock(); rcu_read_lock(); for_each_process_thread(g, p) for_each_process_thread(g, p) uclamp_update_util_min_rt_default(p); uclamp_update_util_min_rt_default(p); rcu_read_unlock(); rcu_read_unlock(); } } static int sysctl_sched_uclamp_handler(struct ctl_table *tabl static int sysctl_sched_uclamp_handler(struct ctl_table *tabl void *buffer, size_t *lenp, l void *buffer, size_t *lenp, l { { bool update_root_tg = false; bool update_root_tg = false; int old_min, old_max, old_min_rt; int old_min, old_max, old_min_rt; int result; int result; guard(mutex)(&uclamp_mutex); guard(mutex)(&uclamp_mutex); old_min = sysctl_sched_uclamp_util_min; old_min = sysctl_sched_uclamp_util_min; old_max = sysctl_sched_uclamp_util_max; old_max = sysctl_sched_uclamp_util_max; old_min_rt = sysctl_sched_uclamp_util_min_rt_default; old_min_rt = sysctl_sched_uclamp_util_min_rt_default; result = proc_dointvec(table, write, buffer, lenp, pp result = proc_dointvec(table, write, buffer, lenp, pp if (result) if (result) goto undo; goto undo; if (!write) if (!write) return 0; return 0; if (sysctl_sched_uclamp_util_min > sysctl_sched_uclam if (sysctl_sched_uclamp_util_min > sysctl_sched_uclam sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCA sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCA sysctl_sched_uclamp_util_min_rt_default > SCHED_C sysctl_sched_uclamp_util_min_rt_default > SCHED_C result = -EINVAL; result = -EINVAL; goto undo; goto undo; } } if (old_min != sysctl_sched_uclamp_util_min) { if (old_min != sysctl_sched_uclamp_util_min) { uclamp_se_set(&uclamp_default[UCLAMP_MIN], uclamp_se_set(&uclamp_default[UCLAMP_MIN], sysctl_sched_uclamp_util_min, f sysctl_sched_uclamp_util_min, f update_root_tg = true; update_root_tg = true; } } if (old_max != sysctl_sched_uclamp_util_max) { if (old_max != sysctl_sched_uclamp_util_max) { uclamp_se_set(&uclamp_default[UCLAMP_MAX], uclamp_se_set(&uclamp_default[UCLAMP_MAX], sysctl_sched_uclamp_util_max, f sysctl_sched_uclamp_util_max, f update_root_tg = true; update_root_tg = true; } } if (update_root_tg) { if (update_root_tg) { static_branch_enable(&sched_uclamp_used); static_branch_enable(&sched_uclamp_used); uclamp_update_root_tg(); uclamp_update_root_tg(); } } if (old_min_rt != sysctl_sched_uclamp_util_min_rt_def if (old_min_rt != sysctl_sched_uclamp_util_min_rt_def static_branch_enable(&sched_uclamp_used); static_branch_enable(&sched_uclamp_used); uclamp_sync_util_min_rt_default(); uclamp_sync_util_min_rt_default(); } } /* /* * We update all RUNNABLE tasks only when task groups * We update all RUNNABLE tasks only when task groups * Otherwise, keep it simple and do just a lazy updat * Otherwise, keep it simple and do just a lazy updat * task enqueue time. * task enqueue time. 
*/ */ return 0; return 0; undo: undo: sysctl_sched_uclamp_util_min = old_min; sysctl_sched_uclamp_util_min = old_min; sysctl_sched_uclamp_util_max = old_max; sysctl_sched_uclamp_util_max = old_max; sysctl_sched_uclamp_util_min_rt_default = old_min_rt; sysctl_sched_uclamp_util_min_rt_default = old_min_rt; return result; return result; } } #endif #endif #endif #endif static int uclamp_validate(struct task_struct *p, static int uclamp_validate(struct task_struct *p, const struct sched_attr *attr) const struct sched_attr *attr) { { int util_min = p->uclamp_req[UCLAMP_MIN].value; int util_min = p->uclamp_req[UCLAMP_MIN].value; int util_max = p->uclamp_req[UCLAMP_MAX].value; int util_max = p->uclamp_req[UCLAMP_MAX].value; if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { util_min = attr->sched_util_min; util_min = attr->sched_util_min; if (util_min + 1 > SCHED_CAPACITY_SCALE + 1) if (util_min + 1 > SCHED_CAPACITY_SCALE + 1) return -EINVAL; return -EINVAL; } } if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { util_max = attr->sched_util_max; util_max = attr->sched_util_max; if (util_max + 1 > SCHED_CAPACITY_SCALE + 1) if (util_max + 1 > SCHED_CAPACITY_SCALE + 1) return -EINVAL; return -EINVAL; } } if (util_min != -1 && util_max != -1 && util_min > ut if (util_min != -1 && util_max != -1 && util_min > ut return -EINVAL; return -EINVAL; /* /* * We have valid uclamp attributes; make sure uclamp * We have valid uclamp attributes; make sure uclamp * * * We need to do that here, because enabling static b * We need to do that here, because enabling static b * blocking operation which obviously cannot be done * blocking operation which obviously cannot be done * scheduler locks. * scheduler locks. */ */ static_branch_enable(&sched_uclamp_used); static_branch_enable(&sched_uclamp_used); return 0; return 0; } } static bool uclamp_reset(const struct sched_attr *attr, static bool uclamp_reset(const struct sched_attr *attr, enum uclamp_id clamp_id, enum uclamp_id clamp_id, struct uclamp_se *uc_se) struct uclamp_se *uc_se) { { /* Reset on sched class change for a non user-defined /* Reset on sched class change for a non user-defined if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAM if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAM !uc_se->user_defined) !uc_se->user_defined) return true; return true; /* Reset on sched_util_{min,max} == -1. */ /* Reset on sched_util_{min,max} == -1. 
*/ if (clamp_id == UCLAMP_MIN && if (clamp_id == UCLAMP_MIN && attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && attr->sched_util_min == -1) { attr->sched_util_min == -1) { return true; return true; } } if (clamp_id == UCLAMP_MAX && if (clamp_id == UCLAMP_MAX && attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && attr->sched_util_max == -1) { attr->sched_util_max == -1) { return true; return true; } } return false; return false; } } static void __setscheduler_uclamp(struct task_struct *p, static void __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *at const struct sched_attr *at { { enum uclamp_id clamp_id; enum uclamp_id clamp_id; for_each_clamp_id(clamp_id) { for_each_clamp_id(clamp_id) { struct uclamp_se *uc_se = &p->uclamp_req[clam struct uclamp_se *uc_se = &p->uclamp_req[clam unsigned int value; unsigned int value; if (!uclamp_reset(attr, clamp_id, uc_se)) if (!uclamp_reset(attr, clamp_id, uc_se)) continue; continue; /* /* * RT by default have a 100% boost value that * RT by default have a 100% boost value that * at runtime. * at runtime. */ */ if (unlikely(rt_task(p) && clamp_id == UCLAMP if (unlikely(rt_task(p) && clamp_id == UCLAMP value = sysctl_sched_uclamp_util_min_ value = sysctl_sched_uclamp_util_min_ else else value = uclamp_none(clamp_id); value = uclamp_none(clamp_id); uclamp_se_set(uc_se, value, false); uclamp_se_set(uc_se, value, false); } } if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAM if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAM return; return; if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && attr->sched_util_min != -1) { attr->sched_util_min != -1) { uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], attr->sched_util_min, true); attr->sched_util_min, true); } } if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && attr->sched_util_max != -1) { attr->sched_util_max != -1) { uclamp_se_set(&p->uclamp_req[UCLAMP_MAX], uclamp_se_set(&p->uclamp_req[UCLAMP_MAX], attr->sched_util_max, true); attr->sched_util_max, true); } } } } static void uclamp_fork(struct task_struct *p) static void uclamp_fork(struct task_struct *p) { { enum uclamp_id clamp_id; enum uclamp_id clamp_id; /* /* * We don't need to hold task_rq_lock() when updating * We don't need to hold task_rq_lock() when updating * as the task is still at its early fork stages. * as the task is still at its early fork stages. 
*/ */ for_each_clamp_id(clamp_id) for_each_clamp_id(clamp_id) p->uclamp[clamp_id].active = false; p->uclamp[clamp_id].active = false; if (likely(!p->sched_reset_on_fork)) if (likely(!p->sched_reset_on_fork)) return; return; for_each_clamp_id(clamp_id) { for_each_clamp_id(clamp_id) { uclamp_se_set(&p->uclamp_req[clamp_id], uclamp_se_set(&p->uclamp_req[clamp_id], uclamp_none(clamp_id), false); uclamp_none(clamp_id), false); } } } } static void uclamp_post_fork(struct task_struct *p) static void uclamp_post_fork(struct task_struct *p) { { uclamp_update_util_min_rt_default(p); uclamp_update_util_min_rt_default(p); } } static void __init init_uclamp_rq(struct rq *rq) static void __init init_uclamp_rq(struct rq *rq) { { enum uclamp_id clamp_id; enum uclamp_id clamp_id; struct uclamp_rq *uc_rq = rq->uclamp; struct uclamp_rq *uc_rq = rq->uclamp; for_each_clamp_id(clamp_id) { for_each_clamp_id(clamp_id) { uc_rq[clamp_id] = (struct uclamp_rq) { uc_rq[clamp_id] = (struct uclamp_rq) { .value = uclamp_none(clamp_id) .value = uclamp_none(clamp_id) }; }; } } rq->uclamp_flags = UCLAMP_FLAG_IDLE; rq->uclamp_flags = UCLAMP_FLAG_IDLE; } } static void __init init_uclamp(void) static void __init init_uclamp(void) { { struct uclamp_se uc_max = {}; struct uclamp_se uc_max = {}; enum uclamp_id clamp_id; enum uclamp_id clamp_id; int cpu; int cpu; for_each_possible_cpu(cpu) for_each_possible_cpu(cpu) init_uclamp_rq(cpu_rq(cpu)); init_uclamp_rq(cpu_rq(cpu)); for_each_clamp_id(clamp_id) { for_each_clamp_id(clamp_id) { uclamp_se_set(&init_task.uclamp_req[clamp_id] uclamp_se_set(&init_task.uclamp_req[clamp_id] uclamp_none(clamp_id), false); uclamp_none(clamp_id), false); } } /* System defaults allow max clamp values for both in /* System defaults allow max clamp values for both in uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false for_each_clamp_id(clamp_id) { for_each_clamp_id(clamp_id) { uclamp_default[clamp_id] = uc_max; uclamp_default[clamp_id] = uc_max; #ifdef CONFIG_UCLAMP_TASK_GROUP #ifdef CONFIG_UCLAMP_TASK_GROUP root_task_group.uclamp_req[clamp_id] = uc_max root_task_group.uclamp_req[clamp_id] = uc_max root_task_group.uclamp[clamp_id] = uc_max; root_task_group.uclamp[clamp_id] = uc_max; #endif #endif } } } } #else /* CONFIG_UCLAMP_TASK */ #else /* CONFIG_UCLAMP_TASK */ static inline void uclamp_rq_inc(struct rq *rq, struct task_s static inline void uclamp_rq_inc(struct rq *rq, struct task_s static inline void uclamp_rq_dec(struct rq *rq, struct task_s static inline void uclamp_rq_dec(struct rq *rq, struct task_s static inline int uclamp_validate(struct task_struct *p, static inline int uclamp_validate(struct task_struct *p, const struct sched_attr *at const struct sched_attr *at { { return -EOPNOTSUPP; return -EOPNOTSUPP; } } static void __setscheduler_uclamp(struct task_struct *p, static void __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *at const struct sched_attr *at static inline void uclamp_fork(struct task_struct *p) { } static inline void uclamp_fork(struct task_struct *p) { } static inline void uclamp_post_fork(struct task_struct *p) { static inline void uclamp_post_fork(struct task_struct *p) { static inline void init_uclamp(void) { } static inline void init_uclamp(void) { } #endif /* CONFIG_UCLAMP_TASK */ #endif /* CONFIG_UCLAMP_TASK */ bool sched_task_on_rq(struct task_struct *p) bool sched_task_on_rq(struct task_struct *p) { { return task_on_rq_queued(p); return task_on_rq_queued(p); } } unsigned long get_wchan(struct 

unsigned long get_wchan(struct task_struct *p)
{
	unsigned long ip = 0;
	unsigned int state;

	if (!p || p == current)
		return 0;

	/* Only get wchan if task is blocked and we can keep it that way. */
	raw_spin_lock_irq(&p->pi_lock);
	state = READ_ONCE(p->__state);
	smp_rmb(); /* see try_to_wake_up() */
	if (state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq)
		ip = __get_wchan(p);
	raw_spin_unlock_irq(&p->pi_lock);

	return ip;
}

static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (!(flags & ENQUEUE_NOCLOCK))
		update_rq_clock(rq);

	if (!(flags & ENQUEUE_RESTORE)) {
		sched_info_enqueue(rq, p);
		psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
	}

	uclamp_rq_inc(rq, p);
	p->sched_class->enqueue_task(rq, p, flags);

	if (sched_core_enabled(rq))
		sched_core_enqueue(rq, p);
}

static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (sched_core_enabled(rq))
		sched_core_dequeue(rq, p, flags);

	if (!(flags & DEQUEUE_NOCLOCK))
		update_rq_clock(rq);

	if (!(flags & DEQUEUE_SAVE)) {
		sched_info_dequeue(rq, p);
		psi_dequeue(p, flags & DEQUEUE_SLEEP);
	}

	uclamp_rq_dec(rq, p);
	p->sched_class->dequeue_task(rq, p, flags);
}

void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_on_rq_migrating(p))
		flags |= ENQUEUE_MIGRATED;
	if (flags & ENQUEUE_MIGRATED)
		sched_mm_cid_migrate_to(rq, p);

	enqueue_task(rq, p, flags);

	p->on_rq = TASK_ON_RQ_QUEUED;
}

void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
	p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;

	dequeue_task(rq, p, flags);
}

static inline int __normal_prio(int policy, int rt_prio, int nice)
{
	int prio;

	if (dl_policy(policy))
		prio = MAX_DL_PRIO - 1;
	else if (rt_policy(policy))
		prio = MAX_RT_PRIO - 1 - rt_prio;
	else
		prio = NICE_TO_PRIO(nice);

	return prio;
}
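
/*
 * Worked example of the mapping above: MAX_DL_PRIO is 0 and MAX_RT_PRIO is
 * 100, so a SCHED_DEADLINE task gets prio -1, a SCHED_FIFO/SCHED_RR task with
 * rt_priority 10 gets 100 - 1 - 10 == 89, and a SCHED_NORMAL task at nice 0
 * gets NICE_TO_PRIO(0) == 120 (the nice range -20..19 mapping to 100..139).
 */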

/*
 * Calculate the expected normal priority: i.e. priority
 * without taking RT-inheritance into account. Might be
 * boosted by interactivity modifiers. Changes upon fork,
 * setprio syscalls, and whenever the interactivity
 * estimator recalculates.
 */
static inline int normal_prio(struct task_struct *p)
{
	return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
}

/*
 * Calculate the current priority, i.e. the priority
 * taken into account by the scheduler. This value might
 * be boosted by RT tasks, or might be boosted by
 * interactivity modifiers. Will be RT if the task got
 * RT-boosted. If not then it returns p->normal_prio.
 */
static int effective_prio(struct task_struct *p)
{
	p->normal_prio = normal_prio(p);
	/*
	 * If we are RT tasks or we were boosted to RT priority,
	 * keep the priority unchanged. Otherwise, update priority
	 * to the normal priority:
	 */
	if (!rt_prio(p->prio))
		return p->normal_prio;
	return p->prio;
}

/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 *
 * Return: 1 if the task is currently executing. 0 otherwise.
 */
inline int task_curr(const struct task_struct *p)
{
	return cpu_curr(task_cpu(p)) == p;
}

/*
 * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
 * use the balance_callback list if you want balancing.
 *
 * this means any call to check_class_changed() must be followed by a call to
 * balance_callback().
*/ */ static inline void check_class_changed(struct rq *rq, struct static inline void check_class_changed(struct rq *rq, struct const struct sched_cla const struct sched_cla int oldprio) int oldprio) { { if (prev_class != p->sched_class) { if (prev_class != p->sched_class) { if (prev_class->switched_from) if (prev_class->switched_from) prev_class->switched_from(rq, p); prev_class->switched_from(rq, p); p->sched_class->switched_to(rq, p); p->sched_class->switched_to(rq, p); } else if (oldprio != p->prio || dl_task(p)) } else if (oldprio != p->prio || dl_task(p)) p->sched_class->prio_changed(rq, p, oldprio); p->sched_class->prio_changed(rq, p, oldprio); } } void check_preempt_curr(struct rq *rq, struct task_struct *p, void check_preempt_curr(struct rq *rq, struct task_struct *p, { { if (p->sched_class == rq->curr->sched_class) if (p->sched_class == rq->curr->sched_class) rq->curr->sched_class->check_preempt_curr(rq, rq->curr->sched_class->check_preempt_curr(rq, else if (sched_class_above(p->sched_class, rq->curr-> else if (sched_class_above(p->sched_class, rq->curr-> resched_curr(rq); resched_curr(rq); /* /* * A queue event has occurred, and we're going to sch * A queue event has occurred, and we're going to sch * this case, we can save a useless back to back cloc * this case, we can save a useless back to back cloc */ */ if (task_on_rq_queued(rq->curr) && test_tsk_need_resc if (task_on_rq_queued(rq->curr) && test_tsk_need_resc rq_clock_skip_update(rq); rq_clock_skip_update(rq); } } static __always_inline static __always_inline int __task_state_match(struct task_struct *p, unsigned int st int __task_state_match(struct task_struct *p, unsigned int st { { if (READ_ONCE(p->__state) & state) if (READ_ONCE(p->__state) & state) return 1; return 1; #ifdef CONFIG_PREEMPT_RT #ifdef CONFIG_PREEMPT_RT if (READ_ONCE(p->saved_state) & state) if (READ_ONCE(p->saved_state) & state) return -1; return -1; #endif #endif return 0; return 0; } } static __always_inline static __always_inline int task_state_match(struct task_struct *p, unsigned int stat int task_state_match(struct task_struct *p, unsigned int stat { { #ifdef CONFIG_PREEMPT_RT #ifdef CONFIG_PREEMPT_RT int match; int match; /* /* * Serialize against current_save_and_set_rtlock_wait * Serialize against current_save_and_set_rtlock_wait * current_restore_rtlock_saved_state(). * current_restore_rtlock_saved_state(). */ */ raw_spin_lock_irq(&p->pi_lock); raw_spin_lock_irq(&p->pi_lock); match = __task_state_match(p, state); match = __task_state_match(p, state); raw_spin_unlock_irq(&p->pi_lock); raw_spin_unlock_irq(&p->pi_lock); return match; return match; #else #else return __task_state_match(p, state); return __task_state_match(p, state); #endif #endif } } /* /* * wait_task_inactive - wait for a thread to unschedule. * wait_task_inactive - wait for a thread to unschedule. * * * Wait for the thread to block in any of the states set in @ * Wait for the thread to block in any of the states set in @ * If it changes, i.e. @p might have woken up, then return ze * If it changes, i.e. @p might have woken up, then return ze * succeed in waiting for @p to be off its CPU, we return a p * succeed in waiting for @p to be off its CPU, we return a p * (its total switch count). If a second call a short while * (its total switch count). If a second call a short while * same number, the caller can be sure that @p has remained u * same number, the caller can be sure that @p has remained u * whole time. * whole time. 
* * * The caller must ensure that the task *will* unschedule som * The caller must ensure that the task *will* unschedule som * else this function might spin for a *long* time. This func * else this function might spin for a *long* time. This func * be called with interrupts off, or it may introduce deadloc * be called with interrupts off, or it may introduce deadloc * smp_call_function() if an IPI is sent by the same process * smp_call_function() if an IPI is sent by the same process * waiting to become inactive. * waiting to become inactive. */ */ unsigned long wait_task_inactive(struct task_struct *p, unsig unsigned long wait_task_inactive(struct task_struct *p, unsig { { int running, queued, match; int running, queued, match; struct rq_flags rf; struct rq_flags rf; unsigned long ncsw; unsigned long ncsw; struct rq *rq; struct rq *rq; for (;;) { for (;;) { /* /* * We do the initial early heuristics without * We do the initial early heuristics without * any task-queue locks at all. We'll only tr * any task-queue locks at all. We'll only tr * the runqueue lock when things look like th * the runqueue lock when things look like th * work out! * work out! */ */ rq = task_rq(p); rq = task_rq(p); /* /* * If the task is actively running on another * If the task is actively running on another * still, just relax and busy-wait without ho * still, just relax and busy-wait without ho * any locks. * any locks. * * * NOTE! Since we don't hold any locks, it's * NOTE! Since we don't hold any locks, it's * even sure that "rq" stays as the right run * even sure that "rq" stays as the right run * But we don't care, since "task_on_cpu()" w * But we don't care, since "task_on_cpu()" w * return false if the runqueue has changed a * return false if the runqueue has changed a * is actually now running somewhere else! * is actually now running somewhere else! */ */ while (task_on_cpu(rq, p)) { while (task_on_cpu(rq, p)) { if (!task_state_match(p, match_state) if (!task_state_match(p, match_state) return 0; return 0; cpu_relax(); cpu_relax(); } } /* /* * Ok, time to look more closely! We need the * Ok, time to look more closely! We need the * lock now, to be *sure*. If we're wrong, we * lock now, to be *sure*. If we're wrong, we * just go back and repeat. * just go back and repeat. */ */ rq = task_rq_lock(p, &rf); rq = task_rq_lock(p, &rf); trace_sched_wait_task(p); trace_sched_wait_task(p); running = task_on_cpu(rq, p); running = task_on_cpu(rq, p); queued = task_on_rq_queued(p); queued = task_on_rq_queued(p); ncsw = 0; ncsw = 0; if ((match = __task_state_match(p, match_stat if ((match = __task_state_match(p, match_stat /* /* * When matching on p->saved_state, c * When matching on p->saved_state, c * still queued so it will wait. * still queued so it will wait. */ */ if (match < 0) if (match < 0) queued = 1; queued = 1; ncsw = p->nvcsw | LONG_MIN; /* sets M ncsw = p->nvcsw | LONG_MIN; /* sets M } } task_rq_unlock(rq, p, &rf); task_rq_unlock(rq, p, &rf); /* /* * If it changed from the expected state, bai * If it changed from the expected state, bai */ */ if (unlikely(!ncsw)) if (unlikely(!ncsw)) break; break; /* /* * Was it really running after all now that w * Was it really running after all now that w * checked with the proper locks actually hel * checked with the proper locks actually hel * * * Oops. Go back and try again.. * Oops. Go back and try again.. 
*/ */ if (unlikely(running)) { if (unlikely(running)) { cpu_relax(); cpu_relax(); continue; continue; } } /* /* * It's not enough that it's not actively run * It's not enough that it's not actively run * it must be off the runqueue _entirely_, an * it must be off the runqueue _entirely_, an * preempted! * preempted! * * * So if it was still runnable (but just not * So if it was still runnable (but just not * running right now), it's preempted, and we * running right now), it's preempted, and we * yield - it could be a while. * yield - it could be a while. */ */ if (unlikely(queued)) { if (unlikely(queued)) { ktime_t to = NSEC_PER_SEC / HZ; ktime_t to = NSEC_PER_SEC / HZ; set_current_state(TASK_UNINTERRUPTIBL set_current_state(TASK_UNINTERRUPTIBL schedule_hrtimeout(&to, HRTIMER_MODE_ schedule_hrtimeout(&to, HRTIMER_MODE_ continue; continue; } } /* /* * Ahh, all good. It wasn't running, and it w * Ahh, all good. It wasn't running, and it w * runnable, which means that it will never b * runnable, which means that it will never b * running in the future either. We're all do * running in the future either. We're all do */ */ break; break; } } return ncsw; return ncsw; } } #ifdef CONFIG_SMP #ifdef CONFIG_SMP static void static void __do_set_cpus_allowed(struct task_struct *p, struct affinity_ __do_set_cpus_allowed(struct task_struct *p, struct affinity_ static int __set_cpus_allowed_ptr(struct task_struct *p, static int __set_cpus_allowed_ptr(struct task_struct *p, struct affinity_context *ct struct affinity_context *ct static void migrate_disable_switch(struct rq *rq, struct task static void migrate_disable_switch(struct rq *rq, struct task { { struct affinity_context ac = { struct affinity_context ac = { .new_mask = cpumask_of(rq->cpu), .new_mask = cpumask_of(rq->cpu), .flags = SCA_MIGRATE_DISABLE, .flags = SCA_MIGRATE_DISABLE, }; }; if (likely(!p->migration_disabled)) if (likely(!p->migration_disabled)) return; return; if (p->cpus_ptr != &p->cpus_mask) if (p->cpus_ptr != &p->cpus_mask) return; return; /* /* * Violates locking rules! see comment in __do_set_cp * Violates locking rules! 
see comment in __do_set_cp */ */ __do_set_cpus_allowed(p, &ac); __do_set_cpus_allowed(p, &ac); } } void migrate_disable(void) void migrate_disable(void) { { struct task_struct *p = current; struct task_struct *p = current; if (p->migration_disabled) { if (p->migration_disabled) { p->migration_disabled++; p->migration_disabled++; return; return; } } preempt_disable(); preempt_disable(); this_rq()->nr_pinned++; this_rq()->nr_pinned++; p->migration_disabled = 1; p->migration_disabled = 1; preempt_enable(); preempt_enable(); } } EXPORT_SYMBOL_GPL(migrate_disable); EXPORT_SYMBOL_GPL(migrate_disable); void migrate_enable(void) void migrate_enable(void) { { struct task_struct *p = current; struct task_struct *p = current; struct affinity_context ac = { struct affinity_context ac = { .new_mask = &p->cpus_mask, .new_mask = &p->cpus_mask, .flags = SCA_MIGRATE_ENABLE, .flags = SCA_MIGRATE_ENABLE, }; }; if (p->migration_disabled > 1) { if (p->migration_disabled > 1) { p->migration_disabled--; p->migration_disabled--; return; return; } } if (WARN_ON_ONCE(!p->migration_disabled)) if (WARN_ON_ONCE(!p->migration_disabled)) return; return; /* /* * Ensure stop_task runs either before or after this, * Ensure stop_task runs either before or after this, * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't */ */ preempt_disable(); preempt_disable(); if (p->cpus_ptr != &p->cpus_mask) if (p->cpus_ptr != &p->cpus_mask) __set_cpus_allowed_ptr(p, &ac); __set_cpus_allowed_ptr(p, &ac); /* /* * Mustn't clear migration_disabled() until cpus_ptr * Mustn't clear migration_disabled() until cpus_ptr * regular cpus_mask, otherwise things that race (eg. * regular cpus_mask, otherwise things that race (eg. * select_fallback_rq) get confused. * select_fallback_rq) get confused. */ */ barrier(); barrier(); p->migration_disabled = 0; p->migration_disabled = 0; this_rq()->nr_pinned--; this_rq()->nr_pinned--; preempt_enable(); preempt_enable(); } } EXPORT_SYMBOL_GPL(migrate_enable); EXPORT_SYMBOL_GPL(migrate_enable); static inline bool rq_has_pinned_tasks(struct rq *rq) static inline bool rq_has_pinned_tasks(struct rq *rq) { { return rq->nr_pinned; return rq->nr_pinned; } } /* /* * Per-CPU kthreads are allowed to run on !active && online C * Per-CPU kthreads are allowed to run on !active && online C * __set_cpus_allowed_ptr() and select_fallback_rq(). * __set_cpus_allowed_ptr() and select_fallback_rq(). */ */ static inline bool is_cpu_allowed(struct task_struct *p, int static inline bool is_cpu_allowed(struct task_struct *p, int { { /* When not in the task's cpumask, no point in lookin /* When not in the task's cpumask, no point in lookin if (!cpumask_test_cpu(cpu, p->cpus_ptr)) if (!cpumask_test_cpu(cpu, p->cpus_ptr)) return false; return false; /* migrate_disabled() must be allowed to finish. */ /* migrate_disabled() must be allowed to finish. */ if (is_migration_disabled(p)) if (is_migration_disabled(p)) return cpu_online(cpu); return cpu_online(cpu); /* Non kernel threads are not allowed during either o /* Non kernel threads are not allowed during either o if (!(p->flags & PF_KTHREAD)) if (!(p->flags & PF_KTHREAD)) return cpu_active(cpu) && task_cpu_possible(c return cpu_active(cpu) && task_cpu_possible(c /* KTHREAD_IS_PER_CPU is always allowed. */ /* KTHREAD_IS_PER_CPU is always allowed. 
*/ if (kthread_is_per_cpu(p)) if (kthread_is_per_cpu(p)) return cpu_online(cpu); return cpu_online(cpu); /* Regular kernel threads don't get to stay during of /* Regular kernel threads don't get to stay during of if (cpu_dying(cpu)) if (cpu_dying(cpu)) return false; return false; /* But are allowed during online. */ /* But are allowed during online. */ return cpu_online(cpu); return cpu_online(cpu); } } /* /* * This is how migration works: * This is how migration works: * * * 1) we invoke migration_cpu_stop() on the target CPU using * 1) we invoke migration_cpu_stop() on the target CPU using * stop_one_cpu(). * stop_one_cpu(). * 2) stopper starts to run (implicitly forcing the migrated * 2) stopper starts to run (implicitly forcing the migrated * off the CPU) * off the CPU) * 3) it checks whether the migrated task is still in the wro * 3) it checks whether the migrated task is still in the wro * 4) if it's in the wrong runqueue then the migration thread * 4) if it's in the wrong runqueue then the migration thread * it and puts it into the right queue. * it and puts it into the right queue. * 5) stopper completes and stop_one_cpu() returns and the mi * 5) stopper completes and stop_one_cpu() returns and the mi * is done. * is done. */ */ /* /* * move_queued_task - move a queued task to new rq. * move_queued_task - move a queued task to new rq. * * * Returns (locked) new rq. Old rq's lock is released. * Returns (locked) new rq. Old rq's lock is released. */ */ static struct rq *move_queued_task(struct rq *rq, struct rq_f static struct rq *move_queued_task(struct rq *rq, struct rq_f struct task_struct *p, int struct task_struct *p, int { { lockdep_assert_rq_held(rq); lockdep_assert_rq_held(rq); deactivate_task(rq, p, DEQUEUE_NOCLOCK); deactivate_task(rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, new_cpu); set_task_cpu(p, new_cpu); rq_unlock(rq, rf); rq_unlock(rq, rf); rq = cpu_rq(new_cpu); rq = cpu_rq(new_cpu); rq_lock(rq, rf); rq_lock(rq, rf); WARN_ON_ONCE(task_cpu(p) != new_cpu); WARN_ON_ONCE(task_cpu(p) != new_cpu); activate_task(rq, p, 0); activate_task(rq, p, 0); check_preempt_curr(rq, p, 0); check_preempt_curr(rq, p, 0); return rq; return rq; } } struct migration_arg { struct migration_arg { struct task_struct *task; struct task_struct *task; int dest_cpu; int dest_cpu; struct set_affinity_pending *pending; struct set_affinity_pending *pending; }; }; /* /* * @refs: number of wait_for_completion() * @refs: number of wait_for_completion() * @stop_pending: is @stop_work in use * @stop_pending: is @stop_work in use */ */ struct set_affinity_pending { struct set_affinity_pending { refcount_t refs; refcount_t refs; unsigned int stop_pending; unsigned int stop_pending; struct completion done; struct completion done; struct cpu_stop_work stop_work; struct cpu_stop_work stop_work; struct migration_arg arg; struct migration_arg arg; }; }; /* /* * Move (not current) task off this CPU, onto the destination * Move (not current) task off this CPU, onto the destination * this because either it can't run here any more (set_cpus_a * this because either it can't run here any more (set_cpus_a * away from this CPU, or CPU going down), or because we're * away from this CPU, or CPU going down), or because we're * attempting to rebalance this task on exec (sched_exec). * attempting to rebalance this task on exec (sched_exec). * * * So we race with normal scheduler movements, but that's OK, * So we race with normal scheduler movements, but that's OK, * as the task is no longer on this CPU. 
/*
 * migration_cpu_stop - this will be executed by a highprio stopper thread
 * and performs thread migration by bumping thread off CPU then
 * 'pushing' onto another runqueue.
 */
static int migration_cpu_stop(void *data)
{
	struct migration_arg *arg = data;
	struct set_affinity_pending *pending = arg->pending;
	struct task_struct *p = arg->task;
	struct rq *rq = this_rq();
	bool complete = false;
	struct rq_flags rf;

	/*
	 * The original target CPU might have gone down and we might
	 * be on another CPU but it doesn't matter.
	 */
	local_irq_save(rf.flags);
	/*
	 * We need to explicitly wake pending tasks before running
	 * __migrate_task() such that we will not miss enforcing cpus_ptr
	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
	 */
	flush_smp_call_function_queue();

	raw_spin_lock(&p->pi_lock);
	rq_lock(rq, &rf);

	/*
	 * If we were passed a pending, then ->stop_pending was set, thus
	 * p->migration_pending must have remained stable.
	 */
	WARN_ON_ONCE(pending && pending != p->migration_pending);

	/*
	 * If task_rq(p) != rq, it cannot be migrated here, because we're
	 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
	 * we're holding p->pi_lock.
	 */
	if (task_rq(p) == rq) {
		if (is_migration_disabled(p))
			goto out;

		if (pending) {
			p->migration_pending = NULL;
			complete = true;

			if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask))
				goto out;
		}

		if (task_on_rq_queued(p)) {
			update_rq_clock(rq);
			rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
		} else {
			p->wake_cpu = arg->dest_cpu;
		}

		/*
		 * XXX __migrate_task() can fail, at which point we might end
		 * up running on a dodgy CPU, AFAICT this can only happen
		 * during CPU hotplug, at which point we'll get pushed out
		 * anyway, so it's probably not a big deal.
		 */

	} else if (pending) {
		/*
		 * This happens when we get migrated between migrate_enable()'s
		 * preempt_enable() and scheduling the stopper task. At that
		 * point we're a regular task again and not current anymore.
		 *
		 * A !PREEMPT kernel has a giant hole here, which makes it far
		 * more likely.
		 */

		/*
		 * The task moved before the stopper got to run. We're holding
		 * ->pi_lock, so the allowed mask is stable - if it got
		 * somewhere allowed, we're done.
		 */
		if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
			p->migration_pending = NULL;
			complete = true;
			goto out;
		}

		/*
		 * When migrate_enable() hits a rq mis-match we can't reliably
		 * determine is_migration_disabled() and so have to chase after
		 * it.
		 */
		WARN_ON_ONCE(!pending->stop_pending);
		preempt_disable();
		task_rq_unlock(rq, p, &rf);
		stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
				    &pending->arg, &pending->stop_work);
		preempt_enable();
		return 0;
	}
out:
	if (pending)
		pending->stop_pending = false;
	task_rq_unlock(rq, p, &rf);

	if (complete)
		complete_all(&pending->done);

	return 0;
}
int push_cpu_stop(void *arg)
{
	struct rq *lowest_rq = NULL, *rq = this_rq();
	struct task_struct *p = arg;

	raw_spin_lock_irq(&p->pi_lock);
	raw_spin_rq_lock(rq);

	if (task_rq(p) != rq)
		goto out_unlock;

	if (is_migration_disabled(p)) {
		p->migration_flags |= MDF_PUSH;
		goto out_unlock;
	}

	p->migration_flags &= ~MDF_PUSH;

	if (p->sched_class->find_lock_rq)
		lowest_rq = p->sched_class->find_lock_rq(p, rq);

	if (!lowest_rq)
		goto out_unlock;

	// XXX validate p is still the highest prio task
	if (task_rq(p) == rq) {
		deactivate_task(rq, p, 0);
		set_task_cpu(p, lowest_rq->cpu);
		activate_task(lowest_rq, p, 0);
		resched_curr(lowest_rq);
	}

	double_unlock_balance(rq, lowest_rq);

out_unlock:
	rq->push_busy = false;
	raw_spin_rq_unlock(rq);
	raw_spin_unlock_irq(&p->pi_lock);

	put_task_struct(p);
	return 0;
}
/*
 * sched_class::set_cpus_allowed must do the below, but is not required to
 * actually call this function.
 */
void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx)
{
	if (ctx->flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
		p->cpus_ptr = ctx->new_mask;
		return;
	}

	cpumask_copy(&p->cpus_mask, ctx->new_mask);
	p->nr_cpus_allowed = cpumask_weight(ctx->new_mask);

	/*
	 * Swap in a new user_cpus_ptr if SCA_USER flag set
	 */
	if (ctx->flags & SCA_USER)
		swap(p->user_cpus_ptr, ctx->user_mask);
}
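/*
 * Illustrative sketch (not part of core.c): a scheduling class with no
 * special affinity bookkeeping can simply delegate its ->set_cpus_allowed()
 * hook to set_cpus_allowed_common(), as the comment above describes. The
 * hook name below is hypothetical.
 */
#if 0
static void example_set_cpus_allowed(struct task_struct *p,
				     struct affinity_context *ctx)
{
	/* Keep p->cpus_mask / p->cpus_ptr / p->nr_cpus_allowed in sync. */
	set_cpus_allowed_common(p, ctx);
}
/* ...and in the class definition: .set_cpus_allowed = example_set_cpus_allowed, */
#endif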
static void
__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
{
	struct rq *rq = task_rq(p);
	bool queued, running;

	/*
	 * This here violates the locking rules for affinity, since we're only
	 * supposed to change these variables while holding both rq->lock and
	 * p->pi_lock.
	 *
	 * HOWEVER, it magically works, because ttwu() is the only code that
	 * accesses these variables under p->pi_lock and only does so after
	 * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
	 * before finish_task().
	 *
	 * XXX do further audits, this smells like something putrid.
	 */
	if (ctx->flags & SCA_MIGRATE_DISABLE)
		SCHED_WARN_ON(!p->on_cpu);
	else
		lockdep_assert_held(&p->pi_lock);

	queued = task_on_rq_queued(p);
	running = task_current(rq, p);

	if (queued) {
		/*
		 * Because __kthread_bind() calls this on blocked tasks without
		 * holding rq->lock.
		 */
		lockdep_assert_rq_held(rq);
		dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
	}
	if (running)
		put_prev_task(rq, p);

	p->sched_class->set_cpus_allowed(p, ctx);

	if (queued)
		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
	if (running)
		set_next_task(rq, p);
}
/*
 * Used for kthread_bind() and select_fallback_rq(), in both cases the user
 * affinity (if any) should be destroyed too.
 */
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
	struct affinity_context ac = {
		.new_mask  = new_mask,
		.user_mask = NULL,
		.flags     = SCA_USER,	/* clear the user requested mask */
	};
	union cpumask_rcuhead {
		cpumask_t cpumask;
		struct rcu_head rcu;
	};

	__do_set_cpus_allowed(p, &ac);

	/*
	 * Because this is called with p->pi_lock held, it is not possible
	 * to use kfree() here (when PREEMPT_RT=y), therefore punt to using
	 * kfree_rcu().
	 */
	kfree_rcu((union cpumask_rcuhead *)ac.user_mask, rcu);
}

static cpumask_t *alloc_user_cpus_ptr(int node)
{
	/*
	 * See do_set_cpus_allowed() above for the rcu_head usage.
	 */
	int size = max_t(int, cpumask_size(), sizeof(struct rcu_head));

	return kmalloc_node(size, GFP_KERNEL, node);
}

int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
		      int node)
{
	cpumask_t *user_mask;
	unsigned long flags;

	/*
	 * Always clear dst->user_cpus_ptr first as their user_cpus_ptr's
	 * may differ by now due to racing.
	 */
	dst->user_cpus_ptr = NULL;

	/*
	 * This check is racy and losing the race is a valid situation.
	 * It is not worth the extra overhead of taking the pi_lock on
	 * every fork/clone.
	 */
	if (data_race(!src->user_cpus_ptr))
		return 0;

	user_mask = alloc_user_cpus_ptr(node);
	if (!user_mask)
		return -ENOMEM;

	/*
	 * Use pi_lock to protect content of user_cpus_ptr
	 *
	 * Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent
	 * do_set_cpus_allowed().
	 */
	raw_spin_lock_irqsave(&src->pi_lock, flags);
	if (src->user_cpus_ptr) {
		swap(dst->user_cpus_ptr, user_mask);
		cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
	}
	raw_spin_unlock_irqrestore(&src->pi_lock, flags);

	if (unlikely(user_mask))
		kfree(user_mask);

	return 0;
}

static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
{
	struct cpumask *user_mask = NULL;

	swap(p->user_cpus_ptr, user_mask);

	return user_mask;
}

void release_user_cpus_ptr(struct task_struct *p)
{
	kfree(clear_user_cpus_ptr(p));
}
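/*
 * Illustrative sketch (not part of core.c): kthread_bind() is the usual way
 * the "destroy the user affinity" semantics above are reached, by binding a
 * freshly created kthread to one CPU before its first wakeup. Thread
 * function and names are hypothetical.
 */
#if 0
#include <linux/kthread.h>
#include <linux/err.h>
#include <linux/sched.h>

static int example_thread_fn(void *data)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}

static struct task_struct *example_start_pinned(int cpu)
{
	struct task_struct *tsk;

	tsk = kthread_create(example_thread_fn, NULL, "example/%d", cpu);
	if (IS_ERR(tsk))
		return tsk;

	kthread_bind(tsk, cpu);		/* ends up in __do_set_cpus_allowed() */
	wake_up_process(tsk);
	return tsk;
}
#endif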
/*
 * This function is wildly self concurrent; here be dragons.
 *
 *
 * When given a valid mask, __set_cpus_allowed_ptr() must block until the
 * designated task is enqueued on an allowed CPU. If that task is currently
 * running, we have to kick it out using the CPU stopper.
 *
 * Migrate-Disable comes along and tramples all over our nice sandcastle.
 * Consider:
 *
 *     Initial conditions: P0->cpus_mask = [0, 1]
 *
 *     P0@CPU0            P1
 *
 *     migrate_disable();
 *     <preempted>
 *                        set_cpus_allowed_ptr(P0, [1]);
 *
 * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes
 * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region).
 * This means we need the following scheme:
 *
 *     P0@CPU0            P1
 *
 *     migrate_disable();
 *     <preempted>
 *                        set_cpus_allowed_ptr(P0, [1]);
 *                          <blocks>
 *     <resumes>
 *     migrate_enable();
 *       __set_cpus_allowed_ptr();
 *         <wakes local stopper>
 *                        `--> <woken on migration completion>
 *
 * Concurrent set_cpus_allowed_ptr() calls targeting a given
 * task p are serialized by p->pi_lock, which we can leverage: the mask that
 * should come into effect at the end of the Migrate-Disable region is the last
 * one. This means we only need to track a single cpumask (i.e. p->cpus_mask),
 * but we still need to properly signal those waiting tasks at the right
 * moment.
 *
 * This is implemented using struct set_affinity_pending. The first
 * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will
 * setup an instance of that struct and install it on the targeted task_struct.
 * Any and all further callers will reuse that instance. Those then wait for
 * a completion signaled at the tail of the CPU stopper callback (1), triggered
 * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()).
 *
 *
 * (1) In the cases covered above. There is one more where the completion is
 * signaled within affine_move_task() itself: when a subsequent affinity request
 * occurs after the stopper bailed out due to the targeted task still being
 * Migrate-Disable. Consider:
 *
 *     Initial conditions: P0->cpus_mask = [0, 1]
 *
 *     CPU0               P1                 P2
 *     <P0>
 *       migrate_disable();
 *       <preempted>
 *                        set_cpus_allowed_ptr(P0, [1]);
 *                          <blocks>
 *     <migration/0>
 *       migration_cpu_stop()
 *         is_migration_disabled()
 *           <bails>
 *                                           set_cpus_allowed_ptr(P0, [0, 1]);
 *                                             <signal completion>
 *                          <awakes>
 *
 * Note that the above is safe vs a concurrent migrate_enable(), as any
 * pending affinity completion is preceded by an uninstallation of
 * p->migration_pending done with p->pi_lock held.
 */
static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf,
			    int dest_cpu, unsigned int flags)
	__releases(rq->lock)
	__releases(p->pi_lock)
{
	struct set_affinity_pending my_pending = { }, *pending = NULL;
	bool stop_pending, complete = false;

	/* Can the task run on the task's current CPU? If so, we're done */
	if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
		struct task_struct *push_task = NULL;

		if ((flags & SCA_MIGRATE_ENABLE) &&
		    (p->migration_flags & MDF_PUSH) && !rq->push_busy) {
			rq->push_busy = true;
			push_task = get_task_struct(p);
		}

		/*
		 * If there are pending waiters, but no pending stop_work,
		 * then complete now.
		 */
		pending = p->migration_pending;
		if (pending && !pending->stop_pending) {
			p->migration_pending = NULL;
			complete = true;
		}

		preempt_disable();
		task_rq_unlock(rq, p, rf);
		if (push_task) {
			stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
					    p, &rq->push_work);
		}
		preempt_enable();

		if (complete)
			complete_all(&pending->done);

		return 0;
	}

	if (!(flags & SCA_MIGRATE_ENABLE)) {
		/* serialized by p->pi_lock */
		if (!p->migration_pending) {
			/* Install the request */
			refcount_set(&my_pending.refs, 1);
			init_completion(&my_pending.done);
			my_pending.arg = (struct migration_arg) {
				.task = p,
				.dest_cpu = dest_cpu,
				.pending = &my_pending,
			};

			p->migration_pending = &my_pending;
		} else {
			pending = p->migration_pending;
			refcount_inc(&pending->refs);
			/*
			 * Affinity has changed, but we've already installed a
			 * pending. migration_cpu_stop() *must* see this, else
			 * we risk a completion of the pending despite having a
			 * task on a disallowed CPU.
			 *
			 * Serialized by p->pi_lock, so this is safe.
			 */
			pending->arg.dest_cpu = dest_cpu;
		}
	}
	pending = p->migration_pending;
	/*
	 * - !MIGRATE_ENABLE:
	 *   we'll have installed a pending if there wasn't one already.
	 *
	 * - MIGRATE_ENABLE:
	 *   we're here because the current CPU isn't matching anymore,
	 *   the only way that can happen is because of a concurrent
	 *   set_cpus_allowed_ptr() call, which should then still be
	 *   pending completion.
	 *
	 * Either way, we really should have a @pending here.
	 */
	if (WARN_ON_ONCE(!pending)) {
		task_rq_unlock(rq, p, rf);
		return -EINVAL;
	}

	if (task_on_cpu(rq, p) || READ_ONCE(p->__state) == TASK_WAKING) {
		/*
		 * MIGRATE_ENABLE gets here because 'p == current', but for
		 * anything else we cannot do is_migration_disabled(), punt
		 * and have the stopper function handle it all race-free.
		 */
		stop_pending = pending->stop_pending;
		if (!stop_pending)
			pending->stop_pending = true;

		if (flags & SCA_MIGRATE_ENABLE)
			p->migration_flags &= ~MDF_PUSH;

		preempt_disable();
		task_rq_unlock(rq, p, rf);
		if (!stop_pending) {
			stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop,
					    &pending->arg, &pending->stop_work);
		}
		preempt_enable();

		if (flags & SCA_MIGRATE_ENABLE)
			return 0;
	} else {

		if (!is_migration_disabled(p)) {
			if (task_on_rq_queued(p))
				rq = move_queued_task(rq, rf, p, dest_cpu);

			if (!pending->stop_pending) {
				p->migration_pending = NULL;
				complete = true;
			}
		}
		task_rq_unlock(rq, p, rf);

		if (complete)
			complete_all(&pending->done);
	}

	wait_for_completion(&pending->done);

	if (refcount_dec_and_test(&pending->refs))
		wake_up_var(&pending->refs); /* No UaF, just an address */

	/*
	 * Block the original owner of &pending until all subsequent callers
	 * have seen the completion and decremented the refcount
	 */
	wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs));

	/* ARGH */
	WARN_ON_ONCE(my_pending.stop_pending);

	return 0;
}
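/*
 * Illustrative sketch (not part of core.c): the blocking behaviour the
 * comment above guarantees. If @target currently sits in a migrate-disabled
 * region, the affinity change only returns once that region ends. The
 * helper below is hypothetical.
 */
#if 0
#include <linux/cpumask.h>

static int example_move_to_cpu1(struct task_struct *target)
{
	/*
	 * Returns only after @target is enqueued on an allowed CPU, i.e. not
	 * before its outermost migrate_enable() if it is migration-disabled.
	 */
	return set_cpus_allowed_ptr(target, cpumask_of(1));
}
#endif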
/*
 * Called with both p->pi_lock and rq->lock held; drops both before returning.
 */
static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
					 struct affinity_context *ctx,
					 struct rq *rq,
					 struct rq_flags *rf)
	__releases(rq->lock)
	__releases(p->pi_lock)
{
	const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
	const struct cpumask *cpu_valid_mask = cpu_active_mask;
	bool kthread = p->flags & PF_KTHREAD;
	unsigned int dest_cpu;
	int ret = 0;

	update_rq_clock(rq);

	if (kthread || is_migration_disabled(p)) {
		/*
		 * Kernel threads are allowed on online && !active CPUs,
		 * however, during cpu-hot-unplug, even these might get pushed
		 * away if not KTHREAD_IS_PER_CPU.
		 *
		 * Specifically, migration_disabled() tasks must not fail the
		 * cpumask_any_and_distribute() pick below, esp. so on
		 * SCA_MIGRATE_ENABLE, otherwise we'll not call
		 * set_cpus_allowed_common() and actually reset p->cpus_ptr.
		 */
		cpu_valid_mask = cpu_online_mask;
	}

	if (!kthread && !cpumask_subset(ctx->new_mask, cpu_allowed_mask)) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * Must re-check here, to close a race against __kthread_bind(),
	 * sched_setaffinity() is not guaranteed to observe the flag.
	 */
	if ((ctx->flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) {
		ret = -EINVAL;
		goto out;
	}

	if (!(ctx->flags & SCA_MIGRATE_ENABLE)) {
		if (cpumask_equal(&p->cpus_mask, ctx->new_mask)) {
			if (ctx->flags & SCA_USER)
				swap(p->user_cpus_ptr, ctx->user_mask);
			goto out;
		}

		if (WARN_ON_ONCE(p == current &&
				 is_migration_disabled(p) &&
				 !cpumask_test_cpu(task_cpu(p), ctx->new_mask))) {
			ret = -EBUSY;
			goto out;
		}
	}

	/*
	 * Picking a ~random cpu helps in cases where we are changing affinity
	 * for groups of tasks (ie. cpuset), so that load balancing is not
	 * immediately required to distribute the tasks within their new mask.
	 */
	dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, ctx->new_mask);
	if (dest_cpu >= nr_cpu_ids) {
		ret = -EINVAL;
		goto out;
	}

	__do_set_cpus_allowed(p, ctx);

	return affine_move_task(rq, p, rf, dest_cpu, ctx->flags);

out:
	task_rq_unlock(rq, p, rf);

	return ret;
}
/*
 * Change a given task's CPU affinity. Migrate the thread to a
 * proper CPU and schedule it away if the CPU it's executing on
 * is removed from the allowed bitmask.
 *
 * NOTE: the caller must have a valid reference to the task, the
 * task must not exit() & deallocate itself prematurely. The
 * call is not atomic; no spinlocks may be held.
 */
static int __set_cpus_allowed_ptr(struct task_struct *p,
				  struct affinity_context *ctx)
{
	struct rq_flags rf;
	struct rq *rq;

	rq = task_rq_lock(p, &rf);
	/*
	 * Masking should be skipped if SCA_USER or any of the SCA_MIGRATE_*
	 * flags are set.
	 */
	if (p->user_cpus_ptr &&
	    !(ctx->flags & (SCA_USER | SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) &&
	    cpumask_and(rq->scratch_mask, ctx->new_mask, p->user_cpus_ptr))
		ctx->new_mask = rq->scratch_mask;

	return __set_cpus_allowed_ptr_locked(p, ctx, rq, &rf);
}

int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
	struct affinity_context ac = {
		.new_mask  = new_mask,
		.flags     = 0,
	};

	return __set_cpus_allowed_ptr(p, &ac);
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
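/*
 * Illustrative sketch (not part of core.c): typical in-kernel use of
 * set_cpus_allowed_ptr() to confine an existing kthread to the CPUs of one
 * NUMA node. The helper below is hypothetical.
 */
#if 0
#include <linux/cpumask.h>
#include <linux/topology.h>

static int example_confine_to_node(struct task_struct *tsk, int node)
{
	/* May block until @tsk actually runs inside the new mask. */
	return set_cpus_allowed_ptr(tsk, cpumask_of_node(node));
}
#endif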
/*
 * Change a given task's CPU affinity to the intersection of its current
 * affinity mask and @subset_mask, writing the resulting mask to @new_mask.
 * If user_cpus_ptr is defined, use it as the basis for restricting CPU
 * affinity or use cpu_online_mask instead.
 *
 * If the resulting mask is empty, leave the affinity unchanged and return
 * -EINVAL.
 */
static int restrict_cpus_allowed_ptr(struct task_struct *p,
				     struct cpumask *new_mask,
				     const struct cpumask *subset_mask)
{
	struct affinity_context ac = {
		.new_mask  = new_mask,
		.flags     = 0,
	};
	struct rq_flags rf;
	struct rq *rq;
	int err;

	rq = task_rq_lock(p, &rf);

	/*
	 * Forcefully restricting the affinity of a deadline task is
	 * likely to cause problems, so fail and noisily override the
	 * mask entirely.
	 */
	if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
		err = -EPERM;
		goto err_unlock;
	}

	if (!cpumask_and(new_mask, task_user_cpus(p), subset_mask)) {
		err = -EINVAL;
		goto err_unlock;
	}

	return __set_cpus_allowed_ptr_locked(p, &ac, rq, &rf);

err_unlock:
	task_rq_unlock(rq, p, &rf);
	return err;
}
/*
 * Restrict the CPU affinity of task @p so that it is a subset of
 * task_cpu_possible_mask() and point @p->user_cpus_ptr to a copy of the
 * old affinity mask. If the resulting mask is empty, we warn and walk
 * up the cpuset hierarchy until we find a suitable mask.
 */
void force_compatible_cpus_allowed_ptr(struct task_struct *p)
{
	cpumask_var_t new_mask;
	const struct cpumask *override_mask = task_cpu_possible_mask(p);

	alloc_cpumask_var(&new_mask, GFP_KERNEL);

	/*
	 * __migrate_task() can fail silently in the face of concurrent
	 * offlining of the chosen destination CPU, so take the hotplug
	 * lock to ensure that the migration succeeds.
	 */
	cpus_read_lock();
	if (!cpumask_available(new_mask))
		goto out_set_mask;

	if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
		goto out_free_mask;

	/*
	 * We failed to find a valid subset of the affinity mask for the
	 * task, so override it based on its cpuset hierarchy.
	 */
	cpuset_cpus_allowed(p, new_mask);
	override_mask = new_mask;

out_set_mask:
	if (printk_ratelimit()) {
		printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
				task_pid_nr(p), p->comm,
				cpumask_pr_args(override_mask));
	}

	WARN_ON(set_cpus_allowed_ptr(p, override_mask));
out_free_mask:
	cpus_read_unlock();
	free_cpumask_var(new_mask);
}

static int
__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);

/*
 * Restore the affinity of a task @p which was previously restricted by a
 * call to force_compatible_cpus_allowed_ptr().
 *
 * It is the caller's responsibility to serialise this with any calls to
 * force_compatible_cpus_allowed_ptr(@p).
 */
void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
{
	struct affinity_context ac = {
		.new_mask  = task_user_cpus(p),
		.flags     = 0,
	};
	int ret;

	/*
	 * Try to restore the old affinity mask with __sched_setaffinity().
	 * Cpuset masking will be done there too.
	 */
	ret = __sched_setaffinity(p, &ac);
	WARN_ON_ONCE(ret);
}
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
	unsigned int state = READ_ONCE(p->__state);

	/*
	 * We should never call set_task_cpu() on a blocked task,
	 * ttwu() will sort out the placement.
	 */
	WARN_ON_ONCE(state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq);

	/*
	 * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
	 * because schedstat_wait_{start,end} rebase migrating task's wait_start
	 * time relying on p->on_rq.
	 */
	WARN_ON_ONCE(state == TASK_RUNNING &&
		     p->sched_class == &fair_sched_class &&
		     (p->on_rq && !task_on_rq_migrating(p)));

#ifdef CONFIG_LOCKDEP
	/*
	 * The caller should hold either p->pi_lock or rq->lock, when changing
	 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
	 *
	 * sched_move_task() holds both and thus holding either pins the cgroup,
	 * see task_group().
	 *
	 * Furthermore, all task_rq users should acquire both locks, see
	 * task_rq_lock().
	 */
	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
				      lockdep_is_held(__rq_lockp(task_rq(p)))));
#endif
	/*
	 * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
	 */
	WARN_ON_ONCE(!cpu_online(new_cpu));

	WARN_ON_ONCE(is_migration_disabled(p));
#endif

	trace_sched_migrate_task(p, new_cpu);

	if (task_cpu(p) != new_cpu) {
		if (p->sched_class->migrate_task_rq)
			p->sched_class->migrate_task_rq(p, new_cpu);
		p->se.nr_migrations++;
		rseq_migrate(p);
		sched_mm_cid_migrate_from(p);
		perf_event_task_migrate(p);
	}

	__set_task_cpu(p, new_cpu);
}
#ifdef CONFIG_NUMA_BALANCING
static void __migrate_swap_task(struct task_struct *p, int cpu)
{
	if (task_on_rq_queued(p)) {
		struct rq *src_rq, *dst_rq;
		struct rq_flags srf, drf;

		src_rq = task_rq(p);
		dst_rq = cpu_rq(cpu);

		rq_pin_lock(src_rq, &srf);
		rq_pin_lock(dst_rq, &drf);

		deactivate_task(src_rq, p, 0);
		set_task_cpu(p, cpu);
		activate_task(dst_rq, p, 0);
		check_preempt_curr(dst_rq, p, 0);

		rq_unpin_lock(dst_rq, &drf);
		rq_unpin_lock(src_rq, &srf);

	} else {
		/*
		 * Task isn't running anymore; make it appear like we migrated
		 * it before it went to sleep. This means on wakeup we make the
		 * previous CPU our target instead of where it really is.
		 */
		p->wake_cpu = cpu;
	}
}

struct migration_swap_arg {
	struct task_struct *src_task, *dst_task;
	int src_cpu, dst_cpu;
};

static int migrate_swap_stop(void *data)
{
	struct migration_swap_arg *arg = data;
	struct rq *src_rq, *dst_rq;

	if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
		return -EAGAIN;

	src_rq = cpu_rq(arg->src_cpu);
	dst_rq = cpu_rq(arg->dst_cpu);

	guard(double_raw_spinlock)(&arg->src_task->pi_lock, &arg->dst_task->pi_lock);
	guard(double_rq_lock)(src_rq, dst_rq);

	if (task_cpu(arg->dst_task) != arg->dst_cpu)
		return -EAGAIN;

	if (task_cpu(arg->src_task) != arg->src_cpu)
		return -EAGAIN;

	if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
		return -EAGAIN;

	if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
		return -EAGAIN;

	__migrate_swap_task(arg->src_task, arg->dst_cpu);
	__migrate_swap_task(arg->dst_task, arg->src_cpu);

	return 0;
}

/*
 * Cross migrate two tasks
 */
int migrate_swap(struct task_struct *cur, struct task_struct *p,
		 int target_cpu, int curr_cpu)
{
	struct migration_swap_arg arg;
	int ret = -EINVAL;

	arg = (struct migration_swap_arg){
		.src_task = cur,
		.src_cpu = curr_cpu,
		.dst_task = p,
		.dst_cpu = target_cpu,
	};

	if (arg.src_cpu == arg.dst_cpu)
		goto out;

	/*
	 * These three tests are all lockless; this is OK since all of them
	 * will be re-checked with proper locks held further down the line.
	 */
	if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
		goto out;

	if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
		goto out;

	if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
		goto out;

	trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
	ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);

out:
	return ret;
}
#endif /* CONFIG_NUMA_BALANCING */
/***
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
 * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel. If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
 * achieved as well.
 */
void kick_process(struct task_struct *p)
{
	int cpu;

	preempt_disable();
	cpu = task_cpu(p);
	if ((cpu != smp_processor_id()) && task_curr(p))
		smp_send_reschedule(cpu);
	preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);
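/*
 * Illustrative sketch (not part of core.c): kick_process() after poking a
 * remote task's state, mirroring what set_notify_resume() / signal delivery
 * do. The helper below is hypothetical; TIF_NOTIFY_RESUME is a real flag the
 * task checks on its way back to user space.
 */
#if 0
#include <linux/sched.h>
#include <linux/thread_info.h>

static void example_request_and_kick(struct task_struct *tsk)
{
	set_tsk_thread_flag(tsk, TIF_NOTIFY_RESUME);	/* leave a request for the task */
	kick_process(tsk);				/* IPI so a running task re-enters the kernel soon */
}
#endif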
/*
 * ->cpus_ptr is protected by both rq->lock and p->pi_lock
 *
 * A few notes on cpu_active vs cpu_online:
 *
 *  - cpu_active must be a subset of cpu_online
 *
 *  - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
 *    see __set_cpus_allowed_ptr(). At this point the newly online
 *    CPU isn't yet part of the sched domains, and balancing will not
 *    see it.
 *
 *  - on CPU-down we clear cpu_active() to mask the sched domains and
 *    avoid the load balancer to place new tasks on the to be removed
 *    CPU. Existing tasks will remain running there and will be taken
 *    off.
 *
 * This means that fallback selection must not select !active CPUs.
 * And can assume that any active CPU must be online. Conversely
 * select_task_rq() below may allow selection of !active CPUs in order
 * to satisfy the above rules.
 */
static int select_fallback_rq(int cpu, struct task_struct *p)
{
	int nid = cpu_to_node(cpu);
	const struct cpumask *nodemask = NULL;
	enum { cpuset, possible, fail } state = cpuset;
	int dest_cpu;

	/*
	 * If the node that the CPU is on has been offlined, cpu_to_node()
	 * will return -1. There is no CPU on the node, and we should
	 * select the CPU on the other node.
	 */
	if (nid != -1) {
		nodemask = cpumask_of_node(nid);

		/* Look for allowed, online CPU in same node. */
		for_each_cpu(dest_cpu, nodemask) {
			if (is_cpu_allowed(p, dest_cpu))
				return dest_cpu;
		}
	}

	for (;;) {
		/* Any allowed, online CPU? */
		for_each_cpu(dest_cpu, p->cpus_ptr) {
			if (!is_cpu_allowed(p, dest_cpu))
				continue;

			goto out;
		}

		/* No more Mr. Nice Guy. */
		switch (state) {
		case cpuset:
			if (cpuset_cpus_allowed_fallback(p)) {
				state = possible;
				break;
			}
			fallthrough;
		case possible:
			/*
			 * XXX When called from select_task_rq() we only
			 * hold p->pi_lock and again violate locking order.
			 *
			 * More yuck to audit.
			 */
			do_set_cpus_allowed(p, task_cpu_possible_mask(p));
			state = fail;
			break;
		case fail:
			BUG();
			break;
		}
	}

out:
	if (state != cpuset) {
		/*
		 * Don't tell them about moving exiting tasks or
		 * kernel threads (both mm NULL), since they never
		 * leave kernel.
		 */
		if (p->mm && printk_ratelimit()) {
			printk_deferred("process %d (%s) no longer affine to cpu%d\n",
					task_pid_nr(p), p->comm, cpu);
		}
	}

	return dest_cpu;
}

/*
 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
 */
static inline
int select_task_rq(struct task_struct *p, int cpu, int wake_flags)
{
	lockdep_assert_held(&p->pi_lock);

	if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p))
		cpu = p->sched_class->select_task_rq(p, cpu, wake_flags);
	else
		cpu = cpumask_any(p->cpus_ptr);

	/*
	 * In order not to call set_task_cpu() on a blocking task we need
	 * to rely on ttwu() to place the task on a valid ->cpus_ptr
	 * CPU.
	 *
	 * Since this is common to all placement strategies, this lives here.
	 *
	 * [ this allows ->select_task() to simply return task_cpu(p) and
	 *   not worry about this generic constraint ]
	 */
	if (unlikely(!is_cpu_allowed(p, cpu)))
		cpu = select_fallback_rq(task_cpu(p), p);

	return cpu;
}
void sched_set_stop_task(int cpu, struct task_struct *stop)
{
	static struct lock_class_key stop_pi_lock;
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
	struct task_struct *old_stop = cpu_rq(cpu)->stop;

	if (stop) {
		/*
		 * Make it appear like a SCHED_FIFO task, its something
		 * userspace knows about and won't get confused about.
		 *
		 * Also, it will make PI more or less work without too
		 * much confusion -- but then, stop work should not
		 * rely on PI working anyway.
		 */
		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);

		stop->sched_class = &stop_sched_class;

		/*
		 * The PI code calls rt_mutex_setprio() with ->pi_lock held to
		 * adjust the effective priority of a task. As a result,
		 * rt_mutex_setprio() can trigger (RT) balance callbacks,
		 * which can then trigger wakeups of the stop thread to push
		 * around the current task.
		 *
		 * The stop task itself will never be part of the PI-chain, it
		 * never blocks, therefore that ->pi_lock recursion is safe.
		 * Tell lockdep about this by placing the stop->pi_lock in its
		 * own class.
		 */
		lockdep_set_class(&stop->pi_lock, &stop_pi_lock);
	}

	cpu_rq(cpu)->stop = stop;

	if (old_stop) {
		/*
		 * Reset it back to a normal scheduling class so that
		 * it can die in pieces.
		 */
		old_stop->sched_class = &rt_sched_class;
	}
}
#else /* CONFIG_SMP */

static inline int __set_cpus_allowed_ptr(struct task_struct *p,
					 struct affinity_context *ctx)
{
	return set_cpus_allowed_ptr(p, ctx->new_mask);
}

static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { }

static inline bool rq_has_pinned_tasks(struct rq *rq)
{
	return false;
}

static inline cpumask_t *alloc_user_cpus_ptr(int node)
{
	return NULL;
}

#endif /* !CONFIG_SMP */
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
	struct rq *rq;

	if (!schedstat_enabled())
		return;

	rq = this_rq();

#ifdef CONFIG_SMP
	if (cpu == rq->cpu) {
		__schedstat_inc(rq->ttwu_local);
		__schedstat_inc(p->stats.nr_wakeups_local);
	} else {
		struct sched_domain *sd;

		__schedstat_inc(p->stats.nr_wakeups_remote);

		guard(rcu)();
		for_each_domain(rq->cpu, sd) {
			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
				__schedstat_inc(sd->ttwu_wake_remote);
				break;
			}
		}
	}

	if (wake_flags & WF_MIGRATED)
		__schedstat_inc(p->stats.nr_wakeups_migrate);
#endif /* CONFIG_SMP */

	__schedstat_inc(rq->ttwu_count);
	__schedstat_inc(p->stats.nr_wakeups);

	if (wake_flags & WF_SYNC)
		__schedstat_inc(p->stats.nr_wakeups_sync);
}
/*
 * Mark the task runnable.
 */
static inline void ttwu_do_wakeup(struct task_struct *p)
{
	WRITE_ONCE(p->__state, TASK_RUNNING);
	trace_sched_wakeup(p);
}

static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
		 struct rq_flags *rf)
{
	int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;

	lockdep_assert_rq_held(rq);

	if (p->sched_contributes_to_load)
		rq->nr_uninterruptible--;

#ifdef CONFIG_SMP
	if (wake_flags & WF_MIGRATED)
		en_flags |= ENQUEUE_MIGRATED;
	else
#endif
	if (p->in_iowait) {
		delayacct_blkio_end(p);
		atomic_dec(&task_rq(p)->nr_iowait);
	}

	activate_task(rq, p, en_flags);
	check_preempt_curr(rq, p, wake_flags);

	ttwu_do_wakeup(p);

#ifdef CONFIG_SMP
	if (p->sched_class->task_woken) {
		/*
		 * Our task @p is fully woken up and running; so it's safe to
		 * drop the rq->lock, hereafter rq is only used for statistics.
		 */
		rq_unpin_lock(rq, rf);
		p->sched_class->task_woken(rq, p);
		rq_repin_lock(rq, rf);
	}

	if (rq->idle_stamp) {
		u64 delta = rq_clock(rq) - rq->idle_stamp;
		u64 max = 2*rq->max_idle_balance_cost;

		update_avg(&rq->avg_idle, delta);

		if (rq->avg_idle > max)
			rq->avg_idle = max;

		rq->wake_stamp = jiffies;
		rq->wake_avg_idle = rq->avg_idle / 2;

		rq->idle_stamp = 0;
	}
#endif
}
/*
 * Consider @p being inside a wait loop:
 *
 *   for (;;) {
 *      set_current_state(TASK_UNINTERRUPTIBLE);
 *
 *      if (CONDITION)
 *         break;
 *
 *      schedule();
 *   }
 *   __set_current_state(TASK_RUNNING);
 *
 * between set_current_state() and schedule(). In this case @p is still
 * runnable, so all that needs doing is change p->state back to TASK_RUNNING in
 * an atomic manner.
 *
 * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq
 * then schedule() must still happen and p->state can be changed to
 * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
 * need to do a full wakeup with enqueue.
 *
 * Returns: %true when the wakeup is done,
 *          %false otherwise.
 */
static int ttwu_runnable(struct task_struct *p, int wake_flags)
{
	struct rq_flags rf;
	struct rq *rq;
	int ret = 0;

	rq = __task_rq_lock(p, &rf);
	if (task_on_rq_queued(p)) {
		if (!task_on_cpu(rq, p)) {
			/*
			 * When on_rq && !on_cpu the task is preempted, see if
			 * it should preempt the task that is current now.
			 */
			update_rq_clock(rq);
			check_preempt_curr(rq, p, wake_flags);
		}
		ttwu_do_wakeup(p);
		ret = 1;
	}
	__task_rq_unlock(rq, &rf);

	return ret;
}
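/*
 * Illustrative sketch (not part of core.c): the canonical wait loop the
 * comment above reasons about, together with the matching waker side. The
 * condition variable and functions are hypothetical.
 */
#if 0
#include <linux/sched.h>

static bool example_condition;
static struct task_struct *example_waiter;

static void example_wait(void)
{
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (READ_ONCE(example_condition))
			break;
		schedule();
	}
	__set_current_state(TASK_RUNNING);
}

static void example_wake(void)
{
	WRITE_ONCE(example_condition, true);
	wake_up_process(example_waiter);	/* may hit the ttwu_runnable() fast path above */
}
#endif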
#ifdef CONFIG_SMP
void sched_ttwu_pending(void *arg)
{
	struct llist_node *llist = arg;
	struct rq *rq = this_rq();
	struct task_struct *p, *t;
	struct rq_flags rf;

	if (!llist)
		return;

	rq_lock_irqsave(rq, &rf);
	update_rq_clock(rq);

	llist_for_each_entry_safe(p, t, llist, wake_entry.llist) {
		if (WARN_ON_ONCE(p->on_cpu))
			smp_cond_load_acquire(&p->on_cpu, !VAL);

		if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq)))
			set_task_cpu(p, cpu_of(rq));

		ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
	}

	/*
	 * Must be after enqueueing at least once task such that
	 * idle_cpu() does not observe a false-negative -- if it does,
	 * it is possible for select_idle_siblings() to stack a number
	 * of tasks on this CPU during that window.
	 *
	 * It is ok to clear ttwu_pending when another task pending.
	 * We will receive IPI after local irq enabled and then enqueue it.
	 * Since now nr_running > 0, idle_cpu() will always get correct result.
	 */
	WRITE_ONCE(rq->ttwu_pending, 0);
	rq_unlock_irqrestore(rq, &rf);
}

/*
 * Prepare the scene for sending an IPI for a remote smp_call
 *
 * Returns true if the caller can proceed with sending the IPI.
 * Returns false otherwise.
 */
bool call_function_single_prep_ipi(int cpu)
{
	if (set_nr_if_polling(cpu_rq(cpu)->idle)) {
		trace_sched_wake_idle_without_ipi(cpu);
		return false;
	}

	return true;
}
/*
 * Queue a task on the target CPUs wake_list and wake the CPU if
 * necessary. The wakee CPU on receipt of the IPI will queue the task
 * via sched_ttwu_pending() for activation so the wakee incurs the cost
 * of the wakeup instead of the waker.
 */
static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
	struct rq *rq = cpu_rq(cpu);

	p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);

	WRITE_ONCE(rq->ttwu_pending, 1);
	__smp_call_single_queue(cpu, &p->wake_entry.llist);
}

void wake_up_if_idle(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	guard(rcu)();
	if (is_idle_task(rcu_dereference(rq->curr))) {
		guard(rq_lock_irqsave)(rq);
		if (is_idle_task(rq->curr))
			resched_curr(rq);
	}
}

bool cpus_share_cache(int this_cpu, int that_cpu)
{
	if (this_cpu == that_cpu)
		return true;

	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
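/*
 * Illustrative sketch (not part of the kernel source): the first heuristic
 * ttwu_queue_cond() applies below -- only defer work to the target CPU when
 * it sits under a different last-level cache, so remote data is written
 * where it will be consumed.  my_prefer_remote_queueing() is hypothetical;
 * like ttwu_queue_cond()'s callers, it must run with preemption disabled
 * because it uses smp_processor_id().
 */
static bool my_prefer_remote_queueing(int target_cpu)
{
	/* Same LLC: the remote run-queue is cheap to touch directly. */
	return !cpus_share_cache(smp_processor_id(), target_cpu);
}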
static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
{
	/*
	 * Do not complicate things with the async wake_list while the CPU is
	 * in hotplug state.
	 */
	if (!cpu_active(cpu))
		return false;

	/* Ensure the task will still be allowed to run on the CPU. */
	if (!cpumask_test_cpu(cpu, p->cpus_ptr))
		return false;

	/*
	 * If the CPU does not share cache, then queue the task on the
	 * remote rqs wakelist to avoid accessing remote data.
	 */
	if (!cpus_share_cache(smp_processor_id(), cpu))
		return true;

	if (cpu == smp_processor_id())
		return false;

	/*
	 * If the wakee cpu is idle, or the task is descheduling and the
	 * only running task on the CPU, then use the wakelist to offload
	 * the task activation to the idle (or soon-to-be-idle) CPU as
	 * the current CPU is likely busy. nr_running is checked to
	 * avoid unnecessary task stacking.
	 *
	 * Note that we can only get here with (wakee) p->on_rq=0,
	 * p->on_cpu can be whatever, we've done the dequeue, so
	 * the wakee has been accounted out of ->nr_running.
	 */
	if (!cpu_rq(cpu)->nr_running)
		return true;

	return false;
}

static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
	if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(p, cpu)) {
		sched_clock_cpu(cpu); /* Sync clocks across CPUs */
		__ttwu_queue_wakelist(p, cpu, wake_flags);
		return true;
	}

	return false;
}

#else /* !CONFIG_SMP */

static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
	return false;
}

#endif /* CONFIG_SMP */

static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
{
	struct rq *rq = cpu_rq(cpu);
	struct rq_flags rf;

	if (ttwu_queue_wakelist(p, cpu, wake_flags))
		return;

	rq_lock(rq, &rf);
	update_rq_clock(rq);
	ttwu_do_activate(rq, p, wake_flags, &rf);
	rq_unlock(rq, &rf);
}

/*
 * Invoked from try_to_wake_up() to check whether the task can be woken up.
 *
 * The caller holds p::pi_lock if p != current or has preemption
 * disabled when p == current.
 *
 * The rules of PREEMPT_RT saved_state:
 *
 *   The related locking code always holds p::pi_lock when updating
 *   p::saved_state, which means the code is fully serialized in both cases.
 *
 *   The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other
 *   bits set. This allows to distinguish all wakeup scenarios.
 */
static __always_inline
bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
{
	int match;

	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
		WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
			     state != TASK_RTLOCK_WAIT);
	}

	*success = !!(match = __task_state_match(p, state));

#ifdef CONFIG_PREEMPT_RT
	/*
	 * Saved state preserves the task state across blocking on
	 * an RT lock.  If the state matches, set p::saved_state to
	 * TASK_RUNNING, but do not wake the task because it waits
	 * for a lock wakeup.
Also indicate success because f * the regular waker's point of view this has succeed * the regular waker's point of view this has succeed * * * After acquiring the lock the task will restore p:: * After acquiring the lock the task will restore p:: * from p::saved_state which ensures that the regular * from p::saved_state which ensures that the regular * wakeup is not lost. The restore will also set * wakeup is not lost. The restore will also set * p::saved_state to TASK_RUNNING so any further test * p::saved_state to TASK_RUNNING so any further test * not result in false positives vs. @success * not result in false positives vs. @success */ */ if (match < 0) if (match < 0) p->saved_state = TASK_RUNNING; p->saved_state = TASK_RUNNING; #endif #endif return match > 0; return match > 0; } } /* /* * Notes on Program-Order guarantees on SMP systems. * Notes on Program-Order guarantees on SMP systems. * * * MIGRATION * MIGRATION * * * The basic program-order guarantee on SMP systems is that w * The basic program-order guarantee on SMP systems is that w * migrates, all its activity on its old CPU [c0] happens-bef * migrates, all its activity on its old CPU [c0] happens-bef * execution on its new CPU [c1]. * execution on its new CPU [c1]. * * * For migration (of runnable tasks) this is provided by the * For migration (of runnable tasks) this is provided by the * * * A) UNLOCK of the rq(c0)->lock scheduling out task t * A) UNLOCK of the rq(c0)->lock scheduling out task t * B) migration for t is required to synchronize *both* rq(c * B) migration for t is required to synchronize *both* rq(c * rq(c1)->lock (if not at the same time, then in that or * rq(c1)->lock (if not at the same time, then in that or * C) LOCK of the rq(c1)->lock scheduling in task * C) LOCK of the rq(c1)->lock scheduling in task * * * Release/acquire chaining guarantees that B happens after A * Release/acquire chaining guarantees that B happens after A * Note: the CPU doing B need not be c0 or c1 * Note: the CPU doing B need not be c0 or c1 * * * Example: * Example: * * * CPU0 CPU1 CPU2 * CPU0 CPU1 CPU2 * * * LOCK rq(0)->lock * LOCK rq(0)->lock * sched-out X * sched-out X * sched-in Y * sched-in Y * UNLOCK rq(0)->lock * UNLOCK rq(0)->lock * * * LOCK rq(0)->lock // orde * LOCK rq(0)->lock // orde * dequeue X * dequeue X * UNLOCK rq(0)->lock * UNLOCK rq(0)->lock * * * LOCK rq(1)->lock * LOCK rq(1)->lock * enqueue X * enqueue X * UNLOCK rq(1)->lock * UNLOCK rq(1)->lock * * * LOCK rq(1)->lock // orders against CPU2 * LOCK rq(1)->lock // orders against CPU2 * sched-out Z * sched-out Z * sched-in X * sched-in X * UNLOCK rq(1)->lock * UNLOCK rq(1)->lock * * * * * BLOCKING -- aka. SLEEP + WAKEUP * BLOCKING -- aka. SLEEP + WAKEUP * * * For blocking we (obviously) need to provide the same guara * For blocking we (obviously) need to provide the same guara * migration. However the means are completely different as t * migration. However the means are completely different as t * chain to provide order. Instead we do: * chain to provide order. 
Instead we do: * * * 1) smp_store_release(X->on_cpu, 0) -- finish_task() * 1) smp_store_release(X->on_cpu, 0) -- finish_task() * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up() * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up() * * * Example: * Example: * * * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) * * * LOCK rq(0)->lock LOCK X->pi_lock * LOCK rq(0)->lock LOCK X->pi_lock * dequeue X * dequeue X * sched-out X * sched-out X * smp_store_release(X->on_cpu, 0); * smp_store_release(X->on_cpu, 0); * * * smp_cond_load_acquire(&X->on_cpu, !VAL) * smp_cond_load_acquire(&X->on_cpu, !VAL) * X->state = WAKING * X->state = WAKING * set_task_cpu(X,2) * set_task_cpu(X,2) * * * LOCK rq(2)->lock * LOCK rq(2)->lock * enqueue X * enqueue X * X->state = RUNNING * X->state = RUNNING * UNLOCK rq(2)->lock * UNLOCK rq(2)->lock * * * LOCK rq(2)->lock * LOCK rq(2)->lock * sched-out Z * sched-out Z * sched-in X * sched-in X * UNLOCK rq(2)->loc * UNLOCK rq(2)->loc * * * UNLOCK X->pi_lock * UNLOCK X->pi_lock * UNLOCK rq(0)->lock * UNLOCK rq(0)->lock * * * * * However, for wakeups there is a second guarantee we must p * However, for wakeups there is a second guarantee we must p * must ensure that CONDITION=1 done by the caller can not be * must ensure that CONDITION=1 done by the caller can not be * accesses to the task state; see try_to_wake_up() and set_c * accesses to the task state; see try_to_wake_up() and set_c */ */ /** /** * try_to_wake_up - wake up a thread * try_to_wake_up - wake up a thread * @p: the thread to be awakened * @p: the thread to be awakened * @state: the mask of task states that can be woken * @state: the mask of task states that can be woken * @wake_flags: wake modifier flags (WF_*) * @wake_flags: wake modifier flags (WF_*) * * * Conceptually does: * Conceptually does: * * * If (@state & @p->state) @p->state = TASK_RUNNING. * If (@state & @p->state) @p->state = TASK_RUNNING. * * * If the task was not queued/runnable, also place it back on * If the task was not queued/runnable, also place it back on * * * This function is atomic against schedule() which would deq * This function is atomic against schedule() which would deq * * * It issues a full memory barrier before accessing @p->state * It issues a full memory barrier before accessing @p->state * with set_current_state(). * with set_current_state(). * * * Uses p->pi_lock to serialize against concurrent wake-ups. * Uses p->pi_lock to serialize against concurrent wake-ups. * * * Relies on p->pi_lock stabilizing: * Relies on p->pi_lock stabilizing: * - p->sched_class * - p->sched_class * - p->cpus_ptr * - p->cpus_ptr * - p->sched_task_group * - p->sched_task_group * in order to do migration, see its use of select_task_rq()/ * in order to do migration, see its use of select_task_rq()/ * * * Tries really hard to only take one task_rq(p)->lock for pe * Tries really hard to only take one task_rq(p)->lock for pe * Takes rq->lock in: * Takes rq->lock in: * - ttwu_runnable() -- old rq, unavoidable, see comment * - ttwu_runnable() -- old rq, unavoidable, see comment * - ttwu_queue() -- new rq, for enqueue of the task; * - ttwu_queue() -- new rq, for enqueue of the task; * - psi_ttwu_dequeue() -- much sadness :-( accounting will * - psi_ttwu_dequeue() -- much sadness :-( accounting will * * * As a consequence we race really badly with just about ever * As a consequence we race really badly with just about ever * many memory barriers and their comments for details. 
* many memory barriers and their comments for details. * * * Return: %true if @p->state changes (an actual wakeup was d * Return: %true if @p->state changes (an actual wakeup was d * %false otherwise. * %false otherwise. */ */ int try_to_wake_up(struct task_struct *p, unsigned int state, int try_to_wake_up(struct task_struct *p, unsigned int state, { { guard(preempt)(); guard(preempt)(); int cpu, success = 0; int cpu, success = 0; if (p == current) { if (p == current) { /* /* * We're waking current, this means 'p->on_rq * We're waking current, this means 'p->on_rq * == smp_processor_id()'. Together this mean * == smp_processor_id()'. Together this mean * case the whole 'p->on_rq && ttwu_runnable( * case the whole 'p->on_rq && ttwu_runnable( * without taking any locks. * without taking any locks. * * * In particular: * In particular: * - we rely on Program-Order guarantees for * - we rely on Program-Order guarantees for * - we're serialized against set_special_st * - we're serialized against set_special_st * it disabling IRQs (this allows not taki * it disabling IRQs (this allows not taki */ */ if (!ttwu_state_match(p, state, &success)) if (!ttwu_state_match(p, state, &success)) goto out; goto out; trace_sched_waking(p); trace_sched_waking(p); ttwu_do_wakeup(p); ttwu_do_wakeup(p); goto out; goto out; } } /* /* * If we are going to wake up a thread waiting for CO * If we are going to wake up a thread waiting for CO * need to ensure that CONDITION=1 done by the caller * need to ensure that CONDITION=1 done by the caller * reordered with p->state check below. This pairs wi * reordered with p->state check below. This pairs wi * in set_current_state() that the waiting thread doe * in set_current_state() that the waiting thread doe */ */ scoped_guard (raw_spinlock_irqsave, &p->pi_lock) { scoped_guard (raw_spinlock_irqsave, &p->pi_lock) { smp_mb__after_spinlock(); smp_mb__after_spinlock(); if (!ttwu_state_match(p, state, &success)) if (!ttwu_state_match(p, state, &success)) break; break; trace_sched_waking(p); trace_sched_waking(p); /* /* * Ensure we load p->on_rq _after_ p->state, * Ensure we load p->on_rq _after_ p->state, * be possible to, falsely, observe p->on_rq * be possible to, falsely, observe p->on_rq * in smp_cond_load_acquire() below. * in smp_cond_load_acquire() below. * * * sched_ttwu_pending() try_t * sched_ttwu_pending() try_t * STORE p->on_rq = 1 LOA * STORE p->on_rq = 1 LOA * UNLOCK rq->lock * UNLOCK rq->lock * * * __schedule() (switch to task 'p') * __schedule() (switch to task 'p') * LOCK rq->lock smp * LOCK rq->lock smp * smp_mb__after_spinlock(); * smp_mb__after_spinlock(); * UNLOCK rq->lock * UNLOCK rq->lock * * * [task p] * [task p] * STORE p->state = UNINTERRUPTIBLE LOA * STORE p->state = UNINTERRUPTIBLE LOA * * * Pairs with the LOCK+smp_mb__after_spinlock * Pairs with the LOCK+smp_mb__after_spinlock * __schedule(). See the comment for smp_mb_ * __schedule(). See the comment for smp_mb_ * * * A similar smb_rmb() lives in try_invoke_on * A similar smb_rmb() lives in try_invoke_on */ */ smp_rmb(); smp_rmb(); if (READ_ONCE(p->on_rq) && ttwu_runnable(p, w if (READ_ONCE(p->on_rq) && ttwu_runnable(p, w break; break; #ifdef CONFIG_SMP #ifdef CONFIG_SMP /* /* * Ensure we load p->on_cpu _after_ p->on_rq, * Ensure we load p->on_cpu _after_ p->on_rq, * possible to, falsely, observe p->on_cpu == * possible to, falsely, observe p->on_cpu == * * * One must be running (->on_cpu == 1) in ord * One must be running (->on_cpu == 1) in ord * from the runqueue. * from the runqueue. 
* * * __schedule() (switch to task 'p') try_t * __schedule() (switch to task 'p') try_t * STORE p->on_cpu = 1 LOA * STORE p->on_cpu = 1 LOA * UNLOCK rq->lock * UNLOCK rq->lock * * * __schedule() (put 'p' to sleep) * __schedule() (put 'p' to sleep) * LOCK rq->lock smp * LOCK rq->lock smp * smp_mb__after_spinlock(); * smp_mb__after_spinlock(); * STORE p->on_rq = 0 LOA * STORE p->on_rq = 0 LOA * * * Pairs with the LOCK+smp_mb__after_spinlock * Pairs with the LOCK+smp_mb__after_spinlock * __schedule(). See the comment for smp_mb_ * __schedule(). See the comment for smp_mb_ * * * Form a control-dep-acquire with p->on_rq = * Form a control-dep-acquire with p->on_rq = * schedule()'s deactivate_task() has 'happen * schedule()'s deactivate_task() has 'happen * care about it's own p->state. See the comm * care about it's own p->state. See the comm */ */ smp_acquire__after_ctrl_dep(); smp_acquire__after_ctrl_dep(); /* /* * We're doing the wakeup (@success == 1), th * We're doing the wakeup (@success == 1), th * == 0), which means we need to do an enqueu * == 0), which means we need to do an enqueu * TASK_WAKING such that we can unlock p->pi_ * TASK_WAKING such that we can unlock p->pi_ * enqueue, such as ttwu_queue_wakelist(). * enqueue, such as ttwu_queue_wakelist(). */ */ WRITE_ONCE(p->__state, TASK_WAKING); WRITE_ONCE(p->__state, TASK_WAKING); /* /* * If the owning (remote) CPU is still in the * If the owning (remote) CPU is still in the * this task as prev, considering queueing p * this task as prev, considering queueing p * which potentially sends an IPI instead of * which potentially sends an IPI instead of * let the waker make forward progress. This * let the waker make forward progress. This * disabled and the IPI will deliver after on * disabled and the IPI will deliver after on * * * Ensure we load task_cpu(p) after p->on_cpu * Ensure we load task_cpu(p) after p->on_cpu * * * set_task_cpu(p, cpu); * set_task_cpu(p, cpu); * STORE p->cpu = @cpu * STORE p->cpu = @cpu * __schedule() (switch to task 'p') * __schedule() (switch to task 'p') * LOCK rq->lock * LOCK rq->lock * smp_mb__after_spin_lock() smp_c * smp_mb__after_spin_lock() smp_c * STORE p->on_cpu = 1 LOAD * STORE p->on_cpu = 1 LOAD * * * to ensure we observe the correct CPU on wh * to ensure we observe the correct CPU on wh * scheduling. * scheduling. 
*/ */ if (smp_load_acquire(&p->on_cpu) && if (smp_load_acquire(&p->on_cpu) && ttwu_queue_wakelist(p, task_cpu(p), wake_ ttwu_queue_wakelist(p, task_cpu(p), wake_ break; break; /* /* * If the owning (remote) CPU is still in the * If the owning (remote) CPU is still in the * this task as prev, wait until it's done re * this task as prev, wait until it's done re * * * Pairs with the smp_store_release() in fini * Pairs with the smp_store_release() in fini * * * This ensures that tasks getting woken will * This ensures that tasks getting woken will * their previous state and preserve Program * their previous state and preserve Program */ */ smp_cond_load_acquire(&p->on_cpu, !VAL); smp_cond_load_acquire(&p->on_cpu, !VAL); cpu = select_task_rq(p, p->wake_cpu, wake_fla cpu = select_task_rq(p, p->wake_cpu, wake_fla if (task_cpu(p) != cpu) { if (task_cpu(p) != cpu) { if (p->in_iowait) { if (p->in_iowait) { delayacct_blkio_end(p); delayacct_blkio_end(p); atomic_dec(&task_rq(p)->nr_io atomic_dec(&task_rq(p)->nr_io } } wake_flags |= WF_MIGRATED; wake_flags |= WF_MIGRATED; psi_ttwu_dequeue(p); psi_ttwu_dequeue(p); set_task_cpu(p, cpu); set_task_cpu(p, cpu); } } #else #else cpu = task_cpu(p); cpu = task_cpu(p); #endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */ ttwu_queue(p, cpu, wake_flags); ttwu_queue(p, cpu, wake_flags); } } out: out: if (success) if (success) ttwu_stat(p, task_cpu(p), wake_flags); ttwu_stat(p, task_cpu(p), wake_flags); return success; return success; } } static bool __task_needs_rq_lock(struct task_struct *p) static bool __task_needs_rq_lock(struct task_struct *p) { { unsigned int state = READ_ONCE(p->__state); unsigned int state = READ_ONCE(p->__state); /* /* * Since pi->lock blocks try_to_wake_up(), we don't n * Since pi->lock blocks try_to_wake_up(), we don't n * the task is blocked. Make sure to check @state sin * the task is blocked. Make sure to check @state sin * locks at the end, see ttwu_queue_wakelist(). * locks at the end, see ttwu_queue_wakelist(). */ */ if (state == TASK_RUNNING || state == TASK_WAKING) if (state == TASK_RUNNING || state == TASK_WAKING) return true; return true; /* /* * Ensure we load p->on_rq after p->__state, otherwis * Ensure we load p->on_rq after p->__state, otherwis * possible to, falsely, observe p->on_rq == 0. * possible to, falsely, observe p->on_rq == 0. * * * See try_to_wake_up() for a longer comment. * See try_to_wake_up() for a longer comment. */ */ smp_rmb(); smp_rmb(); if (p->on_rq) if (p->on_rq) return true; return true; #ifdef CONFIG_SMP #ifdef CONFIG_SMP /* /* * Ensure the task has finished __schedule() and will * Ensure the task has finished __schedule() and will * anymore. Again, see try_to_wake_up() for a longer * anymore. Again, see try_to_wake_up() for a longer */ */ smp_rmb(); smp_rmb(); smp_cond_load_acquire(&p->on_cpu, !VAL); smp_cond_load_acquire(&p->on_cpu, !VAL); #endif #endif return false; return false; } } /** /** * task_call_func - Invoke a function on task in fixed state * task_call_func - Invoke a function on task in fixed state * @p: Process for which the function is to be invoked, can b * @p: Process for which the function is to be invoked, can b * @func: Function to invoke. * @func: Function to invoke. * @arg: Argument to function. * @arg: Argument to function. * * * Fix the task in it's current state by avoiding wakeups and * Fix the task in it's current state by avoiding wakeups and * and call @func(@arg) on it. This function can use ->on_rq * and call @func(@arg) on it. 
This function can use ->on_rq * to work out what the state is, if required. Given that @f * to work out what the state is, if required. Given that @f * with a runqueue lock held, it had better be quite lightwei * with a runqueue lock held, it had better be quite lightwei * * * Returns: * Returns: * Whatever @func returns * Whatever @func returns */ */ int task_call_func(struct task_struct *p, task_call_f func, v int task_call_func(struct task_struct *p, task_call_f func, v { { struct rq *rq = NULL; struct rq *rq = NULL; struct rq_flags rf; struct rq_flags rf; int ret; int ret; raw_spin_lock_irqsave(&p->pi_lock, rf.flags); raw_spin_lock_irqsave(&p->pi_lock, rf.flags); if (__task_needs_rq_lock(p)) if (__task_needs_rq_lock(p)) rq = __task_rq_lock(p, &rf); rq = __task_rq_lock(p, &rf); /* /* * At this point the task is pinned; either: * At this point the task is pinned; either: * - blocked and we're holding off wakeups (pi- * - blocked and we're holding off wakeups (pi- * - woken, and we're holding off enqueue (rq- * - woken, and we're holding off enqueue (rq- * - queued, and we're holding off schedule (rq- * - queued, and we're holding off schedule (rq- * - running, and we're holding off de-schedule (rq- * - running, and we're holding off de-schedule (rq- * * * The called function (@func) can use: task_curr(), * The called function (@func) can use: task_curr(), * p->__state to differentiate between these states. * p->__state to differentiate between these states. */ */ ret = func(p, arg); ret = func(p, arg); if (rq) if (rq) rq_unlock(rq, &rf); rq_unlock(rq, &rf); raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); return ret; return ret; } } /** /** * cpu_curr_snapshot - Return a snapshot of the currently run * cpu_curr_snapshot - Return a snapshot of the currently run * @cpu: The CPU on which to snapshot the task. * @cpu: The CPU on which to snapshot the task. * * * Returns the task_struct pointer of the task "currently" ru * Returns the task_struct pointer of the task "currently" ru * the specified CPU. If the same task is running on that CP * the specified CPU. If the same task is running on that CP * the return value will be a pointer to that task's task_str * the return value will be a pointer to that task's task_str * If the CPU did any context switches even vaguely concurren * If the CPU did any context switches even vaguely concurren * execution of this function, the return value will be a poi * execution of this function, the return value will be a poi * task_struct structure of a randomly chosen task that was r * task_struct structure of a randomly chosen task that was r * that CPU somewhere around the time that this function was * that CPU somewhere around the time that this function was * * * If the specified CPU was offline, the return value is what * If the specified CPU was offline, the return value is what * is, perhaps a pointer to the task_struct structure of that * is, perhaps a pointer to the task_struct structure of that * task, but there is no guarantee. Callers wishing a useful * task, but there is no guarantee. Callers wishing a useful * value must take some action to ensure that the specified C * value must take some action to ensure that the specified C * online throughout. * online throughout. 
* * * This function executes full memory barriers before and aft * This function executes full memory barriers before and aft * the pointer, which permits the caller to confine this func * the pointer, which permits the caller to confine this func * with respect to the caller's accesses to other shared vari * with respect to the caller's accesses to other shared vari */ */ struct task_struct *cpu_curr_snapshot(int cpu) struct task_struct *cpu_curr_snapshot(int cpu) { { struct task_struct *t; struct task_struct *t; smp_mb(); /* Pairing determined by caller's synchroni smp_mb(); /* Pairing determined by caller's synchroni t = rcu_dereference(cpu_curr(cpu)); t = rcu_dereference(cpu_curr(cpu)); smp_mb(); /* Pairing determined by caller's synchroni smp_mb(); /* Pairing determined by caller's synchroni return t; return t; } } /** /** * wake_up_process - Wake up a specific process * wake_up_process - Wake up a specific process * @p: The process to be woken up. * @p: The process to be woken up. * * * Attempt to wake up the nominated process and move it to th * Attempt to wake up the nominated process and move it to th * processes. * processes. * * * Return: 1 if the process was woken up, 0 if it was already * Return: 1 if the process was woken up, 0 if it was already * * * This function executes a full memory barrier before access * This function executes a full memory barrier before access */ */ int wake_up_process(struct task_struct *p) int wake_up_process(struct task_struct *p) { { return try_to_wake_up(p, TASK_NORMAL, 0); return try_to_wake_up(p, TASK_NORMAL, 0); } } EXPORT_SYMBOL(wake_up_process); EXPORT_SYMBOL(wake_up_process); int wake_up_state(struct task_struct *p, unsigned int state) int wake_up_state(struct task_struct *p, unsigned int state) { { return try_to_wake_up(p, state, 0); return try_to_wake_up(p, state, 0); } } /* /* * Perform scheduler related setup for a newly forked process * Perform scheduler related setup for a newly forked process * p is forked by current. * p is forked by current. 
* * * __sched_fork() is basic setup used by init_idle() too: * __sched_fork() is basic setup used by init_idle() too: */ */ static void __sched_fork(unsigned long clone_flags, struct ta static void __sched_fork(unsigned long clone_flags, struct ta { { p->on_rq = 0; p->on_rq = 0; p->se.on_rq = 0; p->se.on_rq = 0; p->se.exec_start = 0; p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; p->se.vruntime = 0; p->se.vlag = 0; p->se.vlag = 0; p->se.slice = sysctl_sched_base_s p->se.slice = sysctl_sched_base_s INIT_LIST_HEAD(&p->se.group_node); INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = NULL; p->se.cfs_rq = NULL; #endif #endif #ifdef CONFIG_SCHEDSTATS #ifdef CONFIG_SCHEDSTATS /* Even if schedstat is disabled, there should not be /* Even if schedstat is disabled, there should not be memset(&p->stats, 0, sizeof(p->stats)); memset(&p->stats, 0, sizeof(p->stats)); #endif #endif RB_CLEAR_NODE(&p->dl.rb_node); RB_CLEAR_NODE(&p->dl.rb_node); init_dl_task_timer(&p->dl); init_dl_task_timer(&p->dl); init_dl_inactive_task_timer(&p->dl); init_dl_inactive_task_timer(&p->dl); __dl_clear_params(p); __dl_clear_params(p); INIT_LIST_HEAD(&p->rt.run_list); INIT_LIST_HEAD(&p->rt.run_list); p->rt.timeout = 0; p->rt.timeout = 0; p->rt.time_slice = sched_rr_timeslice; p->rt.time_slice = sched_rr_timeslice; p->rt.on_rq = 0; p->rt.on_rq = 0; p->rt.on_list = 0; p->rt.on_list = 0; #ifdef CONFIG_PREEMPT_NOTIFIERS #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); INIT_HLIST_HEAD(&p->preempt_notifiers); #endif #endif #ifdef CONFIG_COMPACTION #ifdef CONFIG_COMPACTION p->capture_control = NULL; p->capture_control = NULL; #endif #endif init_numa_balancing(clone_flags, p); init_numa_balancing(clone_flags, p); #ifdef CONFIG_SMP #ifdef CONFIG_SMP p->wake_entry.u_flags = CSD_TYPE_TTWU; p->wake_entry.u_flags = CSD_TYPE_TTWU; p->migration_pending = NULL; p->migration_pending = NULL; #endif #endif init_sched_mm_cid(p); init_sched_mm_cid(p); } } DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); #ifdef CONFIG_NUMA_BALANCING #ifdef CONFIG_NUMA_BALANCING int sysctl_numa_balancing_mode; int sysctl_numa_balancing_mode; static void __set_numabalancing_state(bool enabled) static void __set_numabalancing_state(bool enabled) { { if (enabled) if (enabled) static_branch_enable(&sched_numa_balancing); static_branch_enable(&sched_numa_balancing); else else static_branch_disable(&sched_numa_balancing); static_branch_disable(&sched_numa_balancing); } } void set_numabalancing_state(bool enabled) void set_numabalancing_state(bool enabled) { { if (enabled) if (enabled) sysctl_numa_balancing_mode = NUMA_BALANCING_N sysctl_numa_balancing_mode = NUMA_BALANCING_N else else sysctl_numa_balancing_mode = NUMA_BALANCING_D sysctl_numa_balancing_mode = NUMA_BALANCING_D __set_numabalancing_state(enabled); __set_numabalancing_state(enabled); } } #ifdef CONFIG_PROC_SYSCTL #ifdef CONFIG_PROC_SYSCTL static void reset_memory_tiering(void) static void reset_memory_tiering(void) { { struct pglist_data *pgdat; struct pglist_data *pgdat; for_each_online_pgdat(pgdat) { for_each_online_pgdat(pgdat) { pgdat->nbp_threshold = 0; pgdat->nbp_threshold = 0; pgdat->nbp_th_nr_cand = node_page_state(pgdat pgdat->nbp_th_nr_cand = node_page_state(pgdat pgdat->nbp_th_start = 
jiffies_to_msecs(jiffie pgdat->nbp_th_start = jiffies_to_msecs(jiffie } } } } static int sysctl_numa_balancing(struct ctl_table *table, int static int sysctl_numa_balancing(struct ctl_table *table, int void *buffer, size_t *lenp, loff_t void *buffer, size_t *lenp, loff_t { { struct ctl_table t; struct ctl_table t; int err; int err; int state = sysctl_numa_balancing_mode; int state = sysctl_numa_balancing_mode; if (write && !capable(CAP_SYS_ADMIN)) if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; return -EPERM; t = *table; t = *table; t.data = &state; t.data = &state; err = proc_dointvec_minmax(&t, write, buffer, lenp, p err = proc_dointvec_minmax(&t, write, buffer, lenp, p if (err < 0) if (err < 0) return err; return err; if (write) { if (write) { if (!(sysctl_numa_balancing_mode & NUMA_BALAN if (!(sysctl_numa_balancing_mode & NUMA_BALAN (state & NUMA_BALANCING_MEMORY_TIERING)) (state & NUMA_BALANCING_MEMORY_TIERING)) reset_memory_tiering(); reset_memory_tiering(); sysctl_numa_balancing_mode = state; sysctl_numa_balancing_mode = state; __set_numabalancing_state(state); __set_numabalancing_state(state); } } return err; return err; } } #endif #endif #endif #endif #ifdef CONFIG_SCHEDSTATS #ifdef CONFIG_SCHEDSTATS DEFINE_STATIC_KEY_FALSE(sched_schedstats); DEFINE_STATIC_KEY_FALSE(sched_schedstats); static void set_schedstats(bool enabled) static void set_schedstats(bool enabled) { { if (enabled) if (enabled) static_branch_enable(&sched_schedstats); static_branch_enable(&sched_schedstats); else else static_branch_disable(&sched_schedstats); static_branch_disable(&sched_schedstats); } } void force_schedstat_enabled(void) void force_schedstat_enabled(void) { { if (!schedstat_enabled()) { if (!schedstat_enabled()) { pr_info("kernel profiling enabled schedstats, pr_info("kernel profiling enabled schedstats, static_branch_enable(&sched_schedstats); static_branch_enable(&sched_schedstats); } } } } static int __init setup_schedstats(char *str) static int __init setup_schedstats(char *str) { { int ret = 0; int ret = 0; if (!str) if (!str) goto out; goto out; if (!strcmp(str, "enable")) { if (!strcmp(str, "enable")) { set_schedstats(true); set_schedstats(true); ret = 1; ret = 1; } else if (!strcmp(str, "disable")) { } else if (!strcmp(str, "disable")) { set_schedstats(false); set_schedstats(false); ret = 1; ret = 1; } } out: out: if (!ret) if (!ret) pr_warn("Unable to parse schedstats=\n"); pr_warn("Unable to parse schedstats=\n"); return ret; return ret; } } __setup("schedstats=", setup_schedstats); __setup("schedstats=", setup_schedstats); #ifdef CONFIG_PROC_SYSCTL #ifdef CONFIG_PROC_SYSCTL static int sysctl_schedstats(struct ctl_table *table, int wri static int sysctl_schedstats(struct ctl_table *table, int wri size_t *lenp, loff_t *ppos) size_t *lenp, loff_t *ppos) { { struct ctl_table t; struct ctl_table t; int err; int err; int state = static_branch_likely(&sched_schedstats); int state = static_branch_likely(&sched_schedstats); if (write && !capable(CAP_SYS_ADMIN)) if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; return -EPERM; t = *table; t = *table; t.data = &state; t.data = &state; err = proc_dointvec_minmax(&t, write, buffer, lenp, p err = proc_dointvec_minmax(&t, write, buffer, lenp, p if (err < 0) if (err < 0) return err; return err; if (write) if (write) set_schedstats(state); set_schedstats(state); return err; return err; } } #endif /* CONFIG_PROC_SYSCTL */ #endif /* CONFIG_PROC_SYSCTL */ #endif /* CONFIG_SCHEDSTATS */ #endif /* CONFIG_SCHEDSTATS */ #ifdef CONFIG_SYSCTL #ifdef 
CONFIG_SYSCTL static struct ctl_table sched_core_sysctls[] = { static struct ctl_table sched_core_sysctls[] = { #ifdef CONFIG_SCHEDSTATS #ifdef CONFIG_SCHEDSTATS { { .procname = "sched_schedstats", .procname = "sched_schedstats", .data = NULL, .data = NULL, .maxlen = sizeof(unsigned int), .maxlen = sizeof(unsigned int), .mode = 0644, .mode = 0644, .proc_handler = sysctl_schedstats, .proc_handler = sysctl_schedstats, .extra1 = SYSCTL_ZERO, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, .extra2 = SYSCTL_ONE, }, }, #endif /* CONFIG_SCHEDSTATS */ #endif /* CONFIG_SCHEDSTATS */ #ifdef CONFIG_UCLAMP_TASK #ifdef CONFIG_UCLAMP_TASK { { .procname = "sched_util_clamp_min", .procname = "sched_util_clamp_min", .data = &sysctl_sched_uclamp_util_m .data = &sysctl_sched_uclamp_util_m .maxlen = sizeof(unsigned int), .maxlen = sizeof(unsigned int), .mode = 0644, .mode = 0644, .proc_handler = sysctl_sched_uclamp_handler .proc_handler = sysctl_sched_uclamp_handler }, }, { { .procname = "sched_util_clamp_max", .procname = "sched_util_clamp_max", .data = &sysctl_sched_uclamp_util_m .data = &sysctl_sched_uclamp_util_m .maxlen = sizeof(unsigned int), .maxlen = sizeof(unsigned int), .mode = 0644, .mode = 0644, .proc_handler = sysctl_sched_uclamp_handler .proc_handler = sysctl_sched_uclamp_handler }, }, { { .procname = "sched_util_clamp_min_rt_de .procname = "sched_util_clamp_min_rt_de .data = &sysctl_sched_uclamp_util_m .data = &sysctl_sched_uclamp_util_m .maxlen = sizeof(unsigned int), .maxlen = sizeof(unsigned int), .mode = 0644, .mode = 0644, .proc_handler = sysctl_sched_uclamp_handler .proc_handler = sysctl_sched_uclamp_handler }, }, #endif /* CONFIG_UCLAMP_TASK */ #endif /* CONFIG_UCLAMP_TASK */ #ifdef CONFIG_NUMA_BALANCING #ifdef CONFIG_NUMA_BALANCING { { .procname = "numa_balancing", .procname = "numa_balancing", .data = NULL, /* filled in by handl .data = NULL, /* filled in by handl .maxlen = sizeof(unsigned int), .maxlen = sizeof(unsigned int), .mode = 0644, .mode = 0644, .proc_handler = sysctl_numa_balancing, .proc_handler = sysctl_numa_balancing, .extra1 = SYSCTL_ZERO, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_FOUR, .extra2 = SYSCTL_FOUR, }, }, #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_NUMA_BALANCING */ {} {} }; }; static int __init sched_core_sysctl_init(void) static int __init sched_core_sysctl_init(void) { { register_sysctl_init("kernel", sched_core_sysctls); register_sysctl_init("kernel", sched_core_sysctls); return 0; return 0; } } late_initcall(sched_core_sysctl_init); late_initcall(sched_core_sysctl_init); #endif /* CONFIG_SYSCTL */ #endif /* CONFIG_SYSCTL */ /* /* * fork()/clone()-time setup: * fork()/clone()-time setup: */ */ int sched_fork(unsigned long clone_flags, struct task_struct int sched_fork(unsigned long clone_flags, struct task_struct { { __sched_fork(clone_flags, p); __sched_fork(clone_flags, p); /* /* * We mark the process as NEW here. This guarantees t * We mark the process as NEW here. 
This guarantees t * nobody will actually run it, and a signal or other * nobody will actually run it, and a signal or other * event cannot wake it up and insert it on the runqu * event cannot wake it up and insert it on the runqu */ */ p->__state = TASK_NEW; p->__state = TASK_NEW; /* /* * Make sure we do not leak PI boosting priority to t * Make sure we do not leak PI boosting priority to t */ */ p->prio = current->normal_prio; p->prio = current->normal_prio; uclamp_fork(p); uclamp_fork(p); /* /* * Revert to default priority/policy on fork if reque * Revert to default priority/policy on fork if reque */ */ if (unlikely(p->sched_reset_on_fork)) { if (unlikely(p->sched_reset_on_fork)) { if (task_has_dl_policy(p) || task_has_rt_poli if (task_has_dl_policy(p) || task_has_rt_poli p->policy = SCHED_NORMAL; p->policy = SCHED_NORMAL; p->static_prio = NICE_TO_PRIO(0); p->static_prio = NICE_TO_PRIO(0); p->rt_priority = 0; p->rt_priority = 0; } else if (PRIO_TO_NICE(p->static_prio) < 0) } else if (PRIO_TO_NICE(p->static_prio) < 0) p->static_prio = NICE_TO_PRIO(0); p->static_prio = NICE_TO_PRIO(0); p->prio = p->normal_prio = p->static_prio; p->prio = p->normal_prio = p->static_prio; set_load_weight(p, false); set_load_weight(p, false); /* /* * We don't need the reset flag anymore after * We don't need the reset flag anymore after * fulfilled its duty: * fulfilled its duty: */ */ p->sched_reset_on_fork = 0; p->sched_reset_on_fork = 0; } } if (dl_prio(p->prio)) if (dl_prio(p->prio)) return -EAGAIN; return -EAGAIN; else if (rt_prio(p->prio)) else if (rt_prio(p->prio)) p->sched_class = &rt_sched_class; p->sched_class = &rt_sched_class; else else p->sched_class = &fair_sched_class; p->sched_class = &fair_sched_class; init_entity_runnable_average(&p->se); init_entity_runnable_average(&p->se); #ifdef CONFIG_SCHED_INFO #ifdef CONFIG_SCHED_INFO if (likely(sched_info_on())) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_inf memset(&p->sched_info, 0, sizeof(p->sched_inf #endif #endif #if defined(CONFIG_SMP) #if defined(CONFIG_SMP) p->on_cpu = 0; p->on_cpu = 0; #endif #endif init_task_preempt_count(p); init_task_preempt_count(p); #ifdef CONFIG_SMP #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); plist_node_init(&p->pushable_tasks, MAX_PRIO); RB_CLEAR_NODE(&p->pushable_dl_tasks); RB_CLEAR_NODE(&p->pushable_dl_tasks); #endif #endif return 0; return 0; } } void sched_cgroup_fork(struct task_struct *p, struct kernel_c void sched_cgroup_fork(struct task_struct *p, struct kernel_c { { unsigned long flags; unsigned long flags; /* /* * Because we're not yet on the pid-hash, p->pi_lock * Because we're not yet on the pid-hash, p->pi_lock * required yet, but lockdep gets upset if rules are * required yet, but lockdep gets upset if rules are */ */ raw_spin_lock_irqsave(&p->pi_lock, flags); raw_spin_lock_irqsave(&p->pi_lock, flags); #ifdef CONFIG_CGROUP_SCHED #ifdef CONFIG_CGROUP_SCHED if (1) { if (1) { struct task_group *tg; struct task_group *tg; tg = container_of(kargs->cset->subsys[cpu_cgr tg = container_of(kargs->cset->subsys[cpu_cgr struct task_group, css); struct task_group, css); tg = autogroup_task_group(p, tg); tg = autogroup_task_group(p, tg); p->sched_task_group = tg; p->sched_task_group = tg; } } #endif #endif rseq_migrate(p); rseq_migrate(p); /* /* * We're setting the CPU for the first time, we don't * We're setting the CPU for the first time, we don't * so use __set_task_cpu(). * so use __set_task_cpu(). 
*/ */ __set_task_cpu(p, smp_processor_id()); __set_task_cpu(p, smp_processor_id()); if (p->sched_class->task_fork) if (p->sched_class->task_fork) p->sched_class->task_fork(p); p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); raw_spin_unlock_irqrestore(&p->pi_lock, flags); } } void sched_post_fork(struct task_struct *p) void sched_post_fork(struct task_struct *p) { { uclamp_post_fork(p); uclamp_post_fork(p); } } unsigned long to_ratio(u64 period, u64 runtime) unsigned long to_ratio(u64 period, u64 runtime) { { if (runtime == RUNTIME_INF) if (runtime == RUNTIME_INF) return BW_UNIT; return BW_UNIT; /* /* * Doing this here saves a lot of checks in all * Doing this here saves a lot of checks in all * the calling paths, and returning zero seems * the calling paths, and returning zero seems * safe for them anyway. * safe for them anyway. */ */ if (period == 0) if (period == 0) return 0; return 0; return div64_u64(runtime << BW_SHIFT, period); return div64_u64(runtime << BW_SHIFT, period); } } /* /* * wake_up_new_task - wake up a newly created task for the fi * wake_up_new_task - wake up a newly created task for the fi * * * This function will do some initial scheduler statistics ho * This function will do some initial scheduler statistics ho * that must be done for every newly created context, then pu * that must be done for every newly created context, then pu * on the runqueue and wakes it. * on the runqueue and wakes it. */ */ void wake_up_new_task(struct task_struct *p) void wake_up_new_task(struct task_struct *p) { { struct rq_flags rf; struct rq_flags rf; struct rq *rq; struct rq *rq; raw_spin_lock_irqsave(&p->pi_lock, rf.flags); raw_spin_lock_irqsave(&p->pi_lock, rf.flags); WRITE_ONCE(p->__state, TASK_RUNNING); WRITE_ONCE(p->__state, TASK_RUNNING); #ifdef CONFIG_SMP #ifdef CONFIG_SMP /* /* * Fork balancing, do it here and not earlier because * Fork balancing, do it here and not earlier because * - cpus_ptr can change in the fork path * - cpus_ptr can change in the fork path * - any previously selected CPU might disappear thr * - any previously selected CPU might disappear thr * * * Use __set_task_cpu() to avoid calling sched_class: * Use __set_task_cpu() to avoid calling sched_class: * as we're not fully set-up yet. * as we're not fully set-up yet. */ */ p->recent_used_cpu = task_cpu(p); p->recent_used_cpu = task_cpu(p); rseq_migrate(p); rseq_migrate(p); __set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_F __set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_F #endif #endif rq = __task_rq_lock(p, &rf); rq = __task_rq_lock(p, &rf); update_rq_clock(rq); update_rq_clock(rq); post_init_entity_util_avg(p); post_init_entity_util_avg(p); activate_task(rq, p, ENQUEUE_NOCLOCK); activate_task(rq, p, ENQUEUE_NOCLOCK); trace_sched_wakeup_new(p); trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); check_preempt_curr(rq, p, WF_FORK); #ifdef CONFIG_SMP #ifdef CONFIG_SMP if (p->sched_class->task_woken) { if (p->sched_class->task_woken) { /* /* * Nothing relies on rq->lock after this, so * Nothing relies on rq->lock after this, so * drop it. * drop it. 
*/ */ rq_unpin_lock(rq, &rf); rq_unpin_lock(rq, &rf); p->sched_class->task_woken(rq, p); p->sched_class->task_woken(rq, p); rq_repin_lock(rq, &rf); rq_repin_lock(rq, &rf); } } #endif #endif task_rq_unlock(rq, p, &rf); task_rq_unlock(rq, p, &rf); } } #ifdef CONFIG_PREEMPT_NOTIFIERS #ifdef CONFIG_PREEMPT_NOTIFIERS static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); void preempt_notifier_inc(void) void preempt_notifier_inc(void) { { static_branch_inc(&preempt_notifier_key); static_branch_inc(&preempt_notifier_key); } } EXPORT_SYMBOL_GPL(preempt_notifier_inc); EXPORT_SYMBOL_GPL(preempt_notifier_inc); void preempt_notifier_dec(void) void preempt_notifier_dec(void) { { static_branch_dec(&preempt_notifier_key); static_branch_dec(&preempt_notifier_key); } } EXPORT_SYMBOL_GPL(preempt_notifier_dec); EXPORT_SYMBOL_GPL(preempt_notifier_dec); /** /** * preempt_notifier_register - tell me when current is being * preempt_notifier_register - tell me when current is being * @notifier: notifier struct to register * @notifier: notifier struct to register */ */ void preempt_notifier_register(struct preempt_notifier *notif void preempt_notifier_register(struct preempt_notifier *notif { { if (!static_branch_unlikely(&preempt_notifier_key)) if (!static_branch_unlikely(&preempt_notifier_key)) WARN(1, "registering preempt_notifier while n WARN(1, "registering preempt_notifier while n hlist_add_head(¬ifier->link, ¤t->preempt_not hlist_add_head(¬ifier->link, ¤t->preempt_not } } EXPORT_SYMBOL_GPL(preempt_notifier_register); EXPORT_SYMBOL_GPL(preempt_notifier_register); /** /** * preempt_notifier_unregister - no longer interested in pree * preempt_notifier_unregister - no longer interested in pree * @notifier: notifier struct to unregister * @notifier: notifier struct to unregister * * * This is *not* safe to call from within a preemption notifi * This is *not* safe to call from within a preemption notifi */ */ void preempt_notifier_unregister(struct preempt_notifier *not void preempt_notifier_unregister(struct preempt_notifier *not { { hlist_del(¬ifier->link); hlist_del(¬ifier->link); } } EXPORT_SYMBOL_GPL(preempt_notifier_unregister); EXPORT_SYMBOL_GPL(preempt_notifier_unregister); static void __fire_sched_in_preempt_notifiers(struct task_str static void __fire_sched_in_preempt_notifiers(struct task_str { { struct preempt_notifier *notifier; struct preempt_notifier *notifier; hlist_for_each_entry(notifier, &curr->preempt_notifie hlist_for_each_entry(notifier, &curr->preempt_notifie notifier->ops->sched_in(notifier, raw_smp_pro notifier->ops->sched_in(notifier, raw_smp_pro } } static __always_inline void fire_sched_in_preempt_notifiers(s static __always_inline void fire_sched_in_preempt_notifiers(s { { if (static_branch_unlikely(&preempt_notifier_key)) if (static_branch_unlikely(&preempt_notifier_key)) __fire_sched_in_preempt_notifiers(curr); __fire_sched_in_preempt_notifiers(curr); } } static void static void __fire_sched_out_preempt_notifiers(struct task_struct *curr, __fire_sched_out_preempt_notifiers(struct task_struct *curr, struct task_struct *next) struct task_struct *next) { { struct preempt_notifier *notifier; struct preempt_notifier *notifier; hlist_for_each_entry(notifier, &curr->preempt_notifie hlist_for_each_entry(notifier, &curr->preempt_notifie notifier->ops->sched_out(notifier, next); notifier->ops->sched_out(notifier, next); } } static __always_inline void static __always_inline void fire_sched_out_preempt_notifiers(struct task_struct *curr, 
fire_sched_out_preempt_notifiers(struct task_struct *curr, struct task_struct *next) struct task_struct *next) { { if (static_branch_unlikely(&preempt_notifier_key)) if (static_branch_unlikely(&preempt_notifier_key)) __fire_sched_out_preempt_notifiers(curr, next __fire_sched_out_preempt_notifiers(curr, next } } #else /* !CONFIG_PREEMPT_NOTIFIERS */ #else /* !CONFIG_PREEMPT_NOTIFIERS */ static inline void fire_sched_in_preempt_notifiers(struct tas static inline void fire_sched_in_preempt_notifiers(struct tas { { } } static inline void static inline void fire_sched_out_preempt_notifiers(struct task_struct *curr, fire_sched_out_preempt_notifiers(struct task_struct *curr, struct task_struct *next) struct task_struct *next) { { } } #endif /* CONFIG_PREEMPT_NOTIFIERS */ #endif /* CONFIG_PREEMPT_NOTIFIERS */ static inline void prepare_task(struct task_struct *next) static inline void prepare_task(struct task_struct *next) { { #ifdef CONFIG_SMP #ifdef CONFIG_SMP /* /* * Claim the task as running, we do this before switc * Claim the task as running, we do this before switc * such that any running task will have this set. * such that any running task will have this set. * * * See the smp_load_acquire(&p->on_cpu) case in ttwu( * See the smp_load_acquire(&p->on_cpu) case in ttwu( * its ordering comment. * its ordering comment. */ */ WRITE_ONCE(next->on_cpu, 1); WRITE_ONCE(next->on_cpu, 1); #endif #endif } } static inline void finish_task(struct task_struct *prev) static inline void finish_task(struct task_struct *prev) { { #ifdef CONFIG_SMP #ifdef CONFIG_SMP /* /* * This must be the very last reference to @prev from * This must be the very last reference to @prev from * p->on_cpu is cleared, the task can be moved to a d * p->on_cpu is cleared, the task can be moved to a d * must ensure this doesn't happen until the switch i * must ensure this doesn't happen until the switch i * finished. * finished. * * * In particular, the load of prev->state in finish_t * In particular, the load of prev->state in finish_t * happen before this. * happen before this. * * * Pairs with the smp_cond_load_acquire() in try_to_w * Pairs with the smp_cond_load_acquire() in try_to_w */ */ smp_store_release(&prev->on_cpu, 0); smp_store_release(&prev->on_cpu, 0); #endif #endif } } #ifdef CONFIG_SMP #ifdef CONFIG_SMP static void do_balance_callbacks(struct rq *rq, struct balanc static void do_balance_callbacks(struct rq *rq, struct balanc { { void (*func)(struct rq *rq); void (*func)(struct rq *rq); struct balance_callback *next; struct balance_callback *next; lockdep_assert_rq_held(rq); lockdep_assert_rq_held(rq); while (head) { while (head) { func = (void (*)(struct rq *))head->func; func = (void (*)(struct rq *))head->func; next = head->next; next = head->next; head->next = NULL; head->next = NULL; head = next; head = next; func(rq); func(rq); } } } } static void balance_push(struct rq *rq); static void balance_push(struct rq *rq); /* /* * balance_push_callback is a right abuse of the callback int * balance_push_callback is a right abuse of the callback int * by significantly different rules. * by significantly different rules. 
* * * Where the normal balance_callback's purpose is to be ran i * Where the normal balance_callback's purpose is to be ran i * that queued it (only later, when it's safe to drop rq->loc * that queued it (only later, when it's safe to drop rq->loc * balance_push_callback is specifically targeted at __schedu * balance_push_callback is specifically targeted at __schedu * * * This abuse is tolerated because it places all the unlikely * This abuse is tolerated because it places all the unlikely * a single test, namely: rq->balance_callback == NULL. * a single test, namely: rq->balance_callback == NULL. */ */ struct balance_callback balance_push_callback = { struct balance_callback balance_push_callback = { .next = NULL, .next = NULL, .func = balance_push, .func = balance_push, }; }; static inline struct balance_callback * static inline struct balance_callback * __splice_balance_callbacks(struct rq *rq, bool split) __splice_balance_callbacks(struct rq *rq, bool split) { { struct balance_callback *head = rq->balance_callback; struct balance_callback *head = rq->balance_callback; if (likely(!head)) if (likely(!head)) return NULL; return NULL; lockdep_assert_rq_held(rq); lockdep_assert_rq_held(rq); /* /* * Must not take balance_push_callback off the list w * Must not take balance_push_callback off the list w * splice_balance_callbacks() and balance_callbacks() * splice_balance_callbacks() and balance_callbacks() * in the same rq->lock section. * in the same rq->lock section. * * * In that case it would be possible for __schedule() * In that case it would be possible for __schedule() * and observe the list empty. * and observe the list empty. */ */ if (split && head == &balance_push_callback) if (split && head == &balance_push_callback) head = NULL; head = NULL; else else rq->balance_callback = NULL; rq->balance_callback = NULL; return head; return head; } } static inline struct balance_callback *splice_balance_callbac static inline struct balance_callback *splice_balance_callbac { { return __splice_balance_callbacks(rq, true); return __splice_balance_callbacks(rq, true); } } static void __balance_callbacks(struct rq *rq) static void __balance_callbacks(struct rq *rq) { { do_balance_callbacks(rq, __splice_balance_callbacks(r do_balance_callbacks(rq, __splice_balance_callbacks(r } } static inline void balance_callbacks(struct rq *rq, struct ba static inline void balance_callbacks(struct rq *rq, struct ba { { unsigned long flags; unsigned long flags; if (unlikely(head)) { if (unlikely(head)) { raw_spin_rq_lock_irqsave(rq, flags); raw_spin_rq_lock_irqsave(rq, flags); do_balance_callbacks(rq, head); do_balance_callbacks(rq, head); raw_spin_rq_unlock_irqrestore(rq, flags); raw_spin_rq_unlock_irqrestore(rq, flags); } } } } #else #else static inline void __balance_callbacks(struct rq *rq) static inline void __balance_callbacks(struct rq *rq) { { } } static inline struct balance_callback *splice_balance_callbac static inline struct balance_callback *splice_balance_callbac { { return NULL; return NULL; } } static inline void balance_callbacks(struct rq *rq, struct ba static inline void balance_callbacks(struct rq *rq, struct ba { { } } #endif #endif static inline void static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next, prepare_lock_switch(struct rq *rq, struct task_struct *next, { { /* /* * Since the runqueue lock will be released by the ne * Since the runqueue lock will be released by the ne * task (which is an invalid locking op but in the ca * task (which is an invalid locking 
op but in the ca * of the scheduler it's an obvious special-case), so * of the scheduler it's an obvious special-case), so * do an early lockdep release here: * do an early lockdep release here: */ */ rq_unpin_lock(rq, rf); rq_unpin_lock(rq, rf); spin_release(&__rq_lockp(rq)->dep_map, _THIS_IP_); spin_release(&__rq_lockp(rq)->dep_map, _THIS_IP_); #ifdef CONFIG_DEBUG_SPINLOCK #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases th /* this is a valid case when another task releases th rq_lockp(rq)->owner = next; rq_lockp(rq)->owner = next; #endif #endif } } static inline void finish_lock_switch(struct rq *rq) static inline void finish_lock_switch(struct rq *rq) { { /* /* * If we are tracking spinlock dependencies then we h * If we are tracking spinlock dependencies then we h * fix up the runqueue lock - which gets 'carried ove * fix up the runqueue lock - which gets 'carried ove * prev into current: * prev into current: */ */ spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP spin_acquire(&__rq_lockp(rq)->dep_map, 0, 0, _THIS_IP __balance_callbacks(rq); __balance_callbacks(rq); raw_spin_rq_unlock_irq(rq); raw_spin_rq_unlock_irq(rq); } } /* /* * NOP if the arch has not defined these: * NOP if the arch has not defined these: */ */ #ifndef prepare_arch_switch #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) # define prepare_arch_switch(next) do { } while (0) #endif #endif #ifndef finish_arch_post_lock_switch #ifndef finish_arch_post_lock_switch # define finish_arch_post_lock_switch() do { } while (0) # define finish_arch_post_lock_switch() do { } while (0) #endif #endif static inline void kmap_local_sched_out(void) static inline void kmap_local_sched_out(void) { { #ifdef CONFIG_KMAP_LOCAL #ifdef CONFIG_KMAP_LOCAL if (unlikely(current->kmap_ctrl.idx)) if (unlikely(current->kmap_ctrl.idx)) __kmap_local_sched_out(); __kmap_local_sched_out(); #endif #endif } } static inline void kmap_local_sched_in(void) static inline void kmap_local_sched_in(void) { { #ifdef CONFIG_KMAP_LOCAL #ifdef CONFIG_KMAP_LOCAL if (unlikely(current->kmap_ctrl.idx)) if (unlikely(current->kmap_ctrl.idx)) __kmap_local_sched_in(); __kmap_local_sched_in(); #endif #endif } } /** /** * prepare_task_switch - prepare to switch tasks * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch * @rq: the runqueue preparing to switch * @prev: the current task that is being switched out * @prev: the current task that is being switched out * @next: the task we are going to switch to. * @next: the task we are going to switch to. * * * This is called with the rq lock held and interrupts off. I * This is called with the rq lock held and interrupts off. I * be paired with a subsequent finish_task_switch after the c * be paired with a subsequent finish_task_switch after the c * switch. * switch. * * * prepare_task_switch sets up locking and calls architecture * prepare_task_switch sets up locking and calls architecture * hooks. * hooks. 
*/ */ static inline void static inline void prepare_task_switch(struct rq *rq, struct task_struct *prev, prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) struct task_struct *next) { { kcov_prepare_switch(prev); kcov_prepare_switch(prev); sched_info_switch(rq, prev, next); sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); perf_event_task_sched_out(prev, next); rseq_preempt(prev); rseq_preempt(prev); fire_sched_out_preempt_notifiers(prev, next); fire_sched_out_preempt_notifiers(prev, next); kmap_local_sched_out(); kmap_local_sched_out(); prepare_task(next); prepare_task(next); prepare_arch_switch(next); prepare_arch_switch(next); } } /** /** * finish_task_switch - clean up after a task-switch * finish_task_switch - clean up after a task-switch * @prev: the thread we just switched away from. * @prev: the thread we just switched away from. * * * finish_task_switch must be called after the context switch * finish_task_switch must be called after the context switch * with a prepare_task_switch call before the context switch. * with a prepare_task_switch call before the context switch. * finish_task_switch will reconcile locking set up by prepar * finish_task_switch will reconcile locking set up by prepar * and do any other architecture-specific cleanup actions. * and do any other architecture-specific cleanup actions. * * * Note that we may have delayed dropping an mm in context_sw * Note that we may have delayed dropping an mm in context_sw * so, we finish that here outside of the runqueue lock. (Doi * so, we finish that here outside of the runqueue lock. (Doi * with the lock held can cause deadlocks; see schedule() for * with the lock held can cause deadlocks; see schedule() for * details.) * details.) * * * The context switch have flipped the stack from under us an * The context switch have flipped the stack from under us an * local variables which were saved when this task called sch * local variables which were saved when this task called sch * past. prev == current is still correct but we need to reca * past. prev == current is still correct but we need to reca * because prev may have moved to another CPU. * because prev may have moved to another CPU. */ */ static struct rq *finish_task_switch(struct task_struct *prev static struct rq *finish_task_switch(struct task_struct *prev __releases(rq->lock) __releases(rq->lock) { { struct rq *rq = this_rq(); struct rq *rq = this_rq(); struct mm_struct *mm = rq->prev_mm; struct mm_struct *mm = rq->prev_mm; unsigned int prev_state; unsigned int prev_state; /* /* * The previous task will have left us with a preempt * The previous task will have left us with a preempt * because it left us after: * because it left us after: * * * schedule() * schedule() * preempt_disable(); // 1 * preempt_disable(); // 1 * __schedule() * __schedule() * raw_spin_lock_irq(&rq->lock) // 2 * raw_spin_lock_irq(&rq->lock) // 2 * * * Also, see FORK_PREEMPT_COUNT. * Also, see FORK_PREEMPT_COUNT. 
*/ */ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OF if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OF "corrupted preempt_count: %s/%d/0x%x\n" "corrupted preempt_count: %s/%d/0x%x\n" current->comm, current->pid, preempt_co current->comm, current->pid, preempt_co preempt_count_set(FORK_PREEMPT_COUNT); preempt_count_set(FORK_PREEMPT_COUNT); rq->prev_mm = NULL; rq->prev_mm = NULL; /* /* * A task struct has one reference for the use as "cu * A task struct has one reference for the use as "cu * If a task dies, then it sets TASK_DEAD in tsk->sta * If a task dies, then it sets TASK_DEAD in tsk->sta * schedule one last time. The schedule call will nev * schedule one last time. The schedule call will nev * the scheduled task must drop that reference. * the scheduled task must drop that reference. * * * We must observe prev->state before clearing prev-> * We must observe prev->state before clearing prev-> * finish_task), otherwise a concurrent wakeup can ge * finish_task), otherwise a concurrent wakeup can ge * running on another CPU and we could rave with its * running on another CPU and we could rave with its * transition, resulting in a double drop. * transition, resulting in a double drop. */ */ prev_state = READ_ONCE(prev->__state); prev_state = READ_ONCE(prev->__state); vtime_task_switch(prev); vtime_task_switch(prev); perf_event_task_sched_in(prev, current); perf_event_task_sched_in(prev, current); finish_task(prev); finish_task(prev); tick_nohz_task_switch(); tick_nohz_task_switch(); finish_lock_switch(rq); finish_lock_switch(rq); finish_arch_post_lock_switch(); finish_arch_post_lock_switch(); kcov_finish_switch(current); kcov_finish_switch(current); /* /* * kmap_local_sched_out() is invoked with rq::lock he * kmap_local_sched_out() is invoked with rq::lock he * interrupts disabled. There is no requirement for t * interrupts disabled. There is no requirement for t * sched out code does not have an interrupt enabled * sched out code does not have an interrupt enabled * Restoring the maps on sched in does not require in * Restoring the maps on sched in does not require in * disabled either. * disabled either. */ */ kmap_local_sched_in(); kmap_local_sched_in(); fire_sched_in_preempt_notifiers(current); fire_sched_in_preempt_notifiers(current); /* /* * When switching through a kernel thread, the loop i * When switching through a kernel thread, the loop i * membarrier_{private,global}_expedited() may have o * membarrier_{private,global}_expedited() may have o * kernel thread and not issued an IPI. It is therefo * kernel thread and not issued an IPI. It is therefo * schedule between user->kernel->user threads withou * schedule between user->kernel->user threads withou * switch_mm(). Membarrier requires a barrier after s * switch_mm(). Membarrier requires a barrier after s * rq->curr, before returning to userspace, so provid * rq->curr, before returning to userspace, so provid * * * - a full memory barrier for {PRIVATE,GLOBAL}_EXPED * - a full memory barrier for {PRIVATE,GLOBAL}_EXPED * provided by mmdrop_lazy_tlb(), * provided by mmdrop_lazy_tlb(), * - a sync_core for SYNC_CORE. * - a sync_core for SYNC_CORE. 
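 *
 * Rough sketch of the kernel->user transition being cleaned up here
 * (a summary, not text from the original source):
 *
 *      context_switch():                       // prev was a kernel thread
 *              rq->prev_mm = prev->active_mm;
 *      finish_task_switch():
 *              mm = rq->prev_mm;
 *              ...
 *              mmdrop_lazy_tlb_sched(mm);      // provides the full barrier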
*/ */ if (mm) { if (mm) { membarrier_mm_sync_core_before_usermode(mm); membarrier_mm_sync_core_before_usermode(mm); mmdrop_lazy_tlb_sched(mm); mmdrop_lazy_tlb_sched(mm); } } if (unlikely(prev_state == TASK_DEAD)) { if (unlikely(prev_state == TASK_DEAD)) { if (prev->sched_class->task_dead) if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); prev->sched_class->task_dead(prev); /* Task is done with its stack. */ /* Task is done with its stack. */ put_task_stack(prev); put_task_stack(prev); put_task_struct_rcu_user(prev); put_task_struct_rcu_user(prev); } } return rq; return rq; } } /** /** * schedule_tail - first thing a freshly forked thread must c * schedule_tail - first thing a freshly forked thread must c * @prev: the thread we just switched away from. * @prev: the thread we just switched away from. */ */ asmlinkage __visible void schedule_tail(struct task_struct *p asmlinkage __visible void schedule_tail(struct task_struct *p __releases(rq->lock) __releases(rq->lock) { { /* /* * New tasks start with FORK_PREEMPT_COUNT, see there * New tasks start with FORK_PREEMPT_COUNT, see there * finish_task_switch() for details. * finish_task_switch() for details. * * * finish_task_switch() will drop rq->lock() and lowe * finish_task_switch() will drop rq->lock() and lowe * and the preempt_enable() will end up enabling pree * and the preempt_enable() will end up enabling pree * PREEMPT_COUNT kernels). * PREEMPT_COUNT kernels). */ */ finish_task_switch(prev); finish_task_switch(prev); preempt_enable(); preempt_enable(); if (current->set_child_tid) if (current->set_child_tid) put_user(task_pid_vnr(current), current->set_ put_user(task_pid_vnr(current), current->set_ calculate_sigpending(); calculate_sigpending(); } } /* /* * context_switch - switch to the new MM and the new thread's * context_switch - switch to the new MM and the new thread's */ */ static __always_inline struct rq * static __always_inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next, struct rq_flags *rf) struct task_struct *next, struct rq_flags *rf) { { prepare_task_switch(rq, prev, next); prepare_task_switch(rq, prev, next); /* /* * For paravirt, this is coupled with an exit in swit * For paravirt, this is coupled with an exit in swit * combine the page table reload and the switch backe * combine the page table reload and the switch backe * one hypercall. * one hypercall. */ */ arch_start_context_switch(prev); arch_start_context_switch(prev); /* /* * kernel -> kernel lazy + transfer active * kernel -> kernel lazy + transfer active * user -> kernel lazy + mmgrab_lazy_tlb() active * user -> kernel lazy + mmgrab_lazy_tlb() active * * * kernel -> user switch + mmdrop_lazy_tlb() acti * kernel -> user switch + mmdrop_lazy_tlb() acti * user -> user switch * user -> user switch * * * switch_mm_cid() needs to be updated if the barrier * switch_mm_cid() needs to be updated if the barrier * by context_switch() are modified. * by context_switch() are modified. 
*/ */ if (!next->mm) { // to if (!next->mm) { // to enter_lazy_tlb(prev->active_mm, next); enter_lazy_tlb(prev->active_mm, next); next->active_mm = prev->active_mm; next->active_mm = prev->active_mm; if (prev->mm) // fr if (prev->mm) // fr mmgrab_lazy_tlb(prev->active_mm); mmgrab_lazy_tlb(prev->active_mm); else else prev->active_mm = NULL; prev->active_mm = NULL; } else { // to } else { // to membarrier_switch_mm(rq, prev->active_mm, nex membarrier_switch_mm(rq, prev->active_mm, nex /* /* * sys_membarrier() requires an smp_mb() betw * sys_membarrier() requires an smp_mb() betw * rq->curr / membarrier_switch_mm() and retu * rq->curr / membarrier_switch_mm() and retu * * * The below provides this either through swi * The below provides this either through swi * case 'prev->active_mm == next->mm' through * case 'prev->active_mm == next->mm' through * finish_task_switch()'s mmdrop(). * finish_task_switch()'s mmdrop(). */ */ switch_mm_irqs_off(prev->active_mm, next->mm, switch_mm_irqs_off(prev->active_mm, next->mm, lru_gen_use_mm(next->mm); lru_gen_use_mm(next->mm); if (!prev->mm) { // fr if (!prev->mm) { // fr /* will mmdrop_lazy_tlb() in finish_t /* will mmdrop_lazy_tlb() in finish_t rq->prev_mm = prev->active_mm; rq->prev_mm = prev->active_mm; prev->active_mm = NULL; prev->active_mm = NULL; } } } } /* switch_mm_cid() requires the memory barriers above /* switch_mm_cid() requires the memory barriers above switch_mm_cid(rq, prev, next); switch_mm_cid(rq, prev, next); prepare_lock_switch(rq, next, rf); prepare_lock_switch(rq, next, rf); /* Here we just switch the register state and the sta /* Here we just switch the register state and the sta switch_to(prev, next, prev); switch_to(prev, next, prev); barrier(); barrier(); return finish_task_switch(prev); return finish_task_switch(prev); } } /* /* * nr_running and nr_context_switches: * nr_running and nr_context_switches: * * * externally visible scheduler statistics: current number of * externally visible scheduler statistics: current number of * threads, total number of context switches performed since * threads, total number of context switches performed since */ */ unsigned int nr_running(void) unsigned int nr_running(void) { { unsigned int i, sum = 0; unsigned int i, sum = 0; for_each_online_cpu(i) for_each_online_cpu(i) sum += cpu_rq(i)->nr_running; sum += cpu_rq(i)->nr_running; return sum; return sum; } } /* /* * Check if only the current task is running on the CPU. * Check if only the current task is running on the CPU. * * * Caution: this function does not check that the caller has * Caution: this function does not check that the caller has * preemption, thus the result might have a time-of-check-to- * preemption, thus the result might have a time-of-check-to- * race. The caller is responsible to use it correctly, for * race. The caller is responsible to use it correctly, for * * * - from a non-preemptible section (of course) * - from a non-preemptible section (of course) * * * - from a thread that is bound to a single CPU * - from a thread that is bound to a single CPU * * * - in a loop with very short iterations (e.g. a polling loo * - in a loop with very short iterations (e.g. 
 *   a polling loop)
 */
bool single_task_running(void)
{
        return raw_rq()->nr_running == 1;
}
EXPORT_SYMBOL(single_task_running);

unsigned long long nr_context_switches_cpu(int cpu)
{
        return cpu_rq(cpu)->nr_switches;
}

unsigned long long nr_context_switches(void)
{
        int i;
        unsigned long long sum = 0;

        for_each_possible_cpu(i)
                sum += cpu_rq(i)->nr_switches;

        return sum;
}

/*
 * Consumers of these two interfaces, like for example the cpuidle menu
 * governor, are using nonsensical data. Preferring shallow idle states
 * for a CPU that has IO-wait which might not even end up running the task when
 * it does become runnable.
 */
unsigned int nr_iowait_cpu(int cpu)
{
        return atomic_read(&cpu_rq(cpu)->nr_iowait);
}

/*
 * IO-wait accounting, and how it's mostly bollocks (on SMP).
 *
 * The idea behind IO-wait account is to account the idle time that we could
 * have spend running if it were not for IO. That is, if we were to improve the
 * storage performance, we'd have a proportional reduction in IO-wait time.
 *
 * This all works nicely on UP, where, when a task blocks on IO, we account
 * idle time as IO-wait, because if the storage were faster, it could've been
 * running and we'd not be idle.
 *
 * This has been extended to SMP, by doing the same for each CPU. This however
 * is broken.
 *
 * Imagine for instance the case where two tasks block on one CPU, only the one
 * CPU will have IO-wait accounted, while the other has regular idle. Even
 * though, if the storage were faster, both could've ran at the same time,
 * utilising both CPUs.
 *
 * This means, that when looking globally, the current IO-wait accounting on
 * SMP is a lower bound, by reason of under accounting.
 *
 * Worse, since the numbers are provided per CPU, they are sometimes
 * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly
 * associated with any one particular CPU, it can wake to another CPU than the
 * one it blocked on. This means the per CPU IO-wait number is meaningless.
 *
 * Task CPU affinities can make all that even more 'interesting'.
 */

unsigned int nr_iowait(void)
{
        unsigned int i, sum = 0;

        for_each_possible_cpu(i)
                sum += nr_iowait_cpu(i);

        return sum;
}

#ifdef CONFIG_SMP

/*
 * sched_exec - execve() is a valuable balancing opportunity, because at
 * this point the task has the smallest effective memory and cache footprint.
 */
void sched_exec(void)
{
        struct task_struct *p = current;
        struct migration_arg arg;
        int dest_cpu;

        scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
                dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC);
                if (dest_cpu == smp_processor_id())
                        return;

                if (unlikely(!cpu_active(dest_cpu)))
                        return;

                arg = (struct migration_arg){ p, dest_cpu };
        }
        stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
}

#endif

DEFINE_PER_CPU(struct kernel_stat, kstat);
DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);

EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);

/*
 * The function fair_sched_class.update_curr accesses the struct curr
 * and its field curr->exec_start; when called from task_sched_runtime(),
 * we observe a high rate of cache misses in practice.
 * Prefetching this data results in improved performance.
 */
static inline void prefetch_curr_exec_start(struct task_struct *p)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
        struct sched_entity *curr = (&p->se)->cfs_rq->curr;
#else
        struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
#endif
        prefetch(curr);
        prefetch(&curr->exec_start);
}

/*
 * Return accounted runtime for the task.
 * In case the task is currently running, return the runtime plus current's
 * pending runtime that have not been accounted yet.
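 *
 * Userspace typically reaches this through the POSIX per-thread CPU
 * clock; an illustrative sketch (an assumption, not text from the
 * original source):
 *
 *      struct timespec ts;
 *      clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);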
*/ */ unsigned long long task_sched_runtime(struct task_struct *p) unsigned long long task_sched_runtime(struct task_struct *p) { { struct rq_flags rf; struct rq_flags rf; struct rq *rq; struct rq *rq; u64 ns; u64 ns; #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) /* /* * 64-bit doesn't need locks to atomically read a 64- * 64-bit doesn't need locks to atomically read a 64- * So we have a optimization chance when the task's d * So we have a optimization chance when the task's d * Reading ->on_cpu is racy, but this is ok. * Reading ->on_cpu is racy, but this is ok. * * * If we race with it leaving CPU, we'll take a lock. * If we race with it leaving CPU, we'll take a lock. * If we race with it entering CPU, unaccounted time * If we race with it entering CPU, unaccounted time * indistinguishable from the read occurring a few cy * indistinguishable from the read occurring a few cy * If we see ->on_cpu without ->on_rq, the task is le * If we see ->on_cpu without ->on_rq, the task is le * been accounted, so we're correct here as well. * been accounted, so we're correct here as well. */ */ if (!p->on_cpu || !task_on_rq_queued(p)) if (!p->on_cpu || !task_on_rq_queued(p)) return p->se.sum_exec_runtime; return p->se.sum_exec_runtime; #endif #endif rq = task_rq_lock(p, &rf); rq = task_rq_lock(p, &rf); /* /* * Must be ->curr _and_ ->on_rq. If dequeued, we wou * Must be ->curr _and_ ->on_rq. If dequeued, we wou * project cycles that may never be accounted to this * project cycles that may never be accounted to this * thread, breaking clock_gettime(). * thread, breaking clock_gettime(). */ */ if (task_current(rq, p) && task_on_rq_queued(p)) { if (task_current(rq, p) && task_on_rq_queued(p)) { prefetch_curr_exec_start(p); prefetch_curr_exec_start(p); update_rq_clock(rq); update_rq_clock(rq); p->sched_class->update_curr(rq); p->sched_class->update_curr(rq); } } ns = p->se.sum_exec_runtime; ns = p->se.sum_exec_runtime; task_rq_unlock(rq, p, &rf); task_rq_unlock(rq, p, &rf); return ns; return ns; } } #ifdef CONFIG_SCHED_DEBUG #ifdef CONFIG_SCHED_DEBUG static u64 cpu_resched_latency(struct rq *rq) static u64 cpu_resched_latency(struct rq *rq) { { int latency_warn_ms = READ_ONCE(sysctl_resched_latenc int latency_warn_ms = READ_ONCE(sysctl_resched_latenc u64 resched_latency, now = rq_clock(rq); u64 resched_latency, now = rq_clock(rq); static bool warned_once; static bool warned_once; if (sysctl_resched_latency_warn_once && warned_once) if (sysctl_resched_latency_warn_once && warned_once) return 0; return 0; if (!need_resched() || !latency_warn_ms) if (!need_resched() || !latency_warn_ms) return 0; return 0; if (system_state == SYSTEM_BOOTING) if (system_state == SYSTEM_BOOTING) return 0; return 0; if (!rq->last_seen_need_resched_ns) { if (!rq->last_seen_need_resched_ns) { rq->last_seen_need_resched_ns = now; rq->last_seen_need_resched_ns = now; rq->ticks_without_resched = 0; rq->ticks_without_resched = 0; return 0; return 0; } } rq->ticks_without_resched++; rq->ticks_without_resched++; resched_latency = now - rq->last_seen_need_resched_ns resched_latency = now - rq->last_seen_need_resched_ns if (resched_latency <= latency_warn_ms * NSEC_PER_MSE if (resched_latency <= latency_warn_ms * NSEC_PER_MSE return 0; return 0; warned_once = true; warned_once = true; return resched_latency; return resched_latency; } } static int __init setup_resched_latency_warn_ms(char *str) static int __init setup_resched_latency_warn_ms(char *str) { { long val; long val; if 
((kstrtol(str, 0, &val))) { if ((kstrtol(str, 0, &val))) { pr_warn("Unable to set resched_latency_warn_m pr_warn("Unable to set resched_latency_warn_m return 1; return 1; } } sysctl_resched_latency_warn_ms = val; sysctl_resched_latency_warn_ms = val; return 1; return 1; } } __setup("resched_latency_warn_ms=", setup_resched_latency_war __setup("resched_latency_warn_ms=", setup_resched_latency_war #else #else static inline u64 cpu_resched_latency(struct rq *rq) { return static inline u64 cpu_resched_latency(struct rq *rq) { return #endif /* CONFIG_SCHED_DEBUG */ #endif /* CONFIG_SCHED_DEBUG */ /* /* * This function gets called by the timer code, with HZ frequ * This function gets called by the timer code, with HZ frequ * We call it with interrupts disabled. * We call it with interrupts disabled. */ */ void scheduler_tick(void) void scheduler_tick(void) { { int cpu = smp_processor_id(); int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; struct task_struct *curr = rq->curr; struct rq_flags rf; struct rq_flags rf; unsigned long thermal_pressure; unsigned long thermal_pressure; u64 resched_latency; u64 resched_latency; if (housekeeping_cpu(cpu, HK_TYPE_TICK)) if (housekeeping_cpu(cpu, HK_TYPE_TICK)) arch_scale_freq_tick(); arch_scale_freq_tick(); sched_clock_tick(); sched_clock_tick(); rq_lock(rq, &rf); rq_lock(rq, &rf); update_rq_clock(rq); update_rq_clock(rq); thermal_pressure = arch_scale_thermal_pressure(cpu_of thermal_pressure = arch_scale_thermal_pressure(cpu_of update_thermal_load_avg(rq_clock_thermal(rq), rq, the update_thermal_load_avg(rq_clock_thermal(rq), rq, the curr->sched_class->task_tick(rq, curr, 0); curr->sched_class->task_tick(rq, curr, 0); if (sched_feat(LATENCY_WARN)) if (sched_feat(LATENCY_WARN)) resched_latency = cpu_resched_latency(rq); resched_latency = cpu_resched_latency(rq); calc_global_load_tick(rq); calc_global_load_tick(rq); sched_core_tick(rq); sched_core_tick(rq); task_tick_mm_cid(rq, curr); task_tick_mm_cid(rq, curr); rq_unlock(rq, &rf); rq_unlock(rq, &rf); if (sched_feat(LATENCY_WARN) && resched_latency) if (sched_feat(LATENCY_WARN) && resched_latency) resched_latency_warn(cpu, resched_latency); resched_latency_warn(cpu, resched_latency); perf_event_task_tick(); perf_event_task_tick(); if (curr->flags & PF_WQ_WORKER) if (curr->flags & PF_WQ_WORKER) wq_worker_tick(curr); wq_worker_tick(curr); #ifdef CONFIG_SMP #ifdef CONFIG_SMP rq->idle_balance = idle_cpu(cpu); rq->idle_balance = idle_cpu(cpu); trigger_load_balance(rq); trigger_load_balance(rq); #endif #endif } } #ifdef CONFIG_NO_HZ_FULL #ifdef CONFIG_NO_HZ_FULL struct tick_work { struct tick_work { int cpu; int cpu; atomic_t state; atomic_t state; struct delayed_work work; struct delayed_work work; }; }; /* Values for ->state, see diagram below. */ /* Values for ->state, see diagram below. 
 */
#define TICK_SCHED_REMOTE_OFFLINE       0
#define TICK_SCHED_REMOTE_OFFLINING     1
#define TICK_SCHED_REMOTE_RUNNING       2

/*
 * State diagram for ->state:
 *
 *
 *          TICK_SCHED_REMOTE_OFFLINE
 *                    |   ^
 *                    |   |
 *                    |   | sched_tick_remote()
 *                    |   |
 *                    |   |
 *                    +--TICK_SCHED_REMOTE_OFFLINING
 *                    |   ^
 *                    |   |
 * sched_tick_start() |   | sched_tick_stop()
 *                    |   |
 *                    V   |
 *          TICK_SCHED_REMOTE_RUNNING
 *
 *
 * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote()
 * and sched_tick_start() are happy to leave the state in RUNNING.
 */

static struct tick_work __percpu *tick_work_cpu;

static void sched_tick_remote(struct work_struct *work)
{
        struct delayed_work *dwork = to_delayed_work(work);
        struct tick_work *twork = container_of(dwork, struct tick_work, work);
        int cpu = twork->cpu;
        struct rq *rq = cpu_rq(cpu);
        int os;

        /*
         * Handle the tick only if it appears the remote CPU is running in full
         * dynticks mode. The check is racy by nature, but missing a tick or
         * having one too much is no big deal because the scheduler only relies
         * on statistics and checks timeslices in a time-independent way,
         * regardless of when exactly it is running.
         */
        if (tick_nohz_tick_stopped_cpu(cpu)) {
                guard(rq_lock_irq)(rq);
                struct task_struct *curr = rq->curr;

                if (cpu_online(cpu)) {
                        update_rq_clock(rq);

                        if (!is_idle_task(curr)) {
                                /*
                                 * Make sure the next tick runs within a
                                 * reasonable amount of time.
                                 */
                                u64 delta = rq_clock_task(rq) - curr->se.exec_start;
                                WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
                        }
                        curr->sched_class->task_tick(rq, curr, 0);

                        calc_load_nohz_remote(rq);
                }
        }

        /*
         * Run the remote tick once per second (1Hz).
This ar * frequency is large enough to avoid overload but sh * frequency is large enough to avoid overload but sh * to keep scheduler internal stats reasonably up to * to keep scheduler internal stats reasonably up to * first update state to reflect hotplug activity if * first update state to reflect hotplug activity if */ */ os = atomic_fetch_add_unless(&twork->state, -1, TICK_ os = atomic_fetch_add_unless(&twork->state, -1, TICK_ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); if (os == TICK_SCHED_REMOTE_RUNNING) if (os == TICK_SCHED_REMOTE_RUNNING) queue_delayed_work(system_unbound_wq, dwork, queue_delayed_work(system_unbound_wq, dwork, } } static void sched_tick_start(int cpu) static void sched_tick_start(int cpu) { { int os; int os; struct tick_work *twork; struct tick_work *twork; if (housekeeping_cpu(cpu, HK_TYPE_TICK)) if (housekeeping_cpu(cpu, HK_TYPE_TICK)) return; return; WARN_ON_ONCE(!tick_work_cpu); WARN_ON_ONCE(!tick_work_cpu); twork = per_cpu_ptr(tick_work_cpu, cpu); twork = per_cpu_ptr(tick_work_cpu, cpu); os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUN os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUN WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); if (os == TICK_SCHED_REMOTE_OFFLINE) { if (os == TICK_SCHED_REMOTE_OFFLINE) { twork->cpu = cpu; twork->cpu = cpu; INIT_DELAYED_WORK(&twork->work, sched_tick_re INIT_DELAYED_WORK(&twork->work, sched_tick_re queue_delayed_work(system_unbound_wq, &twork- queue_delayed_work(system_unbound_wq, &twork- } } } } #ifdef CONFIG_HOTPLUG_CPU #ifdef CONFIG_HOTPLUG_CPU static void sched_tick_stop(int cpu) static void sched_tick_stop(int cpu) { { struct tick_work *twork; struct tick_work *twork; int os; int os; if (housekeeping_cpu(cpu, HK_TYPE_TICK)) if (housekeeping_cpu(cpu, HK_TYPE_TICK)) return; return; WARN_ON_ONCE(!tick_work_cpu); WARN_ON_ONCE(!tick_work_cpu); twork = per_cpu_ptr(tick_work_cpu, cpu); twork = per_cpu_ptr(tick_work_cpu, cpu); /* There cannot be competing actions, but don't rely /* There cannot be competing actions, but don't rely os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFF os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFF WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING); WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING); /* Don't cancel, as this would mess up the state mach /* Don't cancel, as this would mess up the state mach } } #endif /* CONFIG_HOTPLUG_CPU */ #endif /* CONFIG_HOTPLUG_CPU */ int __init sched_tick_offload_init(void) int __init sched_tick_offload_init(void) { { tick_work_cpu = alloc_percpu(struct tick_work); tick_work_cpu = alloc_percpu(struct tick_work); BUG_ON(!tick_work_cpu); BUG_ON(!tick_work_cpu); return 0; return 0; } } #else /* !CONFIG_NO_HZ_FULL */ #else /* !CONFIG_NO_HZ_FULL */ static inline void sched_tick_start(int cpu) { } static inline void sched_tick_start(int cpu) { } static inline void sched_tick_stop(int cpu) { } static inline void sched_tick_stop(int cpu) { } #endif #endif #if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEM #if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEM defined(CONFIG_TRACE_PREEMPT_ defined(CONFIG_TRACE_PREEMPT_ /* /* * If the value passed in is equal to the current preempt cou * If the value passed in is equal to the current preempt cou * then we just disabled preemption. Start timing the latency * then we just disabled preemption. 
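 *
 * A minimal sketch of the nesting these hooks track (an illustration,
 * not text from the original source):
 *
 *      preempt_disable();      // 0 -> 1: preempt_latency_start() fires
 *        preempt_disable();    // 1 -> 2: nested, no trace event
 *        preempt_enable();     // 2 -> 1: nested, no trace event
 *      preempt_enable();       // 1 -> 0: preempt_latency_stop() fires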
Start timing the latency */ */ static inline void preempt_latency_start(int val) static inline void preempt_latency_start(int val) { { if (preempt_count() == val) { if (preempt_count() == val) { unsigned long ip = get_lock_parent_ip(); unsigned long ip = get_lock_parent_ip(); #ifdef CONFIG_DEBUG_PREEMPT #ifdef CONFIG_DEBUG_PREEMPT current->preempt_disable_ip = ip; current->preempt_disable_ip = ip; #endif #endif trace_preempt_off(CALLER_ADDR0, ip); trace_preempt_off(CALLER_ADDR0, ip); } } } } void preempt_count_add(int val) void preempt_count_add(int val) { { #ifdef CONFIG_DEBUG_PREEMPT #ifdef CONFIG_DEBUG_PREEMPT /* /* * Underflow? * Underflow? */ */ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) return; return; #endif #endif __preempt_count_add(val); __preempt_count_add(val); #ifdef CONFIG_DEBUG_PREEMPT #ifdef CONFIG_DEBUG_PREEMPT /* /* * Spinlock count overflowing soon? * Spinlock count overflowing soon? */ */ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) PREEMPT_MASK - 10); PREEMPT_MASK - 10); #endif #endif preempt_latency_start(val); preempt_latency_start(val); } } EXPORT_SYMBOL(preempt_count_add); EXPORT_SYMBOL(preempt_count_add); NOKPROBE_SYMBOL(preempt_count_add); NOKPROBE_SYMBOL(preempt_count_add); /* /* * If the value passed in equals to the current preempt count * If the value passed in equals to the current preempt count * then we just enabled preemption. Stop timing the latency. * then we just enabled preemption. Stop timing the latency. */ */ static inline void preempt_latency_stop(int val) static inline void preempt_latency_stop(int val) { { if (preempt_count() == val) if (preempt_count() == val) trace_preempt_on(CALLER_ADDR0, get_lock_paren trace_preempt_on(CALLER_ADDR0, get_lock_paren } } void preempt_count_sub(int val) void preempt_count_sub(int val) { { #ifdef CONFIG_DEBUG_PREEMPT #ifdef CONFIG_DEBUG_PREEMPT /* /* * Underflow? * Underflow? */ */ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) return; return; /* /* * Is the spinlock portion underflowing? * Is the spinlock portion underflowing? 
*/ */ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK))) !(preempt_count() & PREEMPT_MASK))) return; return; #endif #endif preempt_latency_stop(val); preempt_latency_stop(val); __preempt_count_sub(val); __preempt_count_sub(val); } } EXPORT_SYMBOL(preempt_count_sub); EXPORT_SYMBOL(preempt_count_sub); NOKPROBE_SYMBOL(preempt_count_sub); NOKPROBE_SYMBOL(preempt_count_sub); #else #else static inline void preempt_latency_start(int val) { } static inline void preempt_latency_start(int val) { } static inline void preempt_latency_stop(int val) { } static inline void preempt_latency_stop(int val) { } #endif #endif static inline unsigned long get_preempt_disable_ip(struct tas static inline unsigned long get_preempt_disable_ip(struct tas { { #ifdef CONFIG_DEBUG_PREEMPT #ifdef CONFIG_DEBUG_PREEMPT return p->preempt_disable_ip; return p->preempt_disable_ip; #else #else return 0; return 0; #endif #endif } } /* /* * Print scheduling while atomic bug: * Print scheduling while atomic bug: */ */ static noinline void __schedule_bug(struct task_struct *prev) static noinline void __schedule_bug(struct task_struct *prev) { { /* Save this before calling printk(), since that will /* Save this before calling printk(), since that will unsigned long preempt_disable_ip = get_preempt_disabl unsigned long preempt_disable_ip = get_preempt_disabl if (oops_in_progress) if (oops_in_progress) return; return; printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/ prev->comm, prev->pid, preempt_count()); prev->comm, prev->pid, preempt_count()); debug_show_held_locks(prev); debug_show_held_locks(prev); print_modules(); print_modules(); if (irqs_disabled()) if (irqs_disabled()) print_irqtrace_events(prev); print_irqtrace_events(prev); if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) && in_atomic_preempt_off()) { && in_atomic_preempt_off()) { pr_err("Preemption disabled at:"); pr_err("Preemption disabled at:"); print_ip_sym(KERN_ERR, preempt_disable_ip); print_ip_sym(KERN_ERR, preempt_disable_ip); } } check_panic_on_warn("scheduling while atomic"); check_panic_on_warn("scheduling while atomic"); dump_stack(); dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } } /* /* * Various schedule()-time debugging checks and statistics: * Various schedule()-time debugging checks and statistics: */ */ static inline void schedule_debug(struct task_struct *prev, b static inline void schedule_debug(struct task_struct *prev, b { { #ifdef CONFIG_SCHED_STACK_END_CHECK #ifdef CONFIG_SCHED_STACK_END_CHECK if (task_stack_end_corrupted(prev)) if (task_stack_end_corrupted(prev)) panic("corrupted stack end detected inside sc panic("corrupted stack end detected inside sc if (task_scs_end_corrupted(prev)) if (task_scs_end_corrupted(prev)) panic("corrupted shadow stack detected inside panic("corrupted shadow stack detected inside #endif #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP #ifdef CONFIG_DEBUG_ATOMIC_SLEEP if (!preempt && READ_ONCE(prev->__state) && prev->non if (!preempt && READ_ONCE(prev->__state) && prev->non printk(KERN_ERR "BUG: scheduling in a non-blo printk(KERN_ERR "BUG: scheduling in a non-blo prev->comm, prev->pid, prev->non_bloc prev->comm, prev->pid, prev->non_bloc dump_stack(); dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } } #endif #endif if (unlikely(in_atomic_preempt_off())) { if 
(unlikely(in_atomic_preempt_off())) { __schedule_bug(prev); __schedule_bug(prev); preempt_count_set(PREEMPT_DISABLED); preempt_count_set(PREEMPT_DISABLED); } } rcu_sleep_check(); rcu_sleep_check(); SCHED_WARN_ON(ct_state() == CONTEXT_USER); SCHED_WARN_ON(ct_state() == CONTEXT_USER); profile_hit(SCHED_PROFILING, __builtin_return_address profile_hit(SCHED_PROFILING, __builtin_return_address schedstat_inc(this_rq()->sched_count); schedstat_inc(this_rq()->sched_count); } } static void put_prev_task_balance(struct rq *rq, struct task_ static void put_prev_task_balance(struct rq *rq, struct task_ struct rq_flags *rf) struct rq_flags *rf) { { #ifdef CONFIG_SMP #ifdef CONFIG_SMP const struct sched_class *class; const struct sched_class *class; /* /* * We must do the balancing pass before put_prev_task * We must do the balancing pass before put_prev_task * that when we release the rq->lock the task is in t * that when we release the rq->lock the task is in t * state as before we took rq->lock. * state as before we took rq->lock. * * * We can terminate the balance pass as soon as we kn * We can terminate the balance pass as soon as we kn * a runnable task of @class priority or higher. * a runnable task of @class priority or higher. */ */ for_class_range(class, prev->sched_class, &idle_sched for_class_range(class, prev->sched_class, &idle_sched if (class->balance(rq, prev, rf)) if (class->balance(rq, prev, rf)) break; break; } } #endif #endif put_prev_task(rq, prev); put_prev_task(rq, prev); } } /* /* * Pick up the highest-prio task: * Pick up the highest-prio task: */ */ static inline struct task_struct * static inline struct task_struct * __pick_next_task(struct rq *rq, struct task_struct *prev, str __pick_next_task(struct rq *rq, struct task_struct *prev, str { { const struct sched_class *class; const struct sched_class *class; struct task_struct *p; struct task_struct *p; /* /* * Optimization: we know that if all tasks are in the * Optimization: we know that if all tasks are in the * call that function directly, but only if the @prev * call that function directly, but only if the @prev * higher scheduling class, because otherwise those l * higher scheduling class, because otherwise those l * opportunity to pull in more work from other CPUs. * opportunity to pull in more work from other CPUs. 
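 *
 * Put differently (an illustration, not text from the original source),
 * the fast path below is only taken when:
 *
 *      !sched_class_above(prev->sched_class, &fair_sched_class) &&
 *      rq->nr_running == rq->cfs.h_nr_running
 *
 * i.e. every runnable task on this rq is in the fair class and @prev is
 * not of a higher class, so pick_next_task_fair() can be used directly.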
*/ */ if (likely(!sched_class_above(prev->sched_class, &fai if (likely(!sched_class_above(prev->sched_class, &fai rq->nr_running == rq->cfs.h_nr_running)) { rq->nr_running == rq->cfs.h_nr_running)) { p = pick_next_task_fair(rq, prev, rf); p = pick_next_task_fair(rq, prev, rf); if (unlikely(p == RETRY_TASK)) if (unlikely(p == RETRY_TASK)) goto restart; goto restart; /* Assume the next prioritized class is idle_ /* Assume the next prioritized class is idle_ if (!p) { if (!p) { put_prev_task(rq, prev); put_prev_task(rq, prev); p = pick_next_task_idle(rq); p = pick_next_task_idle(rq); } } return p; return p; } } restart: restart: put_prev_task_balance(rq, prev, rf); put_prev_task_balance(rq, prev, rf); for_each_class(class) { for_each_class(class) { p = class->pick_next_task(rq); p = class->pick_next_task(rq); if (p) if (p) return p; return p; } } BUG(); /* The idle class should always have a runnabl BUG(); /* The idle class should always have a runnabl } } #ifdef CONFIG_SCHED_CORE #ifdef CONFIG_SCHED_CORE static inline bool is_task_rq_idle(struct task_struct *t) static inline bool is_task_rq_idle(struct task_struct *t) { { return (task_rq(t)->idle == t); return (task_rq(t)->idle == t); } } static inline bool cookie_equals(struct task_struct *a, unsig static inline bool cookie_equals(struct task_struct *a, unsig { { return is_task_rq_idle(a) || (a->core_cookie == cooki return is_task_rq_idle(a) || (a->core_cookie == cooki } } static inline bool cookie_match(struct task_struct *a, struct static inline bool cookie_match(struct task_struct *a, struct { { if (is_task_rq_idle(a) || is_task_rq_idle(b)) if (is_task_rq_idle(a) || is_task_rq_idle(b)) return true; return true; return a->core_cookie == b->core_cookie; return a->core_cookie == b->core_cookie; } } static inline struct task_struct *pick_task(struct rq *rq) static inline struct task_struct *pick_task(struct rq *rq) { { const struct sched_class *class; const struct sched_class *class; struct task_struct *p; struct task_struct *p; for_each_class(class) { for_each_class(class) { p = class->pick_task(rq); p = class->pick_task(rq); if (p) if (p) return p; return p; } } BUG(); /* The idle class should always have a runnabl BUG(); /* The idle class should always have a runnabl } } extern void task_vruntime_update(struct rq *rq, struct task_s extern void task_vruntime_update(struct rq *rq, struct task_s static void queue_core_balance(struct rq *rq); static void queue_core_balance(struct rq *rq); static struct task_struct * static struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev, struc pick_next_task(struct rq *rq, struct task_struct *prev, struc { { struct task_struct *next, *p, *max = NULL; struct task_struct *next, *p, *max = NULL; const struct cpumask *smt_mask; const struct cpumask *smt_mask; bool fi_before = false; bool fi_before = false; bool core_clock_updated = (rq == rq->core); bool core_clock_updated = (rq == rq->core); unsigned long cookie; unsigned long cookie; int i, cpu, occ = 0; int i, cpu, occ = 0; struct rq *rq_i; struct rq *rq_i; bool need_sync; bool need_sync; if (!sched_core_enabled(rq)) if (!sched_core_enabled(rq)) return __pick_next_task(rq, prev, rf); return __pick_next_task(rq, prev, rf); cpu = cpu_of(rq); cpu = cpu_of(rq); /* Stopper task is switching into idle, no need core- /* Stopper task is switching into idle, no need core- if (cpu_is_offline(cpu)) { if (cpu_is_offline(cpu)) { /* /* * Reset core_pick so that we don't enter the * Reset core_pick so that we don't enter the * coming online. 
core_pick would already be * coming online. core_pick would already be * another cpu during offline. * another cpu during offline. */ */ rq->core_pick = NULL; rq->core_pick = NULL; return __pick_next_task(rq, prev, rf); return __pick_next_task(rq, prev, rf); } } /* /* * If there were no {en,de}queues since we picked (IO * If there were no {en,de}queues since we picked (IO * pointers are all still valid), and we haven't sche * pointers are all still valid), and we haven't sche * pick yet, do so now. * pick yet, do so now. * * * rq->core_pick can be NULL if no selection was made * rq->core_pick can be NULL if no selection was made * it was either offline or went offline during a sib * it was either offline or went offline during a sib * selection. In this case, do a core-wide selection. * selection. In this case, do a core-wide selection. */ */ if (rq->core->core_pick_seq == rq->core->core_task_se if (rq->core->core_pick_seq == rq->core->core_task_se rq->core->core_pick_seq != rq->core_sched_seq && rq->core->core_pick_seq != rq->core_sched_seq && rq->core_pick) { rq->core_pick) { WRITE_ONCE(rq->core_sched_seq, rq->core->core WRITE_ONCE(rq->core_sched_seq, rq->core->core next = rq->core_pick; next = rq->core_pick; if (next != prev) { if (next != prev) { put_prev_task(rq, prev); put_prev_task(rq, prev); set_next_task(rq, next); set_next_task(rq, next); } } rq->core_pick = NULL; rq->core_pick = NULL; goto out; goto out; } } put_prev_task_balance(rq, prev, rf); put_prev_task_balance(rq, prev, rf); smt_mask = cpu_smt_mask(cpu); smt_mask = cpu_smt_mask(cpu); need_sync = !!rq->core->core_cookie; need_sync = !!rq->core->core_cookie; /* reset state */ /* reset state */ rq->core->core_cookie = 0UL; rq->core->core_cookie = 0UL; if (rq->core->core_forceidle_count) { if (rq->core->core_forceidle_count) { if (!core_clock_updated) { if (!core_clock_updated) { update_rq_clock(rq->core); update_rq_clock(rq->core); core_clock_updated = true; core_clock_updated = true; } } sched_core_account_forceidle(rq); sched_core_account_forceidle(rq); /* reset after accounting force idle */ /* reset after accounting force idle */ rq->core->core_forceidle_start = 0; rq->core->core_forceidle_start = 0; rq->core->core_forceidle_count = 0; rq->core->core_forceidle_count = 0; rq->core->core_forceidle_occupation = 0; rq->core->core_forceidle_occupation = 0; need_sync = true; need_sync = true; fi_before = true; fi_before = true; } } /* /* * core->core_task_seq, core->core_pick_seq, rq->core * core->core_task_seq, core->core_pick_seq, rq->core * * * @task_seq guards the task state ({en,de}queues) * @task_seq guards the task state ({en,de}queues) * @pick_seq is the @task_seq we did a selection on * @pick_seq is the @task_seq we did a selection on * @sched_seq is the @pick_seq we scheduled * @sched_seq is the @pick_seq we scheduled * * * However, preemptions can cause multiple picks on t * However, preemptions can cause multiple picks on t * 'Fix' this by also increasing @task_seq for every * 'Fix' this by also increasing @task_seq for every */ */ rq->core->core_task_seq++; rq->core->core_task_seq++; /* /* * Optimize for common case where this CPU has no coo * Optimize for common case where this CPU has no coo * and there are no cookied tasks running on siblings * and there are no cookied tasks running on siblings */ */ if (!need_sync) { if (!need_sync) { next = pick_task(rq); next = pick_task(rq); if (!next->core_cookie) { if (!next->core_cookie) { rq->core_pick = NULL; rq->core_pick = NULL; /* /* * For robustness, update the 
min_vru * For robustness, update the min_vru * unconstrained picks as well. * unconstrained picks as well. */ */ WARN_ON_ONCE(fi_before); WARN_ON_ONCE(fi_before); task_vruntime_update(rq, next, false) task_vruntime_update(rq, next, false) goto out_set_next; goto out_set_next; } } } } /* /* * For each thread: do the regular task pick and find * For each thread: do the regular task pick and find * amongst them. * amongst them. * * * Tie-break prio towards the current CPU * Tie-break prio towards the current CPU */ */ for_each_cpu_wrap(i, smt_mask, cpu) { for_each_cpu_wrap(i, smt_mask, cpu) { rq_i = cpu_rq(i); rq_i = cpu_rq(i); /* /* * Current cpu always has its clock updated o * Current cpu always has its clock updated o * pick_next_task(). If the current cpu is no * pick_next_task(). If the current cpu is no * the core may also have been updated above. * the core may also have been updated above. */ */ if (i != cpu && (rq_i != rq->core || !core_cl if (i != cpu && (rq_i != rq->core || !core_cl update_rq_clock(rq_i); update_rq_clock(rq_i); p = rq_i->core_pick = pick_task(rq_i); p = rq_i->core_pick = pick_task(rq_i); if (!max || prio_less(max, p, fi_before)) if (!max || prio_less(max, p, fi_before)) max = p; max = p; } } cookie = rq->core->core_cookie = max->core_cookie; cookie = rq->core->core_cookie = max->core_cookie; /* /* * For each thread: try and find a runnable task that * For each thread: try and find a runnable task that * force idle. * force idle. */ */ for_each_cpu(i, smt_mask) { for_each_cpu(i, smt_mask) { rq_i = cpu_rq(i); rq_i = cpu_rq(i); p = rq_i->core_pick; p = rq_i->core_pick; if (!cookie_equals(p, cookie)) { if (!cookie_equals(p, cookie)) { p = NULL; p = NULL; if (cookie) if (cookie) p = sched_core_find(rq_i, coo p = sched_core_find(rq_i, coo if (!p) if (!p) p = idle_sched_class.pick_tas p = idle_sched_class.pick_tas } } rq_i->core_pick = p; rq_i->core_pick = p; if (p == rq_i->idle) { if (p == rq_i->idle) { if (rq_i->nr_running) { if (rq_i->nr_running) { rq->core->core_forceidle_coun rq->core->core_forceidle_coun if (!fi_before) if (!fi_before) rq->core->core_forcei rq->core->core_forcei } } } else { } else { occ++; occ++; } } } } if (schedstat_enabled() && rq->core->core_forceidle_c if (schedstat_enabled() && rq->core->core_forceidle_c rq->core->core_forceidle_start = rq_clock(rq- rq->core->core_forceidle_start = rq_clock(rq- rq->core->core_forceidle_occupation = occ; rq->core->core_forceidle_occupation = occ; } } rq->core->core_pick_seq = rq->core->core_task_seq; rq->core->core_pick_seq = rq->core->core_task_seq; next = rq->core_pick; next = rq->core_pick; rq->core_sched_seq = rq->core->core_pick_seq; rq->core_sched_seq = rq->core->core_pick_seq; /* Something should have been selected for current CP /* Something should have been selected for current CP WARN_ON_ONCE(!next); WARN_ON_ONCE(!next); /* /* * Reschedule siblings * Reschedule siblings * * * NOTE: L1TF -- at this point we're no longer runnin * NOTE: L1TF -- at this point we're no longer runnin * sending an IPI (below) ensures the sibling will no * sending an IPI (below) ensures the sibling will no * their task. This ensures there is no inter-sibling * their task. This ensures there is no inter-sibling * non-matching user state. * non-matching user state. 
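 *
 * In short (a summary, not text from the original source): each sibling
 * whose current task does not match its rq->core_pick gets a
 * resched_curr() below, so it re-enters __schedule() and installs the
 * pick that was selected for it here.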
*/ */ for_each_cpu(i, smt_mask) { for_each_cpu(i, smt_mask) { rq_i = cpu_rq(i); rq_i = cpu_rq(i); /* /* * An online sibling might have gone offline * An online sibling might have gone offline * could be picked for it, or it might be off * could be picked for it, or it might be off * happen to come online, but its too late an * happen to come online, but its too late an * picked for it. That's Ok - it will pick t * picked for it. That's Ok - it will pick t * so ignore it. * so ignore it. */ */ if (!rq_i->core_pick) if (!rq_i->core_pick) continue; continue; /* /* * Update for new !FI->FI transitions, or if * Update for new !FI->FI transitions, or if * fi_before fi update? * fi_before fi update? * 0 0 1 * 0 0 1 * 0 1 1 * 0 1 1 * 1 0 1 * 1 0 1 * 1 1 0 * 1 1 0 */ */ if (!(fi_before && rq->core->core_forceidle_c if (!(fi_before && rq->core->core_forceidle_c task_vruntime_update(rq_i, rq_i->core task_vruntime_update(rq_i, rq_i->core rq_i->core_pick->core_occupation = occ; rq_i->core_pick->core_occupation = occ; if (i == cpu) { if (i == cpu) { rq_i->core_pick = NULL; rq_i->core_pick = NULL; continue; continue; } } /* Did we break L1TF mitigation requirements? /* Did we break L1TF mitigation requirements? WARN_ON_ONCE(!cookie_match(next, rq_i->core_p WARN_ON_ONCE(!cookie_match(next, rq_i->core_p if (rq_i->curr == rq_i->core_pick) { if (rq_i->curr == rq_i->core_pick) { rq_i->core_pick = NULL; rq_i->core_pick = NULL; continue; continue; } } resched_curr(rq_i); resched_curr(rq_i); } } out_set_next: out_set_next: set_next_task(rq, next); set_next_task(rq, next); out: out: if (rq->core->core_forceidle_count && next == rq->idl if (rq->core->core_forceidle_count && next == rq->idl queue_core_balance(rq); queue_core_balance(rq); return next; return next; } } static bool try_steal_cookie(int this, int that) static bool try_steal_cookie(int this, int that) { { struct rq *dst = cpu_rq(this), *src = cpu_rq(that); struct rq *dst = cpu_rq(this), *src = cpu_rq(that); struct task_struct *p; struct task_struct *p; unsigned long cookie; unsigned long cookie; bool success = false; bool success = false; guard(irq)(); guard(irq)(); guard(double_rq_lock)(dst, src); guard(double_rq_lock)(dst, src); cookie = dst->core->core_cookie; cookie = dst->core->core_cookie; if (!cookie) if (!cookie) return false; return false; if (dst->curr != dst->idle) if (dst->curr != dst->idle) return false; return false; p = sched_core_find(src, cookie); p = sched_core_find(src, cookie); if (!p) if (!p) return false; return false; do { do { if (p == src->core_pick || p == src->curr) if (p == src->core_pick || p == src->curr) goto next; goto next; if (!is_cpu_allowed(p, this)) if (!is_cpu_allowed(p, this)) goto next; goto next; if (p->core_occupation > dst->idle->core_occu if (p->core_occupation > dst->idle->core_occu goto next; goto next; /* /* * sched_core_find() and sched_core_next() wi * sched_core_find() and sched_core_next() wi * that task @p is not throttled now, we also * that task @p is not throttled now, we also * check whether the runqueue of the destinat * check whether the runqueue of the destinat * being throttled. * being throttled. 
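 *
 * The steal itself (summarizing the statements just below, not text
 * from the original source) is the usual migration pattern done under
 * both runqueue locks:
 *
 *      deactivate_task(src, p, 0);
 *      set_task_cpu(p, this);
 *      activate_task(dst, p, 0);
 *      resched_curr(dst);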
*/ */ if (sched_task_is_throttled(p, this)) if (sched_task_is_throttled(p, this)) goto next; goto next; deactivate_task(src, p, 0); deactivate_task(src, p, 0); set_task_cpu(p, this); set_task_cpu(p, this); activate_task(dst, p, 0); activate_task(dst, p, 0); resched_curr(dst); resched_curr(dst); success = true; success = true; break; break; next: next: p = sched_core_next(p, cookie); p = sched_core_next(p, cookie); } while (p); } while (p); return success; return success; } } static bool steal_cookie_task(int cpu, struct sched_domain *s static bool steal_cookie_task(int cpu, struct sched_domain *s { { int i; int i; for_each_cpu_wrap(i, sched_domain_span(sd), cpu + 1) for_each_cpu_wrap(i, sched_domain_span(sd), cpu + 1) if (i == cpu) if (i == cpu) continue; continue; if (need_resched()) if (need_resched()) break; break; if (try_steal_cookie(cpu, i)) if (try_steal_cookie(cpu, i)) return true; return true; } } return false; return false; } } static void sched_core_balance(struct rq *rq) static void sched_core_balance(struct rq *rq) { { struct sched_domain *sd; struct sched_domain *sd; int cpu = cpu_of(rq); int cpu = cpu_of(rq); preempt_disable(); preempt_disable(); rcu_read_lock(); rcu_read_lock(); raw_spin_rq_unlock_irq(rq); raw_spin_rq_unlock_irq(rq); for_each_domain(cpu, sd) { for_each_domain(cpu, sd) { if (need_resched()) if (need_resched()) break; break; if (steal_cookie_task(cpu, sd)) if (steal_cookie_task(cpu, sd)) break; break; } } raw_spin_rq_lock_irq(rq); raw_spin_rq_lock_irq(rq); rcu_read_unlock(); rcu_read_unlock(); preempt_enable(); preempt_enable(); } } static DEFINE_PER_CPU(struct balance_callback, core_balance_h static DEFINE_PER_CPU(struct balance_callback, core_balance_h static void queue_core_balance(struct rq *rq) static void queue_core_balance(struct rq *rq) { { if (!sched_core_enabled(rq)) if (!sched_core_enabled(rq)) return; return; if (!rq->core->core_cookie) if (!rq->core->core_cookie) return; return; if (!rq->nr_running) /* not forced idle */ if (!rq->nr_running) /* not forced idle */ return; return; queue_balance_callback(rq, &per_cpu(core_balance_head queue_balance_callback(rq, &per_cpu(core_balance_head } } DEFINE_LOCK_GUARD_1(core_lock, int, DEFINE_LOCK_GUARD_1(core_lock, int, sched_core_lock(*_T->lock, &_T->flags), sched_core_lock(*_T->lock, &_T->flags), sched_core_unlock(*_T->lock, &_T->flags), sched_core_unlock(*_T->lock, &_T->flags), unsigned long flags) unsigned long flags) static void sched_core_cpu_starting(unsigned int cpu) static void sched_core_cpu_starting(unsigned int cpu) { { const struct cpumask *smt_mask = cpu_smt_mask(cpu); const struct cpumask *smt_mask = cpu_smt_mask(cpu); struct rq *rq = cpu_rq(cpu), *core_rq = NULL; struct rq *rq = cpu_rq(cpu), *core_rq = NULL; int t; int t; guard(core_lock)(&cpu); guard(core_lock)(&cpu); WARN_ON_ONCE(rq->core != rq); WARN_ON_ONCE(rq->core != rq); /* if we're the first, we'll be our own leader */ /* if we're the first, we'll be our own leader */ if (cpumask_weight(smt_mask) == 1) if (cpumask_weight(smt_mask) == 1) return; return; /* find the leader */ /* find the leader */ for_each_cpu(t, smt_mask) { for_each_cpu(t, smt_mask) { if (t == cpu) if (t == cpu) continue; continue; rq = cpu_rq(t); rq = cpu_rq(t); if (rq->core == rq) { if (rq->core == rq) { core_rq = rq; core_rq = rq; break; break; } } } } if (WARN_ON_ONCE(!core_rq)) /* whoopsie */ if (WARN_ON_ONCE(!core_rq)) /* whoopsie */ return; return; /* install and validate core_rq */ /* install and validate core_rq */ for_each_cpu(t, smt_mask) { for_each_cpu(t, 
smt_mask) { rq = cpu_rq(t); rq = cpu_rq(t); if (t == cpu) if (t == cpu) rq->core = core_rq; rq->core = core_rq; WARN_ON_ONCE(rq->core != core_rq); WARN_ON_ONCE(rq->core != core_rq); } } } } static void sched_core_cpu_deactivate(unsigned int cpu) static void sched_core_cpu_deactivate(unsigned int cpu) { { const struct cpumask *smt_mask = cpu_smt_mask(cpu); const struct cpumask *smt_mask = cpu_smt_mask(cpu); struct rq *rq = cpu_rq(cpu), *core_rq = NULL; struct rq *rq = cpu_rq(cpu), *core_rq = NULL; int t; int t; guard(core_lock)(&cpu); guard(core_lock)(&cpu); /* if we're the last man standing, nothing to do */ /* if we're the last man standing, nothing to do */ if (cpumask_weight(smt_mask) == 1) { if (cpumask_weight(smt_mask) == 1) { WARN_ON_ONCE(rq->core != rq); WARN_ON_ONCE(rq->core != rq); return; return; } } /* if we're not the leader, nothing to do */ /* if we're not the leader, nothing to do */ if (rq->core != rq) if (rq->core != rq) return; return; /* find a new leader */ /* find a new leader */ for_each_cpu(t, smt_mask) { for_each_cpu(t, smt_mask) { if (t == cpu) if (t == cpu) continue; continue; core_rq = cpu_rq(t); core_rq = cpu_rq(t); break; break; } } if (WARN_ON_ONCE(!core_rq)) /* impossible */ if (WARN_ON_ONCE(!core_rq)) /* impossible */ return; return; /* copy the shared state to the new leader */ /* copy the shared state to the new leader */ core_rq->core_task_seq = rq->core_task_se core_rq->core_task_seq = rq->core_task_se core_rq->core_pick_seq = rq->core_pick_se core_rq->core_pick_seq = rq->core_pick_se core_rq->core_cookie = rq->core_cookie; core_rq->core_cookie = rq->core_cookie; core_rq->core_forceidle_count = rq->core_forceid core_rq->core_forceidle_count = rq->core_forceid core_rq->core_forceidle_seq = rq->core_forceid core_rq->core_forceidle_seq = rq->core_forceid core_rq->core_forceidle_occupation = rq->core_forceid core_rq->core_forceidle_occupation = rq->core_forceid /* /* * Accounting edge for forced idle is handled in pick * Accounting edge for forced idle is handled in pick * Don't need another one here, since the hotplug thr * Don't need another one here, since the hotplug thr * have a cookie. * have a cookie. */ */ core_rq->core_forceidle_start = 0; core_rq->core_forceidle_start = 0; /* install new leader */ /* install new leader */ for_each_cpu(t, smt_mask) { for_each_cpu(t, smt_mask) { rq = cpu_rq(t); rq = cpu_rq(t); rq->core = core_rq; rq->core = core_rq; } } } } static inline void sched_core_cpu_dying(unsigned int cpu) static inline void sched_core_cpu_dying(unsigned int cpu) { { struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu); if (rq->core != rq) if (rq->core != rq) rq->core = rq; rq->core = rq; } } #else /* !CONFIG_SCHED_CORE */ #else /* !CONFIG_SCHED_CORE */ static inline void sched_core_cpu_starting(unsigned int cpu) static inline void sched_core_cpu_starting(unsigned int cpu) static inline void sched_core_cpu_deactivate(unsigned int cpu static inline void sched_core_cpu_deactivate(unsigned int cpu static inline void sched_core_cpu_dying(unsigned int cpu) {} static inline void sched_core_cpu_dying(unsigned int cpu) {} static struct task_struct * static struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev, struc pick_next_task(struct rq *rq, struct task_struct *prev, struc { { return __pick_next_task(rq, prev, rf); return __pick_next_task(rq, prev, rf); } } #endif /* CONFIG_SCHED_CORE */ #endif /* CONFIG_SCHED_CORE */ /* /* * Constants for the sched_mode argument of __schedule(). 
* Constants for the sched_mode argument of __schedule(). * * * The mode argument allows RT enabled kernels to differentia * The mode argument allows RT enabled kernels to differentia * preemption from blocking on an 'sleeping' spin/rwlock. Not * preemption from blocking on an 'sleeping' spin/rwlock. Not * SM_MASK_PREEMPT for !RT has all bits set, which allows the * SM_MASK_PREEMPT for !RT has all bits set, which allows the * optimize the AND operation out and just check for zero. * optimize the AND operation out and just check for zero. */ */ #define SM_NONE 0x0 #define SM_NONE 0x0 #define SM_PREEMPT 0x1 #define SM_PREEMPT 0x1 #define SM_RTLOCK_WAIT 0x2 #define SM_RTLOCK_WAIT 0x2 #ifndef CONFIG_PREEMPT_RT #ifndef CONFIG_PREEMPT_RT # define SM_MASK_PREEMPT (~0U) # define SM_MASK_PREEMPT (~0U) #else #else # define SM_MASK_PREEMPT SM_PREEMPT # define SM_MASK_PREEMPT SM_PREEMPT #endif #endif /* /* * __schedule() is the main scheduler function. * __schedule() is the main scheduler function. * * * The main means of driving the scheduler and thus entering * The main means of driving the scheduler and thus entering * * * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. * * * 2. TIF_NEED_RESCHED flag is checked on interrupt and use * 2. TIF_NEED_RESCHED flag is checked on interrupt and use * paths. For example, see arch/x86/entry_64.S. * paths. For example, see arch/x86/entry_64.S. * * * To drive preemption between tasks, the scheduler sets * To drive preemption between tasks, the scheduler sets * interrupt handler scheduler_tick(). * interrupt handler scheduler_tick(). * * * 3. Wakeups don't really cause entry into schedule(). The * 3. Wakeups don't really cause entry into schedule(). The * task to the run-queue and that's it. * task to the run-queue and that's it. * * * Now, if the new task added to the run-queue preempts * Now, if the new task added to the run-queue preempts * task, then the wakeup sets TIF_NEED_RESCHED and sched * task, then the wakeup sets TIF_NEED_RESCHED and sched * called on the nearest possible occasion: * called on the nearest possible occasion: * * * - If the kernel is preemptible (CONFIG_PREEMPTION=y) * - If the kernel is preemptible (CONFIG_PREEMPTION=y) * * * - in syscall or exception context, at the next out * - in syscall or exception context, at the next out * preempt_enable(). (this might be as soon as the * preempt_enable(). (this might be as soon as the * spin_unlock()!) * spin_unlock()!) * * * - in IRQ context, return from interrupt-handler to * - in IRQ context, return from interrupt-handler to * preemptible context * preemptible context * * * - If the kernel is not preemptible (CONFIG_PREEMPTIO * - If the kernel is not preemptible (CONFIG_PREEMPTIO * then at the next: * then at the next: * * * - cond_resched() call * - cond_resched() call * - explicit schedule() call * - explicit schedule() call * - return from syscall or exception to user-space * - return from syscall or exception to user-space * - return from interrupt-handler to user-space * - return from interrupt-handler to user-space * * * WARNING: must be called with preemption disabled! * WARNING: must be called with preemption disabled! 
*/ */ static void __sched notrace __schedule(unsigned int sched_mod static void __sched notrace __schedule(unsigned int sched_mod { { struct task_struct *prev, *next; struct task_struct *prev, *next; unsigned long *switch_count; unsigned long *switch_count; unsigned long prev_state; unsigned long prev_state; struct rq_flags rf; struct rq_flags rf; struct rq *rq; struct rq *rq; int cpu; int cpu; cpu = smp_processor_id(); cpu = smp_processor_id(); rq = cpu_rq(cpu); rq = cpu_rq(cpu); prev = rq->curr; prev = rq->curr; schedule_debug(prev, !!sched_mode); schedule_debug(prev, !!sched_mode); if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) hrtick_clear(rq); hrtick_clear(rq); local_irq_disable(); local_irq_disable(); rcu_note_context_switch(!!sched_mode); rcu_note_context_switch(!!sched_mode); /* /* * Make sure that signal_pending_state()->signal_pend * Make sure that signal_pending_state()->signal_pend * can't be reordered with __set_current_state(TASK_I * can't be reordered with __set_current_state(TASK_I * done by the caller to avoid the race with signal_w * done by the caller to avoid the race with signal_w * * * __set_current_state(@state) signal_wake_u * __set_current_state(@state) signal_wake_u * schedule() set_tsk_thr * schedule() set_tsk_thr * wake_up_sta * wake_up_sta * LOCK rq->lock LOCK p->p * LOCK rq->lock LOCK p->p * smp_mb__after_spinlock() smp_mb__a * smp_mb__after_spinlock() smp_mb__a * if (signal_pending_state()) if (p->st * if (signal_pending_state()) if (p->st * * * Also, the membarrier system call requires a full m * Also, the membarrier system call requires a full m * after coming from user-space, before storing to rq * after coming from user-space, before storing to rq */ */ rq_lock(rq, &rf); rq_lock(rq, &rf); smp_mb__after_spinlock(); smp_mb__after_spinlock(); /* Promote REQ to ACT */ /* Promote REQ to ACT */ rq->clock_update_flags <<= 1; rq->clock_update_flags <<= 1; update_rq_clock(rq); update_rq_clock(rq); rq->clock_update_flags = RQCF_UPDATED; rq->clock_update_flags = RQCF_UPDATED; switch_count = &prev->nivcsw; switch_count = &prev->nivcsw; /* /* * We must load prev->state once (task_struct::state * We must load prev->state once (task_struct::state * that we form a control dependency vs deactivate_ta * that we form a control dependency vs deactivate_ta */ */ prev_state = READ_ONCE(prev->__state); prev_state = READ_ONCE(prev->__state); if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) { if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) { if (signal_pending_state(prev_state, prev)) { if (signal_pending_state(prev_state, prev)) { WRITE_ONCE(prev->__state, TASK_RUNNIN WRITE_ONCE(prev->__state, TASK_RUNNIN } else { } else { prev->sched_contributes_to_load = prev->sched_contributes_to_load = (prev_state & TASK_UNINTERRUP (prev_state & TASK_UNINTERRUP !(prev_state & TASK_NOLOAD) & !(prev_state & TASK_NOLOAD) & !(prev_state & TASK_FROZEN); !(prev_state & TASK_FROZEN); if (prev->sched_contributes_to_load) if (prev->sched_contributes_to_load) rq->nr_uninterruptible++; rq->nr_uninterruptible++; /* /* * __schedule() ttwu( * __schedule() ttwu( * prev_state = prev->state; if * prev_state = prev->state; if * if (prev_state) g * if (prev_state) g * p->on_rq = 0; smp * p->on_rq = 0; smp * p-> * p-> * * * Where __schedule() and ttwu() have * Where __schedule() and ttwu() have * * * After this, schedule() must not ca * After this, schedule() must not ca */ */ deactivate_task(rq, prev, DEQUEUE_SLE deactivate_task(rq, prev, DEQUEUE_SLE if 
(prev->in_iowait) {
				atomic_inc(&rq->nr_iowait);
				delayacct_blkio_start();
			}
		}
		switch_count = &prev->nvcsw;
	}

	next = pick_next_task(rq, prev, &rf);
	clear_tsk_need_resched(prev);
	clear_preempt_need_resched();
#ifdef CONFIG_SCHED_DEBUG
	rq->last_seen_need_resched_ns = 0;
#endif

	if (likely(prev != next)) {
		rq->nr_switches++;
		/*
		 * RCU users of rcu_dereference(rq->curr) may not see
		 * changes to task_struct made by pick_next_task().
		 */
		RCU_INIT_POINTER(rq->curr, next);
		/*
		 * The membarrier system call requires each architecture
		 * to have a full memory barrier after updating
		 * rq->curr, before returning to user-space.
		 *
		 * Here are the schemes providing that barrier on the
		 * various architectures:
		 * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC,
		 *   switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
		 * - finish_lock_switch() for weakly-ordered
		 *   architectures where spin_unlock is a full barrier,
		 * - switch_to() for arm64 (weakly-ordered, spin_unlock
		 *   is a RELEASE barrier),
		 */
		++*switch_count;

		migrate_disable_switch(rq, prev);
		psi_sched_switch(prev, next, !task_on_rq_queued(prev));

		trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);

		/* Also unlocks the rq: */
		rq = context_switch(rq, prev, next, &rf);
	} else {
		rq_unpin_lock(rq, &rf);
		__balance_callbacks(rq);
		raw_spin_rq_unlock_irq(rq);
	}
}

void __noreturn do_task_dead(void)
{
	/* Causes final put_task_struct in finish_task_switch(): */
	set_special_state(TASK_DEAD);

	/* Tell freezer to ignore us: */
	current->flags |= PF_NOFREEZE;

	__schedule(SM_NONE);
	BUG();

	/* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
	for (;;)
		cpu_relax();
}

static inline void sched_submit_work(struct task_struct *tsk)
{
	static DEFINE_WAIT_OVERRIDE_MAP(sched_map, LD_WAIT_CONFIG);
	unsigned int task_flags;

	/*
	 * Establish LD_WAIT_CONFIG context to ensure none of the code called
	 * will use a blocking primitive -- which would lead to immediate
	 * deadlock.
	 */
	lock_map_acquire_try(&sched_map);

	task_flags = tsk->flags;
	/*
	 * If a worker goes to sleep, notify and ask workqueue whether it
	 * wants to wake up a task to maintain concurrency.
	 */
	if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
		if (task_flags & PF_WQ_WORKER)
			wq_worker_sleeping(tsk);
		else
			io_wq_worker_sleeping(tsk);
	}

	/*
	 * spinlock and rwlock must not flush block requests. This would
	 * deadlock if the callback attempts to acquire a lock which is
	 * already acquired.
	 */
	SCHED_WARN_ON(current->__state & TASK_RTLOCK_WAIT);

	/*
	 * If we are going to sleep and we have plugged IO queued,
	 * make sure to submit it to avoid deadlocks.
	 */
	blk_flush_plug(tsk->plug, true);

	lock_map_release(&sched_map);
}

static void sched_update_worker(struct task_struct *tsk)
{
	if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
		if (tsk->flags & PF_WQ_WORKER)
			wq_worker_running(tsk);
		else
			io_wq_worker_running(tsk);
	}
}

static __always_inline void __schedule_loop(unsigned int sched_mode)
{
	do {
		preempt_disable();
		__schedule(sched_mode);
		sched_preempt_enable_no_resched();
	} while (need_resched());
}

asmlinkage __visible void __sched schedule(void)
{
	struct task_struct *tsk = current;

#ifdef CONFIG_RT_MUTEXES
	lockdep_assert(!tsk->sched_rt_mutex);
#endif

	if (!task_is_running(tsk))
		sched_submit_work(tsk);
	__schedule_loop(SM_NONE);
	sched_update_worker(tsk);
}
EXPORT_SYMBOL(schedule);
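/*
 * Illustrative sketch, not part of core.c: the "explicit blocking" entry into
 * schedule() described in the __schedule() comment above almost always takes
 * the shape of the classic wait loop below. The wait_for_my_event() and
 * my_event_ready() names are made up for this example; real code would
 * normally use wait_event(), completions or similar helpers that wrap this
 * pattern. Setting the task state *before* testing the condition is what
 * closes the missed-wakeup race against try_to_wake_up() discussed in
 * __schedule().
 */
#if 0	/* example only -- assumes a hypothetical my_event_ready() predicate */
static void wait_for_my_event(void)
{
	for (;;) {
		/* Publish the sleeping state before checking the condition. */
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (my_event_ready())
			break;
		/* Blocks here; the waker's wake_up() makes us runnable again. */
		schedule();
	}
	__set_current_state(TASK_RUNNING);
}
#endif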
/*
 * synchronize_rcu_tasks() makes sure that no task is stuck in preempted
 * state (have scheduled out non-voluntarily) by making sure that all
 * tasks have either left the run queue or have gone into user space.
 * As idle tasks do not do either, they must not ever be preempted
 * (schedule out non-voluntarily).
 *
 * schedule_idle() is similar to schedule_preempt_disable() except that it
 * never enables preemption because it does not call sched_submit_work().
 */
void __sched schedule_idle(void)
{
	/*
	 * As this skips calling sched_submit_work(), which the idle task does
	 * regardless because that function is a nop when the task is in a
	 * TASK_RUNNING state, make sure this isn't used someplace that the
	 * current task can be in any other state. Note, idle is always in the
	 * TASK_RUNNING state.
	 */
	WARN_ON_ONCE(current->__state);
	do {
		__schedule(SM_NONE);
	} while (need_resched());
}

#if defined(CONFIG_CONTEXT_TRACKING_USER) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_USER_OFFSTACK)
asmlinkage __visible void __sched schedule_user(void)
{
	/*
	 * If we come here after a random call to set_need_resched(),
	 * or we have been woken up remotely but the IPI has not yet arrived,
	 * we haven't yet exited the RCU idle mode. Do it here manually until
	 * we find a better solution.
	 *
	 * NB: There are buggy callers of this function. Ideally we
	 * should warn if prev_state != CONTEXT_USER, but that will trigger
	 * too frequently to make sense yet.
	 */
	enum ctx_state prev_state = exception_enter();
	schedule();
	exception_exit(prev_state);
}
#endif

/**
 * schedule_preempt_disabled - called with preemption disabled
 *
 * Returns with preemption disabled. Note: preempt_count must be 1
 */
void __sched schedule_preempt_disabled(void)
{
	sched_preempt_enable_no_resched();
	schedule();
	preempt_disable();
}

#ifdef CONFIG_PREEMPT_RT
void __sched notrace schedule_rtlock(void)
{
	__schedule_loop(SM_RTLOCK_WAIT);
}
NOKPROBE_SYMBOL(schedule_rtlock);
#endif

static void __sched notrace preempt_schedule_common(void)
{
	do {
		/*
		 * Because the function tracer can trace preempt_count_sub()
		 * and it also uses preempt_enable/disable_notrace(), if
		 * NEED_RESCHED is set, the preempt_enable_notrace() called
		 * by the function tracer will call this function again and
		 * cause infinite recursion.
		 *
		 * Preemption must be disabled here before the function
		 * tracer can trace. Break up preempt_disable() into two
		 * calls. One to disable preemption without fear of being
		 * traced.
The other to still record the pree * which can also be traced by the function t * which can also be traced by the function t */ */ preempt_disable_notrace(); preempt_disable_notrace(); preempt_latency_start(1); preempt_latency_start(1); __schedule(SM_PREEMPT); __schedule(SM_PREEMPT); preempt_latency_stop(1); preempt_latency_stop(1); preempt_enable_no_resched_notrace(); preempt_enable_no_resched_notrace(); /* /* * Check again in case we missed a preemption * Check again in case we missed a preemption * between schedule and now. * between schedule and now. */ */ } while (need_resched()); } while (need_resched()); } } #ifdef CONFIG_PREEMPTION #ifdef CONFIG_PREEMPTION /* /* * This is the entry point to schedule() from in-kernel preem * This is the entry point to schedule() from in-kernel preem * off of preempt_enable. * off of preempt_enable. */ */ asmlinkage __visible void __sched notrace preempt_schedule(vo asmlinkage __visible void __sched notrace preempt_schedule(vo { { /* /* * If there is a non-zero preempt_count or interrupts * If there is a non-zero preempt_count or interrupts * we do not want to preempt the current task. Just r * we do not want to preempt the current task. Just r */ */ if (likely(!preemptible())) if (likely(!preemptible())) return; return; preempt_schedule_common(); preempt_schedule_common(); } } NOKPROBE_SYMBOL(preempt_schedule); NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); #ifdef CONFIG_PREEMPT_DYNAMIC #ifdef CONFIG_PREEMPT_DYNAMIC #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) #ifndef preempt_schedule_dynamic_enabled #ifndef preempt_schedule_dynamic_enabled #define preempt_schedule_dynamic_enabled preempt_sched #define preempt_schedule_dynamic_enabled preempt_sched #define preempt_schedule_dynamic_disabled NULL #define preempt_schedule_dynamic_disabled NULL #endif #endif DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic EXPORT_STATIC_CALL_TRAMP(preempt_schedule); EXPORT_STATIC_CALL_TRAMP(preempt_schedule); #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule); static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule); void __sched notrace dynamic_preempt_schedule(void) void __sched notrace dynamic_preempt_schedule(void) { { if (!static_branch_unlikely(&sk_dynamic_preempt_sched if (!static_branch_unlikely(&sk_dynamic_preempt_sched return; return; preempt_schedule(); preempt_schedule(); } } NOKPROBE_SYMBOL(dynamic_preempt_schedule); NOKPROBE_SYMBOL(dynamic_preempt_schedule); EXPORT_SYMBOL(dynamic_preempt_schedule); EXPORT_SYMBOL(dynamic_preempt_schedule); #endif #endif #endif #endif /** /** * preempt_schedule_notrace - preempt_schedule called by trac * preempt_schedule_notrace - preempt_schedule called by trac * * * The tracing infrastructure uses preempt_enable_notrace to * The tracing infrastructure uses preempt_enable_notrace to * recursion and tracing preempt enabling caused by the traci * recursion and tracing preempt enabling caused by the traci * infrastructure itself. But as tracing can happen in areas * infrastructure itself. But as tracing can happen in areas * from userspace or just about to enter userspace, a preempt * from userspace or just about to enter userspace, a preempt * can occur before user_exit() is called. This will cause th * can occur before user_exit() is called. 
This will cause th * to be called when the system is still in usermode. * to be called when the system is still in usermode. * * * To prevent this, the preempt_enable_notrace will use this * To prevent this, the preempt_enable_notrace will use this * instead of preempt_schedule() to exit user context if need * instead of preempt_schedule() to exit user context if need * calling the scheduler. * calling the scheduler. */ */ asmlinkage __visible void __sched notrace preempt_schedule_no asmlinkage __visible void __sched notrace preempt_schedule_no { { enum ctx_state prev_ctx; enum ctx_state prev_ctx; if (likely(!preemptible())) if (likely(!preemptible())) return; return; do { do { /* /* * Because the function tracer can trace pree * Because the function tracer can trace pree * and it also uses preempt_enable/disable_no * and it also uses preempt_enable/disable_no * NEED_RESCHED is set, the preempt_enable_no * NEED_RESCHED is set, the preempt_enable_no * by the function tracer will call this func * by the function tracer will call this func * cause infinite recursion. * cause infinite recursion. * * * Preemption must be disabled here before th * Preemption must be disabled here before th * tracer can trace. Break up preempt_disable * tracer can trace. Break up preempt_disable * calls. One to disable preemption without f * calls. One to disable preemption without f * traced. The other to still record the pree * traced. The other to still record the pree * which can also be traced by the function t * which can also be traced by the function t */ */ preempt_disable_notrace(); preempt_disable_notrace(); preempt_latency_start(1); preempt_latency_start(1); /* /* * Needs preempt disabled in case user_exit() * Needs preempt disabled in case user_exit() * and the tracer calls preempt_enable_notrac * and the tracer calls preempt_enable_notrac * an infinite recursion. * an infinite recursion. 
*/ */ prev_ctx = exception_enter(); prev_ctx = exception_enter(); __schedule(SM_PREEMPT); __schedule(SM_PREEMPT); exception_exit(prev_ctx); exception_exit(prev_ctx); preempt_latency_stop(1); preempt_latency_stop(1); preempt_enable_no_resched_notrace(); preempt_enable_no_resched_notrace(); } while (need_resched()); } while (need_resched()); } } EXPORT_SYMBOL_GPL(preempt_schedule_notrace); EXPORT_SYMBOL_GPL(preempt_schedule_notrace); #ifdef CONFIG_PREEMPT_DYNAMIC #ifdef CONFIG_PREEMPT_DYNAMIC #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) #ifndef preempt_schedule_notrace_dynamic_enabled #ifndef preempt_schedule_notrace_dynamic_enabled #define preempt_schedule_notrace_dynamic_enabled preem #define preempt_schedule_notrace_dynamic_enabled preem #define preempt_schedule_notrace_dynamic_disabled NULL #define preempt_schedule_notrace_dynamic_disabled NULL #endif #endif DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace); EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace); #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_not static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_not void __sched notrace dynamic_preempt_schedule_notrace(void) void __sched notrace dynamic_preempt_schedule_notrace(void) { { if (!static_branch_unlikely(&sk_dynamic_preempt_sched if (!static_branch_unlikely(&sk_dynamic_preempt_sched return; return; preempt_schedule_notrace(); preempt_schedule_notrace(); } } NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace); NOKPROBE_SYMBOL(dynamic_preempt_schedule_notrace); EXPORT_SYMBOL(dynamic_preempt_schedule_notrace); EXPORT_SYMBOL(dynamic_preempt_schedule_notrace); #endif #endif #endif #endif #endif /* CONFIG_PREEMPTION */ #endif /* CONFIG_PREEMPTION */ /* /* * This is the entry point to schedule() from kernel preempti * This is the entry point to schedule() from kernel preempti * off of irq context. * off of irq context. * Note, that this is called and return with irqs disabled. T * Note, that this is called and return with irqs disabled. T * protect us against recursive calling from irq. * protect us against recursive calling from irq. 
 */
asmlinkage __visible void __sched preempt_schedule_irq(void)
{
	enum ctx_state prev_state;

	/* Catch callers which need to be fixed */
	BUG_ON(preempt_count() || !irqs_disabled());

	prev_state = exception_enter();

	do {
		preempt_disable();
		local_irq_enable();
		__schedule(SM_PREEMPT);
		local_irq_disable();
		sched_preempt_enable_no_resched();
	} while (need_resched());

	exception_exit(prev_state);
}

int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
			  void *key)
{
	WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~(WF_SYNC | WF_CURRENT_CPU));
	return try_to_wake_up(curr->private, mode, wake_flags);
}
EXPORT_SYMBOL(default_wake_function);

static void __setscheduler_prio(struct task_struct *p, int prio)
{
	if (dl_prio(prio))
		p->sched_class = &dl_sched_class;
	else if (rt_prio(prio))
		p->sched_class = &rt_sched_class;
	else
		p->sched_class = &fair_sched_class;

	p->prio = prio;
}

#ifdef CONFIG_RT_MUTEXES

/*
 * Would be more useful with typeof()/auto_type but they don't mix with
 * bit-fields. Since it's a local thing, use int. Keep the generic sounding
 * name such that if someone were to implement this function we get to
 * compare notes.
 */
#define fetch_and_set(x, v) ({ int _x = (x); (x) = (v); _x; })

void rt_mutex_pre_schedule(void)
{
	lockdep_assert(!fetch_and_set(current->sched_rt_mutex, 1));
	sched_submit_work(current);
}

void rt_mutex_schedule(void)
{
	lockdep_assert(current->sched_rt_mutex);
	__schedule_loop(SM_NONE);
}

void rt_mutex_post_schedule(void)
{
	sched_update_worker(current);
	lockdep_assert(fetch_and_set(current->sched_rt_mutex, 0));
}

static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
{
	if (pi_task)
		prio = min(prio, pi_task->prio);

	return prio;
}

static inline int rt_effective_prio(struct task_struct *p, int prio)
{
	struct task_struct *pi_task = rt_mutex_get_top_task(p);

	return __rt_effective_prio(pi_task, prio);
}
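/*
 * Illustrative sketch, not part of core.c: with the kernel's "lower number ==
 * higher priority" convention, __rt_effective_prio() simply clamps the lock
 * owner's priority to that of the highest-priority donor. For instance, a CFS
 * task at normal_prio 120 that blocks a SCHED_FIFO waiter running at prio 20
 * ends up with an effective prio of 20. The pi_boost_demo() name is made up
 * for this example.
 */
#if 0	/* example only */
static void pi_boost_demo(struct task_struct *owner, struct task_struct *waiter)
{
	/* Assume owner->normal_prio == 120 (CFS) and waiter->prio == 20 (FIFO). */
	int eff = __rt_effective_prio(waiter, owner->normal_prio);

	/* min(120, 20): the owner is boosted to the donor's priority. */
	WARN_ON(eff != 20);
}
#endif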
/*
 * rt_mutex_setprio - set the current priority of a task
 * @p: task to boost
 * @pi_task: donor task
 *
 * This function changes the 'effective' priority of a task. It does
 * not touch ->normal_prio like __setscheduler().
 *
 * Used by the rt_mutex code to implement priority inheritance
 * logic. Call site only calls if the priority of the task changed.
 */
void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
{
	int prio, oldprio, queued, running, queue_flag =
		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
	const struct sched_class *prev_class;
	struct rq_flags rf;
	struct rq *rq;

	/* XXX used to be waiter->prio, not waiter->task->prio */
	prio = __rt_effective_prio(pi_task, p->normal_prio);

	/*
	 * If nothing changed; bail early.
	 */
	if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
		return;

	rq = __task_rq_lock(p, &rf);
	update_rq_clock(rq);
	/*
	 * Set under pi_lock && rq->lock, such that the value can be used under
	 * either lock.
	 *
	 * Note that there is loads of tricky to make this pointer cache work
	 * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
	 * ensure a task is de-boosted (pi_task is set to NULL) before the
	 * task is allowed to run again (and can exit). This ensures the pointer
	 * points to a blocked task -- which guarantees the task is present.
	 */
	p->pi_top_task = pi_task;

	/*
	 * For FIFO/RR we only need to set prio, if that matches we're done.
	 */
	if (prio == p->prio && !dl_prio(prio))
		goto out_unlock;

	/*
	 * Idle task boosting is a no-no in general. There is one
	 * exception, when PREEMPT_RT and NOHZ is active:
	 *
	 * The idle task calls get_next_timer_interrupt() and holds
	 * the timer wheel base->lock on the CPU and another CPU wants
	 * to access the timer (probably to cancel it). We can safely
	 * ignore the boosting request, as the idle CPU runs this code
	 * with interrupts disabled and will complete the lock
	 * protected section without being interrupted. So there is no
	 * real need to boost.
*/ */ if (unlikely(p == rq->idle)) { if (unlikely(p == rq->idle)) { WARN_ON(p != rq->curr); WARN_ON(p != rq->curr); WARN_ON(p->pi_blocked_on); WARN_ON(p->pi_blocked_on); goto out_unlock; goto out_unlock; } } trace_sched_pi_setprio(p, pi_task); trace_sched_pi_setprio(p, pi_task); oldprio = p->prio; oldprio = p->prio; if (oldprio == prio) if (oldprio == prio) queue_flag &= ~DEQUEUE_MOVE; queue_flag &= ~DEQUEUE_MOVE; prev_class = p->sched_class; prev_class = p->sched_class; queued = task_on_rq_queued(p); queued = task_on_rq_queued(p); running = task_current(rq, p); running = task_current(rq, p); if (queued) if (queued) dequeue_task(rq, p, queue_flag); dequeue_task(rq, p, queue_flag); if (running) if (running) put_prev_task(rq, p); put_prev_task(rq, p); /* /* * Boosting condition are: * Boosting condition are: * 1. -rt task is running and holds mutex A * 1. -rt task is running and holds mutex A * --> -dl task blocks on mutex A * --> -dl task blocks on mutex A * * * 2. -dl task is running and holds mutex A * 2. -dl task is running and holds mutex A * --> -dl task blocks on mutex A and could pree * --> -dl task blocks on mutex A and could pree * running task * running task */ */ if (dl_prio(prio)) { if (dl_prio(prio)) { if (!dl_prio(p->normal_prio) || if (!dl_prio(p->normal_prio) || (pi_task && dl_prio(pi_task->prio) && (pi_task && dl_prio(pi_task->prio) && dl_entity_preempt(&pi_task->dl, &p->dl)) dl_entity_preempt(&pi_task->dl, &p->dl)) p->dl.pi_se = pi_task->dl.pi_se; p->dl.pi_se = pi_task->dl.pi_se; queue_flag |= ENQUEUE_REPLENISH; queue_flag |= ENQUEUE_REPLENISH; } else { } else { p->dl.pi_se = &p->dl; p->dl.pi_se = &p->dl; } } } else if (rt_prio(prio)) { } else if (rt_prio(prio)) { if (dl_prio(oldprio)) if (dl_prio(oldprio)) p->dl.pi_se = &p->dl; p->dl.pi_se = &p->dl; if (oldprio < prio) if (oldprio < prio) queue_flag |= ENQUEUE_HEAD; queue_flag |= ENQUEUE_HEAD; } else { } else { if (dl_prio(oldprio)) if (dl_prio(oldprio)) p->dl.pi_se = &p->dl; p->dl.pi_se = &p->dl; if (rt_prio(oldprio)) if (rt_prio(oldprio)) p->rt.timeout = 0; p->rt.timeout = 0; } } __setscheduler_prio(p, prio); __setscheduler_prio(p, prio); if (queued) if (queued) enqueue_task(rq, p, queue_flag); enqueue_task(rq, p, queue_flag); if (running) if (running) set_next_task(rq, p); set_next_task(rq, p); check_class_changed(rq, p, prev_class, oldprio); check_class_changed(rq, p, prev_class, oldprio); out_unlock: out_unlock: /* Avoid rq from going away on us: */ /* Avoid rq from going away on us: */ preempt_disable(); preempt_disable(); rq_unpin_lock(rq, &rf); rq_unpin_lock(rq, &rf); __balance_callbacks(rq); __balance_callbacks(rq); raw_spin_rq_unlock(rq); raw_spin_rq_unlock(rq); preempt_enable(); preempt_enable(); } } #else #else static inline int rt_effective_prio(struct task_struct *p, in static inline int rt_effective_prio(struct task_struct *p, in { { return prio; return prio; } } #endif #endif void set_user_nice(struct task_struct *p, long nice) void set_user_nice(struct task_struct *p, long nice) { { bool queued, running; bool queued, running; int old_prio; int old_prio; struct rq_flags rf; struct rq_flags rf; struct rq *rq; struct rq *rq; if (task_nice(p) == nice || nice < MIN_NICE || nice > if (task_nice(p) == nice || nice < MIN_NICE || nice > return; return; /* /* * We have to be careful, if called from sys_setprior * We have to be careful, if called from sys_setprior * the task might be in the middle of scheduling on a * the task might be in the middle of scheduling on a */ */ rq = task_rq_lock(p, &rf); rq = 
task_rq_lock(p, &rf); update_rq_clock(rq); update_rq_clock(rq); /* /* * The RT priorities are set via sched_setscheduler() * The RT priorities are set via sched_setscheduler() * allow the 'normal' nice value to be set - but as e * allow the 'normal' nice value to be set - but as e * it won't have any effect on scheduling until the t * it won't have any effect on scheduling until the t * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: */ */ if (task_has_dl_policy(p) || task_has_rt_policy(p)) { if (task_has_dl_policy(p) || task_has_rt_policy(p)) { p->static_prio = NICE_TO_PRIO(nice); p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; goto out_unlock; } } queued = task_on_rq_queued(p); queued = task_on_rq_queued(p); running = task_current(rq, p); running = task_current(rq, p); if (queued) if (queued) dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NO dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NO if (running) if (running) put_prev_task(rq, p); put_prev_task(rq, p); p->static_prio = NICE_TO_PRIO(nice); p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p, true); set_load_weight(p, true); old_prio = p->prio; old_prio = p->prio; p->prio = effective_prio(p); p->prio = effective_prio(p); if (queued) if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE if (running) if (running) set_next_task(rq, p); set_next_task(rq, p); /* /* * If the task increased its priority or is running a * If the task increased its priority or is running a * lowered its priority, then reschedule its CPU: * lowered its priority, then reschedule its CPU: */ */ p->sched_class->prio_changed(rq, p, old_prio); p->sched_class->prio_changed(rq, p, old_prio); out_unlock: out_unlock: task_rq_unlock(rq, p, &rf); task_rq_unlock(rq, p, &rf); } } EXPORT_SYMBOL(set_user_nice); EXPORT_SYMBOL(set_user_nice); /* /* * is_nice_reduction - check if nice value is an actual reduc * is_nice_reduction - check if nice value is an actual reduc * * * Similar to can_nice() but does not perform a capability ch * Similar to can_nice() but does not perform a capability ch * * * @p: task * @p: task * @nice: nice value * @nice: nice value */ */ static bool is_nice_reduction(const struct task_struct *p, co static bool is_nice_reduction(const struct task_struct *p, co { { /* Convert nice value [19,-20] to rlimit style value /* Convert nice value [19,-20] to rlimit style value int nice_rlim = nice_to_rlimit(nice); int nice_rlim = nice_to_rlimit(nice); return (nice_rlim <= task_rlimit(p, RLIMIT_NICE)); return (nice_rlim <= task_rlimit(p, RLIMIT_NICE)); } } /* /* * can_nice - check if a task can reduce its nice value * can_nice - check if a task can reduce its nice value * @p: task * @p: task * @nice: nice value * @nice: nice value */ */ int can_nice(const struct task_struct *p, const int nice) int can_nice(const struct task_struct *p, const int nice) { { return is_nice_reduction(p, nice) || capable(CAP_SYS_ return is_nice_reduction(p, nice) || capable(CAP_SYS_ } } #ifdef __ARCH_WANT_SYS_NICE #ifdef __ARCH_WANT_SYS_NICE /* /* * sys_nice - change the priority of the current process. * sys_nice - change the priority of the current process. * @increment: priority increment * @increment: priority increment * * * sys_setpriority is a more generic, but much slower functio * sys_setpriority is a more generic, but much slower functio * does similar things. * does similar things. 
*/ */ SYSCALL_DEFINE1(nice, int, increment) SYSCALL_DEFINE1(nice, int, increment) { { long nice, retval; long nice, retval; /* /* * Setpriority might change our priority at the same * Setpriority might change our priority at the same * We don't have to worry. Conceptually one call occu * We don't have to worry. Conceptually one call occu * and we have a single winner. * and we have a single winner. */ */ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH) increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH) nice = task_nice(current) + increment; nice = task_nice(current) + increment; nice = clamp_val(nice, MIN_NICE, MAX_NICE); nice = clamp_val(nice, MIN_NICE, MAX_NICE); if (increment < 0 && !can_nice(current, nice)) if (increment < 0 && !can_nice(current, nice)) return -EPERM; return -EPERM; retval = security_task_setnice(current, nice); retval = security_task_setnice(current, nice); if (retval) if (retval) return retval; return retval; set_user_nice(current, nice); set_user_nice(current, nice); return 0; return 0; } } #endif #endif /** /** * task_prio - return the priority value of a given task. * task_prio - return the priority value of a given task. * @p: the task in question. * @p: the task in question. * * * Return: The priority value as seen by users in /proc. * Return: The priority value as seen by users in /proc. * * * sched policy return value kernel prio user pr * sched policy return value kernel prio user pr * * * normal, batch, idle [0 ... 39] [100 ... 139] * normal, batch, idle [0 ... 39] [100 ... 139] * fifo, rr [-2 ... -100] [98 ... 0] [1 ... * fifo, rr [-2 ... -100] [98 ... 0] [1 ... * deadline -101 -1 * deadline -101 -1 */ */ int task_prio(const struct task_struct *p) int task_prio(const struct task_struct *p) { { return p->prio - MAX_RT_PRIO; return p->prio - MAX_RT_PRIO; } } /** /** * idle_cpu - is a given CPU idle currently? * idle_cpu - is a given CPU idle currently? * @cpu: the processor in question. * @cpu: the processor in question. * * * Return: 1 if the CPU is currently idle. 0 otherwise. * Return: 1 if the CPU is currently idle. 0 otherwise. */ */ int idle_cpu(int cpu) int idle_cpu(int cpu) { { struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu); if (rq->curr != rq->idle) if (rq->curr != rq->idle) return 0; return 0; if (rq->nr_running) if (rq->nr_running) return 0; return 0; #ifdef CONFIG_SMP #ifdef CONFIG_SMP if (rq->ttwu_pending) if (rq->ttwu_pending) return 0; return 0; #endif #endif return 1; return 1; } } /** /** * available_idle_cpu - is a given CPU idle for enqueuing wor * available_idle_cpu - is a given CPU idle for enqueuing wor * @cpu: the CPU in question. * @cpu: the CPU in question. * * * Return: 1 if the CPU is currently idle. 0 otherwise. * Return: 1 if the CPU is currently idle. 0 otherwise. */ */ int available_idle_cpu(int cpu) int available_idle_cpu(int cpu) { { if (!idle_cpu(cpu)) if (!idle_cpu(cpu)) return 0; return 0; if (vcpu_is_preempted(cpu)) if (vcpu_is_preempted(cpu)) return 0; return 0; return 1; return 1; } } /** /** * idle_task - return the idle task for a given CPU. * idle_task - return the idle task for a given CPU. * @cpu: the processor in question. * @cpu: the processor in question. * * * Return: The idle task for the CPU @cpu. * Return: The idle task for the CPU @cpu. 
*/ */ struct task_struct *idle_task(int cpu) struct task_struct *idle_task(int cpu) { { return cpu_rq(cpu)->idle; return cpu_rq(cpu)->idle; } } #ifdef CONFIG_SCHED_CORE #ifdef CONFIG_SCHED_CORE int sched_core_idle_cpu(int cpu) int sched_core_idle_cpu(int cpu) { { struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu); if (sched_core_enabled(rq) && rq->curr == rq->idle) if (sched_core_enabled(rq) && rq->curr == rq->idle) return 1; return 1; return idle_cpu(cpu); return idle_cpu(cpu); } } #endif #endif #ifdef CONFIG_SMP #ifdef CONFIG_SMP /* /* * This function computes an effective utilization for the gi * This function computes an effective utilization for the gi * used for frequency selection given the linear relation: f * used for frequency selection given the linear relation: f * * * The scheduler tracks the following metrics: * The scheduler tracks the following metrics: * * * cpu_util_{cfs,rt,dl,irq}() * cpu_util_{cfs,rt,dl,irq}() * cpu_bw_dl() * cpu_bw_dl() * * * Where the cfs,rt and dl util numbers are tracked with the * Where the cfs,rt and dl util numbers are tracked with the * synchronized windows and are thus directly comparable. * synchronized windows and are thus directly comparable. * * * The cfs,rt,dl utilization are the running times measured w * The cfs,rt,dl utilization are the running times measured w * which excludes things like IRQ and steal-time. These latte * which excludes things like IRQ and steal-time. These latte * in the irq utilization. * in the irq utilization. * * * The DL bandwidth number otoh is not a measured metric but * The DL bandwidth number otoh is not a measured metric but * based on the task model parameters and gives the minimal u * based on the task model parameters and gives the minimal u * required to meet deadlines. * required to meet deadlines. */ */ unsigned long effective_cpu_util(int cpu, unsigned long util_ unsigned long effective_cpu_util(int cpu, unsigned long util_ enum cpu_util_type type, enum cpu_util_type type, struct task_struct *p) struct task_struct *p) { { unsigned long dl_util, util, irq, max; unsigned long dl_util, util, irq, max; struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu); max = arch_scale_cpu_capacity(cpu); max = arch_scale_cpu_capacity(cpu); if (!uclamp_is_used() && if (!uclamp_is_used() && type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq-> type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq-> return max; return max; } } /* /* * Early check to see if IRQ/steal time saturates the * Early check to see if IRQ/steal time saturates the * because of inaccuracies in how we track these -- s * because of inaccuracies in how we track these -- s * update_irq_load_avg(). * update_irq_load_avg(). */ */ irq = cpu_util_irq(rq); irq = cpu_util_irq(rq); if (unlikely(irq >= max)) if (unlikely(irq >= max)) return max; return max; /* /* * Because the time spend on RT/DL tasks is visible a * Because the time spend on RT/DL tasks is visible a * CFS tasks and we use the same metric to track the * CFS tasks and we use the same metric to track the * utilization (PELT windows are synchronized) we can * utilization (PELT windows are synchronized) we can * to obtain the CPU's actual utilization. * to obtain the CPU's actual utilization. * * * CFS and RT utilization can be boosted or capped, d * CFS and RT utilization can be boosted or capped, d * utilization clamp constraints requested by current * utilization clamp constraints requested by current * tasks. * tasks. 
* When there are no CFS RUNNABLE tasks, clamps are r * When there are no CFS RUNNABLE tasks, clamps are r * frequency will be gracefully reduced with the util * frequency will be gracefully reduced with the util */ */ util = util_cfs + cpu_util_rt(rq); util = util_cfs + cpu_util_rt(rq); if (type == FREQUENCY_UTIL) if (type == FREQUENCY_UTIL) util = uclamp_rq_util_with(rq, util, p); util = uclamp_rq_util_with(rq, util, p); dl_util = cpu_util_dl(rq); dl_util = cpu_util_dl(rq); /* /* * For frequency selection we do not make cpu_util_dl * For frequency selection we do not make cpu_util_dl * of this sum because we want to use cpu_bw_dl() lat * of this sum because we want to use cpu_bw_dl() lat * to check if the CFS+RT+DL sum is saturated (ie. no * to check if the CFS+RT+DL sum is saturated (ie. no * that we select f_max when there is no idle time. * that we select f_max when there is no idle time. * * * NOTE: numerical errors or stop class might cause u * NOTE: numerical errors or stop class might cause u * saturation when we should -- something for later. * saturation when we should -- something for later. */ */ if (util + dl_util >= max) if (util + dl_util >= max) return max; return max; /* /* * OTOH, for energy computation we need the estimated * OTOH, for energy computation we need the estimated * include util_dl and ignore dl_bw. * include util_dl and ignore dl_bw. */ */ if (type == ENERGY_UTIL) if (type == ENERGY_UTIL) util += dl_util; util += dl_util; /* /* * There is still idle time; further improve the numb * There is still idle time; further improve the numb * irq metric. Because IRQ/steal time is hidden from * irq metric. Because IRQ/steal time is hidden from * need to scale the task numbers: * need to scale the task numbers: * * * max - irq * max - irq * U' = irq + --------- * U * U' = irq + --------- * U * max * max */ */ util = scale_irq_capacity(util, irq, max); util = scale_irq_capacity(util, irq, max); util += irq; util += irq; /* /* * Bandwidth required by DEADLINE must always be gran * Bandwidth required by DEADLINE must always be gran * FAIR and RT, we use blocked utilization of IDLE CP * FAIR and RT, we use blocked utilization of IDLE CP * to gracefully reduce the frequency when no tasks s * to gracefully reduce the frequency when no tasks s * periods of time. * periods of time. * * * Ideally we would like to set bw_dl as min/guarante * Ideally we would like to set bw_dl as min/guarante * bw_dl as requested freq. However, cpufreq is not y * bw_dl as requested freq. However, cpufreq is not y * an interface. So, we only do the latter for now. * an interface. So, we only do the latter for now. */ */ if (type == FREQUENCY_UTIL) if (type == FREQUENCY_UTIL) util += cpu_bw_dl(rq); util += cpu_bw_dl(rq); return min(max, util); return min(max, util); } } unsigned long sched_cpu_util(int cpu) unsigned long sched_cpu_util(int cpu) { { return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENE return effective_cpu_util(cpu, cpu_util_cfs(cpu), ENE } } #endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */ /** /** * find_process_by_pid - find a process with a matching PID v * find_process_by_pid - find a process with a matching PID v * @pid: the pid in question. * @pid: the pid in question. * * * The task of @pid, if found. %NULL otherwise. * The task of @pid, if found. %NULL otherwise. */ */ static struct task_struct *find_process_by_pid(pid_t pid) static struct task_struct *find_process_by_pid(pid_t pid) { { return pid ? find_task_by_vpid(pid) : current; return pid ? 
find_task_by_vpid(pid) : current; } } /* /* * sched_setparam() passes in -1 for its policy, to let the f * sched_setparam() passes in -1 for its policy, to let the f * it calls know not to change it. * it calls know not to change it. */ */ #define SETPARAM_POLICY -1 #define SETPARAM_POLICY -1 static void __setscheduler_params(struct task_struct *p, static void __setscheduler_params(struct task_struct *p, const struct sched_attr *attr) const struct sched_attr *attr) { { int policy = attr->sched_policy; int policy = attr->sched_policy; if (policy == SETPARAM_POLICY) if (policy == SETPARAM_POLICY) policy = p->policy; policy = p->policy; p->policy = policy; p->policy = policy; if (dl_policy(policy)) if (dl_policy(policy)) __setparam_dl(p, attr); __setparam_dl(p, attr); else if (fair_policy(policy)) else if (fair_policy(policy)) p->static_prio = NICE_TO_PRIO(attr->sched_nic p->static_prio = NICE_TO_PRIO(attr->sched_nic /* /* * __sched_setscheduler() ensures attr->sched_priorit * __sched_setscheduler() ensures attr->sched_priorit * !rt_policy. Always setting this ensures that thing * !rt_policy. Always setting this ensures that thing * getparam()/getattr() don't report silly values for * getparam()/getattr() don't report silly values for */ */ p->rt_priority = attr->sched_priority; p->rt_priority = attr->sched_priority; p->normal_prio = normal_prio(p); p->normal_prio = normal_prio(p); set_load_weight(p, true); set_load_weight(p, true); } } /* /* * Check the target process has a UID that matches the curren * Check the target process has a UID that matches the curren */ */ static bool check_same_owner(struct task_struct *p) static bool check_same_owner(struct task_struct *p) { { const struct cred *cred = current_cred(), *pcred; const struct cred *cred = current_cred(), *pcred; bool match; bool match; rcu_read_lock(); rcu_read_lock(); pcred = __task_cred(p); pcred = __task_cred(p); match = (uid_eq(cred->euid, pcred->euid) || match = (uid_eq(cred->euid, pcred->euid) || uid_eq(cred->euid, pcred->uid)); uid_eq(cred->euid, pcred->uid)); rcu_read_unlock(); rcu_read_unlock(); return match; return match; } } /* /* * Allow unprivileged RT tasks to decrease priority. * Allow unprivileged RT tasks to decrease priority. 
* Only issue a capable test if needed and only once to avoid * Only issue a capable test if needed and only once to avoid * event on permitted non-privileged operations: * event on permitted non-privileged operations: */ */ static int user_check_sched_setscheduler(struct task_struct * static int user_check_sched_setscheduler(struct task_struct * const struct sched_a const struct sched_a int policy, int rese int policy, int rese { { if (fair_policy(policy)) { if (fair_policy(policy)) { if (attr->sched_nice < task_nice(p) && if (attr->sched_nice < task_nice(p) && !is_nice_reduction(p, attr->sched_nice)) !is_nice_reduction(p, attr->sched_nice)) goto req_priv; goto req_priv; } } if (rt_policy(policy)) { if (rt_policy(policy)) { unsigned long rlim_rtprio = task_rlimit(p, RL unsigned long rlim_rtprio = task_rlimit(p, RL /* Can't set/change the rt policy: */ /* Can't set/change the rt policy: */ if (policy != p->policy && !rlim_rtprio) if (policy != p->policy && !rlim_rtprio) goto req_priv; goto req_priv; /* Can't increase priority: */ /* Can't increase priority: */ if (attr->sched_priority > p->rt_priority && if (attr->sched_priority > p->rt_priority && attr->sched_priority > rlim_rtprio) attr->sched_priority > rlim_rtprio) goto req_priv; goto req_priv; } } /* /* * Can't set/change SCHED_DEADLINE policy at all for * Can't set/change SCHED_DEADLINE policy at all for * (safest behavior); in the future we would like to * (safest behavior); in the future we would like to * unprivileged DL tasks to increase their relative d * unprivileged DL tasks to increase their relative d * or reduce their runtime (both ways reducing utiliz * or reduce their runtime (both ways reducing utiliz */ */ if (dl_policy(policy)) if (dl_policy(policy)) goto req_priv; goto req_priv; /* /* * Treat SCHED_IDLE as nice 20. Only allow a switch t * Treat SCHED_IDLE as nice 20. 
Only allow a switch t * SCHED_NORMAL if the RLIMIT_NICE would normally per * SCHED_NORMAL if the RLIMIT_NICE would normally per */ */ if (task_has_idle_policy(p) && !idle_policy(policy)) if (task_has_idle_policy(p) && !idle_policy(policy)) if (!is_nice_reduction(p, task_nice(p))) if (!is_nice_reduction(p, task_nice(p))) goto req_priv; goto req_priv; } } /* Can't change other user's priorities: */ /* Can't change other user's priorities: */ if (!check_same_owner(p)) if (!check_same_owner(p)) goto req_priv; goto req_priv; /* Normal users shall not reset the sched_reset_on_fo /* Normal users shall not reset the sched_reset_on_fo if (p->sched_reset_on_fork && !reset_on_fork) if (p->sched_reset_on_fork && !reset_on_fork) goto req_priv; goto req_priv; return 0; return 0; req_priv: req_priv: if (!capable(CAP_SYS_NICE)) if (!capable(CAP_SYS_NICE)) return -EPERM; return -EPERM; return 0; return 0; } } static int __sched_setscheduler(struct task_struct *p, static int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr const struct sched_attr *attr bool user, bool pi) bool user, bool pi) { { int oldpolicy = -1, policy = attr->sched_policy; int oldpolicy = -1, policy = attr->sched_policy; int retval, oldprio, newprio, queued, running; int retval, oldprio, newprio, queued, running; const struct sched_class *prev_class; const struct sched_class *prev_class; struct balance_callback *head; struct balance_callback *head; struct rq_flags rf; struct rq_flags rf; int reset_on_fork; int reset_on_fork; int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUE int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUE struct rq *rq; struct rq *rq; bool cpuset_locked = false; bool cpuset_locked = false; /* The pi code expects interrupts enabled */ /* The pi code expects interrupts enabled */ BUG_ON(pi && in_interrupt()); BUG_ON(pi && in_interrupt()); recheck: recheck: /* Double check policy once rq lock held: */ /* Double check policy once rq lock held: */ if (policy < 0) { if (policy < 0) { reset_on_fork = p->sched_reset_on_fork; reset_on_fork = p->sched_reset_on_fork; policy = oldpolicy = p->policy; policy = oldpolicy = p->policy; } else { } else { reset_on_fork = !!(attr->sched_flags & SCHED_ reset_on_fork = !!(attr->sched_flags & SCHED_ if (!valid_policy(policy)) if (!valid_policy(policy)) return -EINVAL; return -EINVAL; } } if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG return -EINVAL; return -EINVAL; /* /* * Valid priorities for SCHED_FIFO and SCHED_RR are * Valid priorities for SCHED_FIFO and SCHED_RR are * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL, * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL, * SCHED_BATCH and SCHED_IDLE is 0. * SCHED_BATCH and SCHED_IDLE is 0. 
*/ */ if (attr->sched_priority > MAX_RT_PRIO-1) if (attr->sched_priority > MAX_RT_PRIO-1) return -EINVAL; return -EINVAL; if ((dl_policy(policy) && !__checkparam_dl(attr)) || if ((dl_policy(policy) && !__checkparam_dl(attr)) || (rt_policy(policy) != (attr->sched_priority != 0) (rt_policy(policy) != (attr->sched_priority != 0) return -EINVAL; return -EINVAL; if (user) { if (user) { retval = user_check_sched_setscheduler(p, att retval = user_check_sched_setscheduler(p, att if (retval) if (retval) return retval; return retval; if (attr->sched_flags & SCHED_FLAG_SUGOV) if (attr->sched_flags & SCHED_FLAG_SUGOV) return -EINVAL; return -EINVAL; retval = security_task_setscheduler(p); retval = security_task_setscheduler(p); if (retval) if (retval) return retval; return retval; } } /* Update task specific "requested" clamps */ /* Update task specific "requested" clamps */ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) { if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) { retval = uclamp_validate(p, attr); retval = uclamp_validate(p, attr); if (retval) if (retval) return retval; return retval; } } /* /* * SCHED_DEADLINE bandwidth accounting relies on stab * SCHED_DEADLINE bandwidth accounting relies on stab * information. * information. */ */ if (dl_policy(policy) || dl_policy(p->policy)) { if (dl_policy(policy) || dl_policy(p->policy)) { cpuset_locked = true; cpuset_locked = true; cpuset_lock(); cpuset_lock(); } } /* /* * Make sure no PI-waiters arrive (or leave) while we * Make sure no PI-waiters arrive (or leave) while we * changing the priority of the task: * changing the priority of the task: * * * To be able to change p->policy safely, the appropr * To be able to change p->policy safely, the appropr * runqueue lock must be held. * runqueue lock must be held. */ */ rq = task_rq_lock(p, &rf); rq = task_rq_lock(p, &rf); update_rq_clock(rq); update_rq_clock(rq); /* /* * Changing the policy of the stop threads its a very * Changing the policy of the stop threads its a very */ */ if (p == rq->stop) { if (p == rq->stop) { retval = -EINVAL; retval = -EINVAL; goto unlock; goto unlock; } } /* /* * If not changing anything there's no need to procee * If not changing anything there's no need to procee * but store a possible modification of reset_on_fork * but store a possible modification of reset_on_fork */ */ if (unlikely(policy == p->policy)) { if (unlikely(policy == p->policy)) { if (fair_policy(policy) && attr->sched_nice ! if (fair_policy(policy) && attr->sched_nice ! goto change; goto change; if (rt_policy(policy) && attr->sched_priority if (rt_policy(policy) && attr->sched_priority goto change; goto change; if (dl_policy(policy) && dl_param_changed(p, if (dl_policy(policy) && dl_param_changed(p, goto change; goto change; if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP goto change; goto change; p->sched_reset_on_fork = reset_on_fork; p->sched_reset_on_fork = reset_on_fork; retval = 0; retval = 0; goto unlock; goto unlock; } } change: change: if (user) { if (user) { #ifdef CONFIG_RT_GROUP_SCHED #ifdef CONFIG_RT_GROUP_SCHED /* /* * Do not allow realtime tasks into groups th * Do not allow realtime tasks into groups th * assigned. * assigned. 
*/ */ if (rt_bandwidth_enabled() && rt_policy(polic if (rt_bandwidth_enabled() && rt_policy(polic task_group(p)->rt_bandwidth.r task_group(p)->rt_bandwidth.r !task_group_is_autogroup(task !task_group_is_autogroup(task retval = -EPERM; retval = -EPERM; goto unlock; goto unlock; } } #endif #endif #ifdef CONFIG_SMP #ifdef CONFIG_SMP if (dl_bandwidth_enabled() && dl_policy(polic if (dl_bandwidth_enabled() && dl_policy(polic !(attr->sched_flags & SCHED_F !(attr->sched_flags & SCHED_F cpumask_t *span = rq->rd->span; cpumask_t *span = rq->rd->span; /* /* * Don't allow tasks with an affinity * Don't allow tasks with an affinity * the entire root_domain to become S * the entire root_domain to become S * will also fail if there's no bandw * will also fail if there's no bandw */ */ if (!cpumask_subset(span, p->cpus_ptr if (!cpumask_subset(span, p->cpus_ptr rq->rd->dl_bw.bw == 0) { rq->rd->dl_bw.bw == 0) { retval = -EPERM; retval = -EPERM; goto unlock; goto unlock; } } } } #endif #endif } } /* Re-check policy now with rq lock held: */ /* Re-check policy now with rq lock held: */ if (unlikely(oldpolicy != -1 && oldpolicy != p->polic if (unlikely(oldpolicy != -1 && oldpolicy != p->polic policy = oldpolicy = -1; policy = oldpolicy = -1; task_rq_unlock(rq, p, &rf); task_rq_unlock(rq, p, &rf); if (cpuset_locked) if (cpuset_locked) cpuset_unlock(); cpuset_unlock(); goto recheck; goto recheck; } } /* /* * If setscheduling to SCHED_DEADLINE (or changing th * If setscheduling to SCHED_DEADLINE (or changing th * of a SCHED_DEADLINE task) we need to check if enou * of a SCHED_DEADLINE task) we need to check if enou * is available. * is available. */ */ if ((dl_policy(policy) || dl_task(p)) && sched_dl_ove if ((dl_policy(policy) || dl_task(p)) && sched_dl_ove retval = -EBUSY; retval = -EBUSY; goto unlock; goto unlock; } } p->sched_reset_on_fork = reset_on_fork; p->sched_reset_on_fork = reset_on_fork; oldprio = p->prio; oldprio = p->prio; newprio = __normal_prio(policy, attr->sched_priority, newprio = __normal_prio(policy, attr->sched_priority, if (pi) { if (pi) { /* /* * Take priority boosted tasks into account. * Take priority boosted tasks into account. * effective priority is unchanged, we just s * effective priority is unchanged, we just s * normal parameters and do not touch the sch * normal parameters and do not touch the sch * the runqueue. This will be done when the t * the runqueue. This will be done when the t * itself. * itself. */ */ newprio = rt_effective_prio(p, newprio); newprio = rt_effective_prio(p, newprio); if (newprio == oldprio) if (newprio == oldprio) queue_flags &= ~DEQUEUE_MOVE; queue_flags &= ~DEQUEUE_MOVE; } } queued = task_on_rq_queued(p); queued = task_on_rq_queued(p); running = task_current(rq, p); running = task_current(rq, p); if (queued) if (queued) dequeue_task(rq, p, queue_flags); dequeue_task(rq, p, queue_flags); if (running) if (running) put_prev_task(rq, p); put_prev_task(rq, p); prev_class = p->sched_class; prev_class = p->sched_class; if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { __setscheduler_params(p, attr); __setscheduler_params(p, attr); __setscheduler_prio(p, newprio); __setscheduler_prio(p, newprio); } } __setscheduler_uclamp(p, attr); __setscheduler_uclamp(p, attr); if (queued) { if (queued) { /* /* * We enqueue to tail when the priority of a * We enqueue to tail when the priority of a * increased (user space view). * increased (user space view). 
*/ */ if (oldprio < p->prio) if (oldprio < p->prio) queue_flags |= ENQUEUE_HEAD; queue_flags |= ENQUEUE_HEAD; enqueue_task(rq, p, queue_flags); enqueue_task(rq, p, queue_flags); } } if (running) if (running) set_next_task(rq, p); set_next_task(rq, p); check_class_changed(rq, p, prev_class, oldprio); check_class_changed(rq, p, prev_class, oldprio); /* Avoid rq from going away on us: */ /* Avoid rq from going away on us: */ preempt_disable(); preempt_disable(); head = splice_balance_callbacks(rq); head = splice_balance_callbacks(rq); task_rq_unlock(rq, p, &rf); task_rq_unlock(rq, p, &rf); if (pi) { if (pi) { if (cpuset_locked) if (cpuset_locked) cpuset_unlock(); cpuset_unlock(); rt_mutex_adjust_pi(p); rt_mutex_adjust_pi(p); } } /* Run balance callbacks after we've adjusted the PI /* Run balance callbacks after we've adjusted the PI balance_callbacks(rq, head); balance_callbacks(rq, head); preempt_enable(); preempt_enable(); return 0; return 0; unlock: unlock: task_rq_unlock(rq, p, &rf); task_rq_unlock(rq, p, &rf); if (cpuset_locked) if (cpuset_locked) cpuset_unlock(); cpuset_unlock(); return retval; return retval; } } static int _sched_setscheduler(struct task_struct *p, int pol static int _sched_setscheduler(struct task_struct *p, int pol const struct sched_param *para const struct sched_param *para { { struct sched_attr attr = { struct sched_attr attr = { .sched_policy = policy, .sched_policy = policy, .sched_priority = param->sched_priority, .sched_priority = param->sched_priority, .sched_nice = PRIO_TO_NICE(p->static_prio .sched_nice = PRIO_TO_NICE(p->static_prio }; }; /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RE if ((policy != SETPARAM_POLICY) && (policy & SCHED_RE attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; policy &= ~SCHED_RESET_ON_FORK; policy &= ~SCHED_RESET_ON_FORK; attr.sched_policy = policy; attr.sched_policy = policy; } } return __sched_setscheduler(p, &attr, check, true); return __sched_setscheduler(p, &attr, check, true); } } /** /** * sched_setscheduler - change the scheduling policy and/or R * sched_setscheduler - change the scheduling policy and/or R * @p: the task in question. * @p: the task in question. * @policy: new policy. * @policy: new policy. * @param: structure containing the new RT priority. * @param: structure containing the new RT priority. * * * Use sched_set_fifo(), read its comment. * Use sched_set_fifo(), read its comment. * * * Return: 0 on success. An error code otherwise. * Return: 0 on success. An error code otherwise. * * * NOTE that the task may be already dead. * NOTE that the task may be already dead. 
 */
int sched_setscheduler(struct task_struct *p, int policy,
		       const struct sched_param *param)
{
	return _sched_setscheduler(p, policy, param, true);
}

int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
{
	return __sched_setscheduler(p, attr, true, true);
}

int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
{
	return __sched_setscheduler(p, attr, false, true);
}
EXPORT_SYMBOL_GPL(sched_setattr_nocheck);

/**
 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority
 *                              of a thread from kernelspace.
 * @p: the task in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Just like sched_setscheduler, only don't bother checking if the
 * current context has permission.  For example, this is needed in
 * stop_machine(): we create temporary high priority worker threads,
 * but our caller might not have that capability.
 *
 * Return: 0 on success. An error code otherwise.
 */
int sched_setscheduler_nocheck(struct task_struct *p, int policy,
			       const struct sched_param *param)
{
	return _sched_setscheduler(p, policy, param, false);
}
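/*
 * Illustrative sketch (not part of this file): how kernel code might use the
 * exported sched_setattr_nocheck() to move one of its own kthreads to
 * SCHED_DEADLINE, skipping the user-facing permission checks. The function
 * name example_make_deadline() and the 10ms/100ms parameters are made up for
 * illustration only.
 *
 *	#include <linux/sched.h>
 *	#include <uapi/linux/sched/types.h>
 *
 *	static int example_make_deadline(struct task_struct *tsk)
 *	{
 *		struct sched_attr attr = {
 *			.sched_policy   = SCHED_DEADLINE,
 *			.sched_runtime  =  10 * NSEC_PER_MSEC,
 *			.sched_deadline = 100 * NSEC_PER_MSEC,
 *			.sched_period   = 100 * NSEC_PER_MSEC,
 *		};
 *
 *		return sched_setattr_nocheck(tsk, &attr);
 *	}
 *
 * Because the "nocheck" variant bypasses user_check_sched_setscheduler() and
 * security_task_setscheduler(), it must only be applied to tasks the kernel
 * itself owns.
 */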
/*
 * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
 * incapable of resource management, which is the one thing an operating
 * system actually should be doing.
 *
 * This is of course the reason it is limited to privileged users only.
 *
 * Worse still; it is fundamentally impossible to compose static priority
 * workloads. You cannot take two correctly working static prio workloads
 * and smash them together and still expect them to work.
 *
 * For this reason 'all' FIFO tasks the kernel creates are basically at:
 *
 *   MAX_RT_PRIO / 2
 *
 * The administrator _MUST_ configure the system, the kernel simply doesn't
 * know enough information to make a sensible choice.
 */
void sched_set_fifo(struct task_struct *p)
{
	struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
	WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
}
EXPORT_SYMBOL_GPL(sched_set_fifo);

/*
 * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
 */
void sched_set_fifo_low(struct task_struct *p)
{
	struct sched_param sp = { .sched_priority = 1 };
	WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
}
EXPORT_SYMBOL_GPL(sched_set_fifo_low);

void sched_set_normal(struct task_struct *p, int nice)
{
	struct sched_attr attr = {
		.sched_policy = SCHED_NORMAL,
		.sched_nice = nice,
	};
	WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
}
EXPORT_SYMBOL_GPL(sched_set_normal);

static int
do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
{
	struct sched_param lparam;
	struct task_struct *p;
	int retval;

	if (!param || pid < 0)
		return -EINVAL;
	if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
		return -EFAULT;

	rcu_read_lock();
	retval = -ESRCH;
	p = find_process_by_pid(pid);
	if (likely(p))
		get_task_struct(p);
	rcu_read_unlock();

	if (likely(p)) {
		retval = sched_setscheduler(p, policy, &lparam);
		put_task_struct(p);
	}

	return retval;
}
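/*
 * Illustrative sketch (not part of this file): the intended pattern for the
 * helpers above is that kernel code creates its worker thread and then picks
 * one of the canned priorities instead of inventing its own. The kthread name
 * and the example_irq_thread_fn() callback are placeholders.
 *
 *	struct task_struct *tsk;
 *
 *	tsk = kthread_create(example_irq_thread_fn, NULL, "example-rt");
 *	if (IS_ERR(tsk))
 *		return PTR_ERR(tsk);
 *	sched_set_fifo(tsk);		// FIFO at MAX_RT_PRIO / 2
 *	wake_up_process(tsk);
 *
 * sched_set_fifo_low() is the variant for "above SCHED_NORMAL but below
 * whatever the administrator configured", and sched_set_normal() undoes
 * either of them.
 */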
/*
 * Mimics kernel/events/core.c perf_copy_attr().
 */
static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
{
	u32 size;
	int ret;

	/* Zero the full structure, so that a short copy will be nice: */
	memset(attr, 0, sizeof(*attr));

	ret = get_user(size, &uattr->size);
	if (ret)
		return ret;

	/* ABI compatibility quirk: */
	if (!size)
		size = SCHED_ATTR_SIZE_VER0;
	if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
		goto err_size;

	ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
	if (ret) {
		if (ret == -E2BIG)
			goto err_size;
		return ret;
	}

	if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
	    size < SCHED_ATTR_SIZE_VER1)
		return -EINVAL;

	/*
	 * XXX: Do we want to be lenient like existing syscalls; or do we want
	 * to be strict and return an error on out-of-bounds values?
	 */
	attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);

	return 0;

err_size:
	put_user(sizeof(*attr), &uattr->size);
	return -E2BIG;
}

static void get_params(struct task_struct *p, struct sched_attr *attr)
{
	if (task_has_dl_policy(p))
		__getparam_dl(p, attr);
	else if (task_has_rt_policy(p))
		attr->sched_priority = p->rt_priority;
	else
		attr->sched_nice = task_nice(p);
}

/**
 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
 * @pid: the pid in question.
 * @policy: new policy.
 * @param: structure containing the new RT priority.
 *
 * Return: 0 on success. An error code otherwise.
 */
SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
{
	if (policy < 0)
		return -EINVAL;

	return do_sched_setscheduler(pid, policy, param);
}
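/*
 * Illustrative sketch (not part of this file): what the syscall above looks
 * like from userspace through the glibc wrapper. It needs CAP_SYS_NICE or a
 * suitable RLIMIT_RTPRIO; the priority value 10 is arbitrary.
 *
 *	#include <sched.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		struct sched_param sp = { .sched_priority = 10 };
 *
 *		if (sched_setscheduler(0, SCHED_FIFO, &sp))	// pid 0: the caller
 *			perror("sched_setscheduler");
 *		return 0;
 *	}
 *
 * Passing SCHED_FIFO | SCHED_RESET_ON_FORK instead would additionally set
 * p->sched_reset_on_fork (see the legacy hack in _sched_setscheduler()), so
 * children of the task fall back to SCHED_NORMAL.
 */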
/**
 * sys_sched_setparam - set/change the RT priority of a thread
 * @pid: the pid in question.
 * @param: structure containing the new RT priority.
 *
 * Return: 0 on success. An error code otherwise.
 */
SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
{
	return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
}

/**
 * sys_sched_setattr - same as above, but with extended sched_attr
 * @pid: the pid in question.
 * @uattr: structure containing the extended parameters.
 * @flags: for future extension.
 */
SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
			       unsigned int, flags)
{
	struct sched_attr attr;
	struct task_struct *p;
	int retval;

	if (!uattr || pid < 0 || flags)
		return -EINVAL;

	retval = sched_copy_attr(uattr, &attr);
	if (retval)
		return retval;

	if ((int)attr.sched_policy < 0)
		return -EINVAL;
	if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
		attr.sched_policy = SETPARAM_POLICY;

	rcu_read_lock();
	retval = -ESRCH;
	p = find_process_by_pid(pid);
	if (likely(p))
		get_task_struct(p);
	rcu_read_unlock();

	if (likely(p)) {
		if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
			get_params(p, &attr);
		retval = sched_setattr(p, &attr);
		put_task_struct(p);
	}

	return retval;
}

/**
 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
 * @pid: the pid in question.
 *
 * Return: On success, the policy of the thread. Otherwise, a negative error
 * code.
 */
SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
{
	struct task_struct *p;
	int retval;

	if (pid < 0)
		return -EINVAL;

	retval = -ESRCH;
	rcu_read_lock();
	p = find_process_by_pid(pid);
	if (p) {
		retval = security_task_getscheduler(p);
		if (!retval)
			retval = p->policy
				| (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
	}
	rcu_read_unlock();
	return retval;
}

/**
 * sys_sched_getparam - get the RT priority of a thread
 * @pid: the pid in question.
 * @param: structure containing the RT priority.
 *
 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
 * code.
*/ */ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_para SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_para { { struct sched_param lp = { .sched_priority = 0 }; struct sched_param lp = { .sched_priority = 0 }; struct task_struct *p; struct task_struct *p; int retval; int retval; if (!param || pid < 0) if (!param || pid < 0) return -EINVAL; return -EINVAL; rcu_read_lock(); rcu_read_lock(); p = find_process_by_pid(pid); p = find_process_by_pid(pid); retval = -ESRCH; retval = -ESRCH; if (!p) if (!p) goto out_unlock; goto out_unlock; retval = security_task_getscheduler(p); retval = security_task_getscheduler(p); if (retval) if (retval) goto out_unlock; goto out_unlock; if (task_has_rt_policy(p)) if (task_has_rt_policy(p)) lp.sched_priority = p->rt_priority; lp.sched_priority = p->rt_priority; rcu_read_unlock(); rcu_read_unlock(); /* /* * This one might sleep, we cannot do it with a spinl * This one might sleep, we cannot do it with a spinl */ */ retval = copy_to_user(param, &lp, sizeof(*param)) ? - retval = copy_to_user(param, &lp, sizeof(*param)) ? - return retval; return retval; out_unlock: out_unlock: rcu_read_unlock(); rcu_read_unlock(); return retval; return retval; } } /* /* * Copy the kernel size attribute structure (which might be l * Copy the kernel size attribute structure (which might be l * than what user-space knows about) to user-space. * than what user-space knows about) to user-space. * * * Note that all cases are valid: user-space buffer can be la * Note that all cases are valid: user-space buffer can be la * smaller than the kernel-space buffer. The usual case is th * smaller than the kernel-space buffer. The usual case is th * have the same size. * have the same size. */ */ static int static int sched_attr_copy_to_user(struct sched_attr __user *uattr, sched_attr_copy_to_user(struct sched_attr __user *uattr, struct sched_attr *kattr, struct sched_attr *kattr, unsigned int usize) unsigned int usize) { { unsigned int ksize = sizeof(*kattr); unsigned int ksize = sizeof(*kattr); if (!access_ok(uattr, usize)) if (!access_ok(uattr, usize)) return -EFAULT; return -EFAULT; /* /* * sched_getattr() ABI forwards and backwards compati * sched_getattr() ABI forwards and backwards compati * * * If usize == ksize then we just copy everything to * If usize == ksize then we just copy everything to * * * If usize < ksize then we only copy as much as user * If usize < ksize then we only copy as much as user * this keeps ABI compatibility as well. We skip the * this keeps ABI compatibility as well. We skip the * * * If usize > ksize then user-space is using a newer * If usize > ksize then user-space is using a newer * which part the kernel doesn't know about. Just ign * which part the kernel doesn't know about. Just ign * detect the kernel's knowledge of attributes from t * detect the kernel's knowledge of attributes from t * which is set to ksize in this case. * which is set to ksize in this case. */ */ kattr->size = min(usize, ksize); kattr->size = min(usize, ksize); if (copy_to_user(uattr, kattr, kattr->size)) if (copy_to_user(uattr, kattr, kattr->size)) return -EFAULT; return -EFAULT; return 0; return 0; } } /** /** * sys_sched_getattr - similar to sched_getparam, but with sc * sys_sched_getattr - similar to sched_getparam, but with sc * @pid: the pid in question. * @pid: the pid in question. * @uattr: structure containing the extended parameters. * @uattr: structure containing the extended parameters. * @usize: sizeof(attr) for fwd/bwd comp. 
* @usize: sizeof(attr) for fwd/bwd comp. * @flags: for future extension. * @flags: for future extension. */ */ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr unsigned int, usize, unsigned int, flags) unsigned int, usize, unsigned int, flags) { { struct sched_attr kattr = { }; struct sched_attr kattr = { }; struct task_struct *p; struct task_struct *p; int retval; int retval; if (!uattr || pid < 0 || usize > PAGE_SIZE || if (!uattr || pid < 0 || usize > PAGE_SIZE || usize < SCHED_ATTR_SIZE_VER0 || flags) usize < SCHED_ATTR_SIZE_VER0 || flags) return -EINVAL; return -EINVAL; rcu_read_lock(); rcu_read_lock(); p = find_process_by_pid(pid); p = find_process_by_pid(pid); retval = -ESRCH; retval = -ESRCH; if (!p) if (!p) goto out_unlock; goto out_unlock; retval = security_task_getscheduler(p); retval = security_task_getscheduler(p); if (retval) if (retval) goto out_unlock; goto out_unlock; kattr.sched_policy = p->policy; kattr.sched_policy = p->policy; if (p->sched_reset_on_fork) if (p->sched_reset_on_fork) kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK get_params(p, &kattr); get_params(p, &kattr); kattr.sched_flags &= SCHED_FLAG_ALL; kattr.sched_flags &= SCHED_FLAG_ALL; #ifdef CONFIG_UCLAMP_TASK #ifdef CONFIG_UCLAMP_TASK /* /* * This could race with another potential updater, bu * This could race with another potential updater, bu * because it'll correctly read the old or the new va * because it'll correctly read the old or the new va * to guarantee who wins the race as long as it doesn * to guarantee who wins the race as long as it doesn */ */ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].valu kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].valu kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].valu kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].valu #endif #endif rcu_read_unlock(); rcu_read_unlock(); return sched_attr_copy_to_user(uattr, &kattr, usize); return sched_attr_copy_to_user(uattr, &kattr, usize); out_unlock: out_unlock: rcu_read_unlock(); rcu_read_unlock(); return retval; return retval; } } #ifdef CONFIG_SMP #ifdef CONFIG_SMP int dl_task_check_affinity(struct task_struct *p, const struc int dl_task_check_affinity(struct task_struct *p, const struc { { int ret = 0; int ret = 0; /* /* * If the task isn't a deadline task or admission con * If the task isn't a deadline task or admission con * disabled then we don't care about affinity changes * disabled then we don't care about affinity changes */ */ if (!task_has_dl_policy(p) || !dl_bandwidth_enabled() if (!task_has_dl_policy(p) || !dl_bandwidth_enabled() return 0; return 0; /* /* * Since bandwidth control happens on root_domain bas * Since bandwidth control happens on root_domain bas * if admission test is enabled, we only admit -deadl * if admission test is enabled, we only admit -deadl * tasks allowed to run on all the CPUs in the task's * tasks allowed to run on all the CPUs in the task's * root_domain. * root_domain. 
*/ */ rcu_read_lock(); rcu_read_lock(); if (!cpumask_subset(task_rq(p)->rd->span, mask)) if (!cpumask_subset(task_rq(p)->rd->span, mask)) ret = -EBUSY; ret = -EBUSY; rcu_read_unlock(); rcu_read_unlock(); return ret; return ret; } } #endif #endif static int static int __sched_setaffinity(struct task_struct *p, struct affinity_co __sched_setaffinity(struct task_struct *p, struct affinity_co { { int retval; int retval; cpumask_var_t cpus_allowed, new_mask; cpumask_var_t cpus_allowed, new_mask; if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) return -ENOMEM; return -ENOMEM; if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { retval = -ENOMEM; retval = -ENOMEM; goto out_free_cpus_allowed; goto out_free_cpus_allowed; } } cpuset_cpus_allowed(p, cpus_allowed); cpuset_cpus_allowed(p, cpus_allowed); cpumask_and(new_mask, ctx->new_mask, cpus_allowed); cpumask_and(new_mask, ctx->new_mask, cpus_allowed); ctx->new_mask = new_mask; ctx->new_mask = new_mask; ctx->flags |= SCA_CHECK; ctx->flags |= SCA_CHECK; retval = dl_task_check_affinity(p, new_mask); retval = dl_task_check_affinity(p, new_mask); if (retval) if (retval) goto out_free_new_mask; goto out_free_new_mask; retval = __set_cpus_allowed_ptr(p, ctx); retval = __set_cpus_allowed_ptr(p, ctx); if (retval) if (retval) goto out_free_new_mask; goto out_free_new_mask; cpuset_cpus_allowed(p, cpus_allowed); cpuset_cpus_allowed(p, cpus_allowed); if (!cpumask_subset(new_mask, cpus_allowed)) { if (!cpumask_subset(new_mask, cpus_allowed)) { /* /* * We must have raced with a concurrent cpuse * We must have raced with a concurrent cpuse * Just reset the cpumask to the cpuset's cpu * Just reset the cpumask to the cpuset's cpu */ */ cpumask_copy(new_mask, cpus_allowed); cpumask_copy(new_mask, cpus_allowed); /* /* * If SCA_USER is set, a 2nd call to __set_cp * If SCA_USER is set, a 2nd call to __set_cp * will restore the previous user_cpus_ptr va * will restore the previous user_cpus_ptr va * * * In the unlikely event a previous user_cpus * In the unlikely event a previous user_cpus * we need to further restrict the mask to wh * we need to further restrict the mask to wh * by that old user_cpus_ptr. * by that old user_cpus_ptr. 
*/ */ if (unlikely((ctx->flags & SCA_USER) && ctx-> if (unlikely((ctx->flags & SCA_USER) && ctx-> bool empty = !cpumask_and(new_mask, n bool empty = !cpumask_and(new_mask, n ctx->user_m ctx->user_m if (WARN_ON_ONCE(empty)) if (WARN_ON_ONCE(empty)) cpumask_copy(new_mask, cpus_a cpumask_copy(new_mask, cpus_a } } __set_cpus_allowed_ptr(p, ctx); __set_cpus_allowed_ptr(p, ctx); retval = -EINVAL; retval = -EINVAL; } } out_free_new_mask: out_free_new_mask: free_cpumask_var(new_mask); free_cpumask_var(new_mask); out_free_cpus_allowed: out_free_cpus_allowed: free_cpumask_var(cpus_allowed); free_cpumask_var(cpus_allowed); return retval; return retval; } } long sched_setaffinity(pid_t pid, const struct cpumask *in_ma long sched_setaffinity(pid_t pid, const struct cpumask *in_ma { { struct affinity_context ac; struct affinity_context ac; struct cpumask *user_mask; struct cpumask *user_mask; struct task_struct *p; struct task_struct *p; int retval; int retval; rcu_read_lock(); rcu_read_lock(); p = find_process_by_pid(pid); p = find_process_by_pid(pid); if (!p) { if (!p) { rcu_read_unlock(); rcu_read_unlock(); return -ESRCH; return -ESRCH; } } /* Prevent p going away */ /* Prevent p going away */ get_task_struct(p); get_task_struct(p); rcu_read_unlock(); rcu_read_unlock(); if (p->flags & PF_NO_SETAFFINITY) { if (p->flags & PF_NO_SETAFFINITY) { retval = -EINVAL; retval = -EINVAL; goto out_put_task; goto out_put_task; } } if (!check_same_owner(p)) { if (!check_same_owner(p)) { rcu_read_lock(); rcu_read_lock(); if (!ns_capable(__task_cred(p)->user_ns, CAP_ if (!ns_capable(__task_cred(p)->user_ns, CAP_ rcu_read_unlock(); rcu_read_unlock(); retval = -EPERM; retval = -EPERM; goto out_put_task; goto out_put_task; } } rcu_read_unlock(); rcu_read_unlock(); } } retval = security_task_setscheduler(p); retval = security_task_setscheduler(p); if (retval) if (retval) goto out_put_task; goto out_put_task; /* /* * With non-SMP configs, user_cpus_ptr/user_mask isn' * With non-SMP configs, user_cpus_ptr/user_mask isn' * alloc_user_cpus_ptr() returns NULL. * alloc_user_cpus_ptr() returns NULL. */ */ user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE); user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE); if (user_mask) { if (user_mask) { cpumask_copy(user_mask, in_mask); cpumask_copy(user_mask, in_mask); } else if (IS_ENABLED(CONFIG_SMP)) { } else if (IS_ENABLED(CONFIG_SMP)) { retval = -ENOMEM; retval = -ENOMEM; goto out_put_task; goto out_put_task; } } ac = (struct affinity_context){ ac = (struct affinity_context){ .new_mask = in_mask, .new_mask = in_mask, .user_mask = user_mask, .user_mask = user_mask, .flags = SCA_USER, .flags = SCA_USER, }; }; retval = __sched_setaffinity(p, &ac); retval = __sched_setaffinity(p, &ac); kfree(ac.user_mask); kfree(ac.user_mask); out_put_task: out_put_task: put_task_struct(p); put_task_struct(p); return retval; return retval; } } static int get_user_cpu_mask(unsigned long __user *user_mask_ static int get_user_cpu_mask(unsigned long __user *user_mask_ struct cpumask *new_mask) struct cpumask *new_mask) { { if (len < cpumask_size()) if (len < cpumask_size()) cpumask_clear(new_mask); cpumask_clear(new_mask); else if (len > cpumask_size()) else if (len > cpumask_size()) len = cpumask_size(); len = cpumask_size(); return copy_from_user(new_mask, user_mask_ptr, len) ? return copy_from_user(new_mask, user_mask_ptr, len) ? 
}

/**
 * sys_sched_setaffinity - set the CPU affinity of a process
 * @pid: pid of the process
 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
 * @user_mask_ptr: user-space pointer to the new CPU mask
 *
 * Return: 0 on success. An error code otherwise.
 */
SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
		unsigned long __user *, user_mask_ptr)
{
	cpumask_var_t new_mask;
	int retval;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
	if (retval == 0)
		retval = sched_setaffinity(pid, new_mask);
	free_cpumask_var(new_mask);
	return retval;
}

long sched_getaffinity(pid_t pid, struct cpumask *mask)
{
	struct task_struct *p;
	unsigned long flags;
	int retval;

	rcu_read_lock();

	retval = -ESRCH;
	p = find_process_by_pid(pid);
	if (!p)
		goto out_unlock;

	retval = security_task_getscheduler(p);
	if (retval)
		goto out_unlock;

	raw_spin_lock_irqsave(&p->pi_lock, flags);
	cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

out_unlock:
	rcu_read_unlock();

	return retval;
}
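/*
 * Illustrative sketch (not part of this file): pinning the calling thread to
 * CPU 0 from userspace with the glibc wrappers around these syscalls.
 *
 *	#define _GNU_SOURCE
 *	#include <sched.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		cpu_set_t set;
 *
 *		CPU_ZERO(&set);
 *		CPU_SET(0, &set);
 *		if (sched_setaffinity(0, sizeof(set), &set))
 *			perror("sched_setaffinity");
 *
 *		CPU_ZERO(&set);
 *		if (sched_getaffinity(0, sizeof(set), &set) == 0)
 *			printf("CPU0 allowed: %d\n", CPU_ISSET(0, &set));
 *		return 0;
 *	}
 *
 * Note that the glibc wrapper hides the "return value is the copied mask
 * size" detail of the raw sys_sched_getaffinity() below; the wrapper simply
 * returns 0 on success.
 */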
/**
 * sys_sched_getaffinity - get the CPU affinity of a process
 * @pid: pid of the process
 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
 * @user_mask_ptr: user-space pointer to hold the current CPU mask
 *
 * Return: size of CPU mask copied to user_mask_ptr on success. An
 * error code otherwise.
 */
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
		unsigned long __user *, user_mask_ptr)
{
	int ret;
	cpumask_var_t mask;

	if ((len * BITS_PER_BYTE) < nr_cpu_ids)
		return -EINVAL;
	if (len & (sizeof(unsigned long)-1))
		return -EINVAL;

	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	ret = sched_getaffinity(pid, mask);
	if (ret == 0) {
		unsigned int retlen = min(len, cpumask_size());

		if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen))
			ret = -EFAULT;
		else
			ret = retlen;
	}
	free_cpumask_var(mask);

	return ret;
}

static void do_sched_yield(void)
{
	struct rq_flags rf;
	struct rq *rq;

	rq = this_rq_lock_irq(&rf);

	schedstat_inc(rq->yld_count);
	current->sched_class->yield_task(rq);

	preempt_disable();
	rq_unlock_irq(rq, &rf);
	sched_preempt_enable_no_resched();

	schedule();
}

/**
 * sys_sched_yield - yield the current processor to other threads.
 *
 * This function yields the current CPU to other tasks. If there are no
 * other threads running on this CPU then this function will return.
 *
 * Return: 0.
 */
SYSCALL_DEFINE0(sched_yield)
{
	do_sched_yield();
	return 0;
}
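/*
 * Illustrative sketch (not part of this file): the in-kernel counterpart to
 * sched_yield() is not yield() but cond_resched(), used to break up long
 * kernel-side loops on configurations that don't preempt kernel code. The
 * example_scan() helper, struct item and process_item() are made up for
 * illustration.
 *
 *	static void example_scan(struct item *items, unsigned long nr_items)
 *	{
 *		unsigned long i;
 *
 *		for (i = 0; i < nr_items; i++) {
 *			process_item(&items[i]);
 *			cond_resched();		// may end up in __cond_resched()
 *		}
 *	}
 *
 * Under PREEMPT_DYNAMIC with preempt=full, cond_resched() is patched to a
 * RET0 stub (see the table further down), because the kernel already
 * preempts wherever it is safe to do so.
 */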
#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
int __sched __cond_resched(void)
{
	if (should_resched(0)) {
		preempt_schedule_common();
		return 1;
	}
	/*
	 * In preemptible kernels, ->rcu_read_lock_nesting tells the tick
	 * whether the current CPU is in an RCU read-side critical section,
	 * so the tick can report quiescent states even for CPUs looping
	 * in kernel context.  In contrast, in non-preemptible kernels,
	 * RCU readers leave no in-memory hints, which means that CPU-bound
	 * processes executing in kernel context might never report an
	 * RCU quiescent state.  Therefore, the following code causes
	 * cond_resched() to report a quiescent state, but only when RCU
	 * is in urgent need of one.
	 */
#ifndef CONFIG_PREEMPT_RCU
	rcu_all_qs();
#endif
	return 0;
}
EXPORT_SYMBOL(__cond_resched);
#endif

#ifdef CONFIG_PREEMPT_DYNAMIC
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
#define cond_resched_dynamic_enabled	__cond_resched
#define cond_resched_dynamic_disabled	((void *)&__static_call_return0)
DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
EXPORT_STATIC_CALL_TRAMP(cond_resched);

#define might_resched_dynamic_enabled	__cond_resched
#define might_resched_dynamic_disabled	((void *)&__static_call_return0)
DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
EXPORT_STATIC_CALL_TRAMP(might_resched);
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched);
int __sched dynamic_cond_resched(void)
{
	klp_sched_try_switch();
	if (!static_branch_unlikely(&sk_dynamic_cond_resched))
		return 0;
	return __cond_resched();
}
EXPORT_SYMBOL(dynamic_cond_resched);

static DEFINE_STATIC_KEY_FALSE(sk_dynamic_might_resched);
int __sched dynamic_might_resched(void)
{
	if (!static_branch_unlikely(&sk_dynamic_might_resched))
		return 0;
	return __cond_resched();
}
EXPORT_SYMBOL(dynamic_might_resched);
#endif
#endif

/*
 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
 * call schedule, and on return reacquire the lock.
 *
 * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
 * operations here to prevent schedule() from being called twice (once via
 * spin_unlock(), once by hand).
*/ */ int __cond_resched_lock(spinlock_t *lock) int __cond_resched_lock(spinlock_t *lock) { { int resched = should_resched(PREEMPT_LOCK_OFFSET); int resched = should_resched(PREEMPT_LOCK_OFFSET); int ret = 0; int ret = 0; lockdep_assert_held(lock); lockdep_assert_held(lock); if (spin_needbreak(lock) || resched) { if (spin_needbreak(lock) || resched) { spin_unlock(lock); spin_unlock(lock); if (!_cond_resched()) if (!_cond_resched()) cpu_relax(); cpu_relax(); ret = 1; ret = 1; spin_lock(lock); spin_lock(lock); } } return ret; return ret; } } EXPORT_SYMBOL(__cond_resched_lock); EXPORT_SYMBOL(__cond_resched_lock); int __cond_resched_rwlock_read(rwlock_t *lock) int __cond_resched_rwlock_read(rwlock_t *lock) { { int resched = should_resched(PREEMPT_LOCK_OFFSET); int resched = should_resched(PREEMPT_LOCK_OFFSET); int ret = 0; int ret = 0; lockdep_assert_held_read(lock); lockdep_assert_held_read(lock); if (rwlock_needbreak(lock) || resched) { if (rwlock_needbreak(lock) || resched) { read_unlock(lock); read_unlock(lock); if (!_cond_resched()) if (!_cond_resched()) cpu_relax(); cpu_relax(); ret = 1; ret = 1; read_lock(lock); read_lock(lock); } } return ret; return ret; } } EXPORT_SYMBOL(__cond_resched_rwlock_read); EXPORT_SYMBOL(__cond_resched_rwlock_read); int __cond_resched_rwlock_write(rwlock_t *lock) int __cond_resched_rwlock_write(rwlock_t *lock) { { int resched = should_resched(PREEMPT_LOCK_OFFSET); int resched = should_resched(PREEMPT_LOCK_OFFSET); int ret = 0; int ret = 0; lockdep_assert_held_write(lock); lockdep_assert_held_write(lock); if (rwlock_needbreak(lock) || resched) { if (rwlock_needbreak(lock) || resched) { write_unlock(lock); write_unlock(lock); if (!_cond_resched()) if (!_cond_resched()) cpu_relax(); cpu_relax(); ret = 1; ret = 1; write_lock(lock); write_lock(lock); } } return ret; return ret; } } EXPORT_SYMBOL(__cond_resched_rwlock_write); EXPORT_SYMBOL(__cond_resched_rwlock_write); #ifdef CONFIG_PREEMPT_DYNAMIC #ifdef CONFIG_PREEMPT_DYNAMIC #ifdef CONFIG_GENERIC_ENTRY #ifdef CONFIG_GENERIC_ENTRY #include #include #endif #endif /* /* * SC:cond_resched * SC:cond_resched * SC:might_resched * SC:might_resched * SC:preempt_schedule * SC:preempt_schedule * SC:preempt_schedule_notrace * SC:preempt_schedule_notrace * SC:irqentry_exit_cond_resched * SC:irqentry_exit_cond_resched * * * * * NONE: * NONE: * cond_resched <- __cond_resched * cond_resched <- __cond_resched * might_resched <- RET0 * might_resched <- RET0 * preempt_schedule <- NOP * preempt_schedule <- NOP * preempt_schedule_notrace <- NOP * preempt_schedule_notrace <- NOP * irqentry_exit_cond_resched <- NOP * irqentry_exit_cond_resched <- NOP * * * VOLUNTARY: * VOLUNTARY: * cond_resched <- __cond_resched * cond_resched <- __cond_resched * might_resched <- __cond_resched * might_resched <- __cond_resched * preempt_schedule <- NOP * preempt_schedule <- NOP * preempt_schedule_notrace <- NOP * preempt_schedule_notrace <- NOP * irqentry_exit_cond_resched <- NOP * irqentry_exit_cond_resched <- NOP * * * FULL: * FULL: * cond_resched <- RET0 * cond_resched <- RET0 * might_resched <- RET0 * might_resched <- RET0 * preempt_schedule <- preempt_schedule * preempt_schedule <- preempt_schedule * preempt_schedule_notrace <- preempt_schedule_notrace * preempt_schedule_notrace <- preempt_schedule_notrace * irqentry_exit_cond_resched <- irqentry_exit_cond_resched * irqentry_exit_cond_resched <- irqentry_exit_cond_resched */ */ enum { enum { preempt_dynamic_undefined = -1, preempt_dynamic_undefined = -1, preempt_dynamic_none, 
preempt_dynamic_none, preempt_dynamic_voluntary, preempt_dynamic_voluntary, preempt_dynamic_full, preempt_dynamic_full, }; }; int preempt_dynamic_mode = preempt_dynamic_undefined; int preempt_dynamic_mode = preempt_dynamic_undefined; int sched_dynamic_mode(const char *str) int sched_dynamic_mode(const char *str) { { if (!strcmp(str, "none")) if (!strcmp(str, "none")) return preempt_dynamic_none; return preempt_dynamic_none; if (!strcmp(str, "voluntary")) if (!strcmp(str, "voluntary")) return preempt_dynamic_voluntary; return preempt_dynamic_voluntary; if (!strcmp(str, "full")) if (!strcmp(str, "full")) return preempt_dynamic_full; return preempt_dynamic_full; return -EINVAL; return -EINVAL; } } #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) #define preempt_dynamic_enable(f) static_call_update(f, #define preempt_dynamic_enable(f) static_call_update(f, #define preempt_dynamic_disable(f) static_call_update(f, #define preempt_dynamic_disable(f) static_call_update(f, #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) #define preempt_dynamic_enable(f) static_key_enable(&sk #define preempt_dynamic_enable(f) static_key_enable(&sk #define preempt_dynamic_disable(f) static_key_disable(&s #define preempt_dynamic_disable(f) static_key_disable(&s #else #else #error "Unsupported PREEMPT_DYNAMIC mechanism" #error "Unsupported PREEMPT_DYNAMIC mechanism" #endif #endif static DEFINE_MUTEX(sched_dynamic_mutex); static DEFINE_MUTEX(sched_dynamic_mutex); static bool klp_override; static bool klp_override; static void __sched_dynamic_update(int mode) static void __sched_dynamic_update(int mode) { { /* /* * Avoid {NONE,VOLUNTARY} -> FULL transitions from ev * Avoid {NONE,VOLUNTARY} -> FULL transitions from ev * the ZERO state, which is invalid. * the ZERO state, which is invalid. 
*/ */ if (!klp_override) if (!klp_override) preempt_dynamic_enable(cond_resched); preempt_dynamic_enable(cond_resched); preempt_dynamic_enable(might_resched); preempt_dynamic_enable(might_resched); preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notrace); preempt_dynamic_enable(preempt_schedule_notrace); preempt_dynamic_enable(irqentry_exit_cond_resched); preempt_dynamic_enable(irqentry_exit_cond_resched); switch (mode) { switch (mode) { case preempt_dynamic_none: case preempt_dynamic_none: if (!klp_override) if (!klp_override) preempt_dynamic_enable(cond_resched); preempt_dynamic_enable(cond_resched); preempt_dynamic_disable(might_resched); preempt_dynamic_disable(might_resched); preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule_notr preempt_dynamic_disable(preempt_schedule_notr preempt_dynamic_disable(irqentry_exit_cond_re preempt_dynamic_disable(irqentry_exit_cond_re if (mode != preempt_dynamic_mode) if (mode != preempt_dynamic_mode) pr_info("Dynamic Preempt: none\n"); pr_info("Dynamic Preempt: none\n"); break; break; case preempt_dynamic_voluntary: case preempt_dynamic_voluntary: if (!klp_override) if (!klp_override) preempt_dynamic_enable(cond_resched); preempt_dynamic_enable(cond_resched); preempt_dynamic_enable(might_resched); preempt_dynamic_enable(might_resched); preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule); preempt_dynamic_disable(preempt_schedule_notr preempt_dynamic_disable(preempt_schedule_notr preempt_dynamic_disable(irqentry_exit_cond_re preempt_dynamic_disable(irqentry_exit_cond_re if (mode != preempt_dynamic_mode) if (mode != preempt_dynamic_mode) pr_info("Dynamic Preempt: voluntary\n pr_info("Dynamic Preempt: voluntary\n break; break; case preempt_dynamic_full: case preempt_dynamic_full: if (!klp_override) if (!klp_override) preempt_dynamic_disable(cond_resched) preempt_dynamic_disable(cond_resched) preempt_dynamic_disable(might_resched); preempt_dynamic_disable(might_resched); preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule); preempt_dynamic_enable(preempt_schedule_notra preempt_dynamic_enable(preempt_schedule_notra preempt_dynamic_enable(irqentry_exit_cond_res preempt_dynamic_enable(irqentry_exit_cond_res if (mode != preempt_dynamic_mode) if (mode != preempt_dynamic_mode) pr_info("Dynamic Preempt: full\n"); pr_info("Dynamic Preempt: full\n"); break; break; } } preempt_dynamic_mode = mode; preempt_dynamic_mode = mode; } } void sched_dynamic_update(int mode) void sched_dynamic_update(int mode) { { mutex_lock(&sched_dynamic_mutex); mutex_lock(&sched_dynamic_mutex); __sched_dynamic_update(mode); __sched_dynamic_update(mode); mutex_unlock(&sched_dynamic_mutex); mutex_unlock(&sched_dynamic_mutex); } } #ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL #ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL static int klp_cond_resched(void) static int klp_cond_resched(void) { { __klp_sched_try_switch(); __klp_sched_try_switch(); return __cond_resched(); return __cond_resched(); } } void sched_dynamic_klp_enable(void) void sched_dynamic_klp_enable(void) { { mutex_lock(&sched_dynamic_mutex); mutex_lock(&sched_dynamic_mutex); klp_override = true; klp_override = true; static_call_update(cond_resched, klp_cond_resched); static_call_update(cond_resched, klp_cond_resched); mutex_unlock(&sched_dynamic_mutex); mutex_unlock(&sched_dynamic_mutex); } } void sched_dynamic_klp_disable(void) 
void sched_dynamic_klp_disable(void) { { mutex_lock(&sched_dynamic_mutex); mutex_lock(&sched_dynamic_mutex); klp_override = false; klp_override = false; __sched_dynamic_update(preempt_dynamic_mode); __sched_dynamic_update(preempt_dynamic_mode); mutex_unlock(&sched_dynamic_mutex); mutex_unlock(&sched_dynamic_mutex); } } #endif /* CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ #endif /* CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ static int __init setup_preempt_mode(char *str) static int __init setup_preempt_mode(char *str) { { int mode = sched_dynamic_mode(str); int mode = sched_dynamic_mode(str); if (mode < 0) { if (mode < 0) { pr_warn("Dynamic Preempt: unsupported mode: % pr_warn("Dynamic Preempt: unsupported mode: % return 0; return 0; } } sched_dynamic_update(mode); sched_dynamic_update(mode); return 1; return 1; } } __setup("preempt=", setup_preempt_mode); __setup("preempt=", setup_preempt_mode); static void __init preempt_dynamic_init(void) static void __init preempt_dynamic_init(void) { { if (preempt_dynamic_mode == preempt_dynamic_undefined if (preempt_dynamic_mode == preempt_dynamic_undefined if (IS_ENABLED(CONFIG_PREEMPT_NONE)) { if (IS_ENABLED(CONFIG_PREEMPT_NONE)) { sched_dynamic_update(preempt_dynamic_ sched_dynamic_update(preempt_dynamic_ } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTAR } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTAR sched_dynamic_update(preempt_dynamic_ sched_dynamic_update(preempt_dynamic_ } else { } else { /* Default static call setting, nothi /* Default static call setting, nothi WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEM WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEM preempt_dynamic_mode = preempt_dynami preempt_dynamic_mode = preempt_dynami pr_info("Dynamic Preempt: full\n"); pr_info("Dynamic Preempt: full\n"); } } } } } } #define PREEMPT_MODEL_ACCESSOR(mode) \ #define PREEMPT_MODEL_ACCESSOR(mode) \ bool preempt_model_##mode(void) bool preempt_model_##mode(void) { { WARN_ON_ONCE(preempt_dynamic_mode == preempt_ WARN_ON_ONCE(preempt_dynamic_mode == preempt_ return preempt_dynamic_mode == preempt_dynami return preempt_dynamic_mode == preempt_dynami } } EXPORT_SYMBOL_GPL(preempt_model_##mode) EXPORT_SYMBOL_GPL(preempt_model_##mode) PREEMPT_MODEL_ACCESSOR(none); PREEMPT_MODEL_ACCESSOR(none); PREEMPT_MODEL_ACCESSOR(voluntary); PREEMPT_MODEL_ACCESSOR(voluntary); PREEMPT_MODEL_ACCESSOR(full); PREEMPT_MODEL_ACCESSOR(full); #else /* !CONFIG_PREEMPT_DYNAMIC */ #else /* !CONFIG_PREEMPT_DYNAMIC */ static inline void preempt_dynamic_init(void) { } static inline void preempt_dynamic_init(void) { } #endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */ #endif /* #ifdef CONFIG_PREEMPT_DYNAMIC */ /* < * task_is_pi_boosted - Check if task has been PI boosted. < * @p: Task to check. < * < * Return true if task is subject to priority inheritance. < */ < bool task_is_pi_boosted(const struct task_struct *p) < { < int prio = p->prio; < < if (!rt_prio(prio)) < return false; < return prio != p->normal_prio; < } < < /** /** * yield - yield the current processor to other threads. * yield - yield the current processor to other threads. * * * Do not ever use this function, there's a 99% chance you're * Do not ever use this function, there's a 99% chance you're * * * The scheduler is at all times free to pick the calling tas * The scheduler is at all times free to pick the calling tas * eligible task to run, if removing the yield() call from yo * eligible task to run, if removing the yield() call from yo * it, it's already broken. * it, it's already broken. 
* * * Typical broken usage is: * Typical broken usage is: * * * while (!event) * while (!event) * yield(); * yield(); * * * where one assumes that yield() will let 'the other' proces * where one assumes that yield() will let 'the other' proces * make event true. If the current task is a SCHED_FIFO task * make event true. If the current task is a SCHED_FIFO task * happen. Never use yield() as a progress guarantee!! * happen. Never use yield() as a progress guarantee!! * * * If you want to use yield() to wait for something, use wait * If you want to use yield() to wait for something, use wait * If you want to use yield() to be 'nice' for others, use co * If you want to use yield() to be 'nice' for others, use co * If you still want to use yield(), do not! * If you still want to use yield(), do not! */ */ void __sched yield(void) void __sched yield(void) { { set_current_state(TASK_RUNNING); set_current_state(TASK_RUNNING); do_sched_yield(); do_sched_yield(); } } EXPORT_SYMBOL(yield); EXPORT_SYMBOL(yield); /** /** * yield_to - yield the current processor to another thread i * yield_to - yield the current processor to another thread i * your thread group, or accelerate that thread toward the * your thread group, or accelerate that thread toward the * processor it's on. * processor it's on. * @p: target task * @p: target task * @preempt: whether task preemption is allowed or not * @preempt: whether task preemption is allowed or not * * * It's the caller's job to ensure that the target task struc * It's the caller's job to ensure that the target task struc * can't go away on us before we can do any checks. * can't go away on us before we can do any checks. * * * Return: * Return: * true (>0) if we indeed boosted the target task. * true (>0) if we indeed boosted the target task. * false (0) if we failed to boost the target. * false (0) if we failed to boost the target. * -ESRCH if there's no task to yield to. * -ESRCH if there's no task to yield to. 
*/ */ int __sched yield_to(struct task_struct *p, bool preempt) int __sched yield_to(struct task_struct *p, bool preempt) { { struct task_struct *curr = current; struct task_struct *curr = current; struct rq *rq, *p_rq; struct rq *rq, *p_rq; unsigned long flags; unsigned long flags; int yielded = 0; int yielded = 0; local_irq_save(flags); local_irq_save(flags); rq = this_rq(); rq = this_rq(); again: again: p_rq = task_rq(p); p_rq = task_rq(p); /* /* * If we're the only runnable task on the rq and targ * If we're the only runnable task on the rq and targ * has only one task, there's absolutely no point in * has only one task, there's absolutely no point in */ */ if (rq->nr_running == 1 && p_rq->nr_running == 1) { if (rq->nr_running == 1 && p_rq->nr_running == 1) { yielded = -ESRCH; yielded = -ESRCH; goto out_irq; goto out_irq; } } double_rq_lock(rq, p_rq); double_rq_lock(rq, p_rq); if (task_rq(p) != p_rq) { if (task_rq(p) != p_rq) { double_rq_unlock(rq, p_rq); double_rq_unlock(rq, p_rq); goto again; goto again; } } if (!curr->sched_class->yield_to_task) if (!curr->sched_class->yield_to_task) goto out_unlock; goto out_unlock; if (curr->sched_class != p->sched_class) if (curr->sched_class != p->sched_class) goto out_unlock; goto out_unlock; if (task_on_cpu(p_rq, p) || !task_is_running(p)) if (task_on_cpu(p_rq, p) || !task_is_running(p)) goto out_unlock; goto out_unlock; yielded = curr->sched_class->yield_to_task(rq, p); yielded = curr->sched_class->yield_to_task(rq, p); if (yielded) { if (yielded) { schedstat_inc(rq->yld_count); schedstat_inc(rq->yld_count); /* /* * Make p's CPU reschedule; pick_next_entity * Make p's CPU reschedule; pick_next_entity * fairness. * fairness. */ */ if (preempt && rq != p_rq) if (preempt && rq != p_rq) resched_curr(p_rq); resched_curr(p_rq); } } out_unlock: out_unlock: double_rq_unlock(rq, p_rq); double_rq_unlock(rq, p_rq); out_irq: out_irq: local_irq_restore(flags); local_irq_restore(flags); if (yielded > 0) if (yielded > 0) schedule(); schedule(); return yielded; return yielded; } } EXPORT_SYMBOL_GPL(yield_to); EXPORT_SYMBOL_GPL(yield_to); int io_schedule_prepare(void) int io_schedule_prepare(void) { { int old_iowait = current->in_iowait; int old_iowait = current->in_iowait; current->in_iowait = 1; current->in_iowait = 1; blk_flush_plug(current->plug, true); blk_flush_plug(current->plug, true); return old_iowait; return old_iowait; } } void io_schedule_finish(int token) void io_schedule_finish(int token) { { current->in_iowait = token; current->in_iowait = token; } } /* /* * This task is about to go to sleep on IO. Increment rq->nr_ * This task is about to go to sleep on IO. 
 * Increment rq->nr_iowait so that process accounting knows that
 * this is a task in IO wait state.
 */
long __sched io_schedule_timeout(long timeout)
{
	int token;
	long ret;

	token = io_schedule_prepare();
	ret = schedule_timeout(timeout);
	io_schedule_finish(token);

	return ret;
}
EXPORT_SYMBOL(io_schedule_timeout);

void __sched io_schedule(void)
{
	int token;

	token = io_schedule_prepare();
	schedule();
	io_schedule_finish(token);
}
EXPORT_SYMBOL(io_schedule);

/**
 * sys_sched_get_priority_max - return maximum RT priority.
 * @policy: scheduling class.
 *
 * Return: On success, this syscall returns the maximum
 * rt_priority that can be used by a given scheduling class.
 * On failure, a negative error code is returned.
 */
SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
{
	int ret = -EINVAL;

	switch (policy) {
	case SCHED_FIFO:
	case SCHED_RR:
		ret = MAX_RT_PRIO-1;
		break;
	case SCHED_DEADLINE:
	case SCHED_NORMAL:
	case SCHED_BATCH:
	case SCHED_IDLE:
		ret = 0;
		break;
	}
	return ret;
}
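/*
 * Illustrative sketch (not part of this file): userspace is expected to query
 * the valid priority range with this syscall pair instead of hard-coding
 * 1..99 before filling in struct sched_param.
 *
 *	#include <sched.h>
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		printf("SCHED_FIFO:  %d..%d\n",
 *		       sched_get_priority_min(SCHED_FIFO),
 *		       sched_get_priority_max(SCHED_FIFO));
 *		printf("SCHED_OTHER: %d..%d\n",
 *		       sched_get_priority_min(SCHED_OTHER),
 *		       sched_get_priority_max(SCHED_OTHER));
 *		return 0;
 *	}
 *
 * For the RT classes this prints 1..99 (MAX_RT_PRIO-1); for the fair and
 * deadline classes both bounds are 0, since rt_priority is unused there.
 */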
/**
 * sys_sched_get_priority_min - return minimum RT priority.
 * @policy: scheduling class.
 *
 * Return: On success, this syscall returns the minimum
 * rt_priority that can be used by a given scheduling class.
 * On failure, a negative error code is returned.
 */
SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
{
	int ret = -EINVAL;

	switch (policy) {
	case SCHED_FIFO:
	case SCHED_RR:
		ret = 1;
		break;
	case SCHED_DEADLINE:
	case SCHED_NORMAL:
	case SCHED_BATCH:
	case SCHED_IDLE:
		ret = 0;
	}
	return ret;
}

static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
{
	struct task_struct *p;
	unsigned int time_slice;
	struct rq_flags rf;
	struct rq *rq;
	int retval;

	if (pid < 0)
		return -EINVAL;

	retval = -ESRCH;
	rcu_read_lock();
	p = find_process_by_pid(pid);
	if (!p)
		goto out_unlock;

	retval = security_task_getscheduler(p);
	if (retval)
		goto out_unlock;

	rq = task_rq_lock(p, &rf);
	time_slice = 0;
	if (p->sched_class->get_rr_interval)
		time_slice = p->sched_class->get_rr_interval(rq, p);
	task_rq_unlock(rq, p, &rf);

	rcu_read_unlock();
	jiffies_to_timespec64(time_slice, t);
	return 0;

out_unlock:
	rcu_read_unlock();
	return retval;
}
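/*
 * Illustrative userspace sketch, not part of the original file: reading the
 * round-robin timeslice that the syscall below exposes, through the libc
 * sched_rr_get_interval() wrapper. A zero timeslice means the task's policy
 * has no timeslice (e.g. SCHED_FIFO). The helper name is made up.
 */
#if 0
#include <sched.h>
#include <stdio.h>
#include <time.h>

int example_print_rr_interval(pid_t pid)
{
	struct timespec ts;

	if (sched_rr_get_interval(pid, &ts))
		return -1;

	printf("RR timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}
#endif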
/**
 * sys_sched_rr_get_interval - return the default timeslice of a process.
 * @pid: pid of the process.
 * @interval: userspace pointer to the timeslice value.
 *
 * this syscall writes the default timeslice value of a given process
 * into the user-space timespec buffer. A value of '0' means infinity.
 *
 * Return: On success, 0 and the timeslice is in @interval. Otherwise,
 * an error code.
 */
SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
		struct __kernel_timespec __user *, interval)
{
	struct timespec64 t;
	int retval = sched_rr_get_interval(pid, &t);

	if (retval == 0)
		retval = put_timespec64(&t, interval);

	return retval;
}

#ifdef CONFIG_COMPAT_32BIT_TIME
SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
		struct old_timespec32 __user *, interval)
{
	struct timespec64 t;
	int retval = sched_rr_get_interval(pid, &t);

	if (retval == 0)
		retval = put_old_timespec32(&t, interval);
	return retval;
}
#endif

void sched_show_task(struct task_struct *p)
{
	unsigned long free = 0;
	int ppid;

	if (!try_get_task_stack(p))
		return;

	pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));

	if (task_is_running(p))
		pr_cont("  running task    ");
#ifdef CONFIG_DEBUG_STACK_USAGE
	free = stack_not_used(p);
#endif
	ppid = 0;
	rcu_read_lock();
	if (pid_alive(p))
		ppid = task_pid_nr(rcu_dereference(p->real_parent));
	rcu_read_unlock();
	pr_cont(" stack:%-5lu pid:%-5d ppid:%-6d flags:0x%08lx\n",
		free, task_pid_nr(p), ppid,
		read_task_thread_flags(p));

	print_worker_info(KERN_INFO, p);
	print_stop_info(KERN_INFO, p);
	show_stack(p, NULL, KERN_INFO);
	put_task_stack(p);
}
EXPORT_SYMBOL_GPL(sched_show_task);
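/*
 * Illustrative sketch, not part of the original file: a hypothetical debug
 * hook (e.g. in a watchdog-style module) that dumps one specific task with
 * sched_show_task() after some timeout has expired. Reference counting of
 * the task is assumed to be handled by the caller.
 */
#if 0
static void example_report_stuck_task(struct task_struct *p)
{
	pr_warn("task %s/%d appears stuck:\n", p->comm, task_pid_nr(p));
	sched_show_task(p);	/* prints state, stack usage, pid/ppid and backtrace */
}
#endif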
static inline bool
state_filter_match(unsigned long state_filter, struct task_struct *p)
{
	unsigned int state = READ_ONCE(p->__state);

	/* no filter, everything matches */
	if (!state_filter)
		return true;

	/* filter, but doesn't match */
	if (!(state & state_filter))
		return false;

	/*
	 * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
	 * TASK_KILLABLE).
	 */
	if (state_filter == TASK_UNINTERRUPTIBLE && (state & TASK_NOLOAD))
		return false;

	return true;
}

void show_state_filter(unsigned int state_filter)
{
	struct task_struct *g, *p;

	rcu_read_lock();
	for_each_process_thread(g, p) {
		/*
		 * reset the NMI-timeout, listing all files on a slow
		 * console might take a lot of time:
		 * Also, reset softlockup watchdogs on all CPUs, because
		 * another CPU might be blocked waiting for us to process
		 * an IPI.
		 */
		touch_nmi_watchdog();
		touch_all_softlockup_watchdogs();
		if (state_filter_match(state_filter, p))
			sched_show_task(p);
	}

#ifdef CONFIG_SCHED_DEBUG
	if (!state_filter)
		sysrq_sched_debug_show();
#endif

	rcu_read_unlock();
	/*
	 * Only show locks if all tasks are dumped:
	 */
	if (!state_filter)
		debug_show_all_locks();
}

/**
 * init_idle - set up an idle thread for a given CPU
 * @idle: task in question
 * @cpu: CPU the idle task belongs to
 *
 * NOTE: this function does not set the idle thread's NEED_RESCHED
 * flag, to make booting more robust.
 */
void __init init_idle(struct task_struct *idle, int cpu)
{
#ifdef CONFIG_SMP
	struct affinity_context ac = (struct affinity_context) {
		.new_mask  = cpumask_of(cpu),
		.flags     = 0,
	};
#endif
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	__sched_fork(0, idle);

	raw_spin_lock_irqsave(&idle->pi_lock, flags);
	raw_spin_rq_lock(rq);

	idle->__state = TASK_RUNNING;
	idle->se.exec_start = sched_clock();
	/*
	 * PF_KTHREAD should already be set at this point; regardless, make it
	 * look like a proper per-CPU kthread.
*/ */ idle->flags |= PF_KTHREAD | PF_NO_SETAFFINITY; idle->flags |= PF_KTHREAD | PF_NO_SETAFFINITY; kthread_set_per_cpu(idle, cpu); kthread_set_per_cpu(idle, cpu); #ifdef CONFIG_SMP #ifdef CONFIG_SMP /* /* * It's possible that init_idle() gets called multipl * It's possible that init_idle() gets called multipl * in that case do_set_cpus_allowed() will not do the * in that case do_set_cpus_allowed() will not do the * * * And since this is boot we can forgo the serializat * And since this is boot we can forgo the serializat */ */ set_cpus_allowed_common(idle, &ac); set_cpus_allowed_common(idle, &ac); #endif #endif /* /* * We're having a chicken and egg problem, even thoug * We're having a chicken and egg problem, even thoug * holding rq->lock, the CPU isn't yet set to this CP * holding rq->lock, the CPU isn't yet set to this CP * lockdep check in task_group() will fail. * lockdep check in task_group() will fail. * * * Similar case to sched_fork(). / Alternatively we c * Similar case to sched_fork(). / Alternatively we c * use task_rq_lock() here and obtain the other rq->l * use task_rq_lock() here and obtain the other rq->l * * * Silence PROVE_RCU * Silence PROVE_RCU */ */ rcu_read_lock(); rcu_read_lock(); __set_task_cpu(idle, cpu); __set_task_cpu(idle, cpu); rcu_read_unlock(); rcu_read_unlock(); rq->idle = idle; rq->idle = idle; rcu_assign_pointer(rq->curr, idle); rcu_assign_pointer(rq->curr, idle); idle->on_rq = TASK_ON_RQ_QUEUED; idle->on_rq = TASK_ON_RQ_QUEUED; #ifdef CONFIG_SMP #ifdef CONFIG_SMP idle->on_cpu = 1; idle->on_cpu = 1; #endif #endif raw_spin_rq_unlock(rq); raw_spin_rq_unlock(rq); raw_spin_unlock_irqrestore(&idle->pi_lock, flags); raw_spin_unlock_irqrestore(&idle->pi_lock, flags); /* Set the preempt count _outside_ the spinlocks! */ /* Set the preempt count _outside_ the spinlocks! */ init_idle_preempt_count(idle, cpu); init_idle_preempt_count(idle, cpu); /* /* * The idle tasks have their own, simple scheduling c * The idle tasks have their own, simple scheduling c */ */ idle->sched_class = &idle_sched_class; idle->sched_class = &idle_sched_class; ftrace_graph_init_idle_task(idle, cpu); ftrace_graph_init_idle_task(idle, cpu); vtime_init_idle(idle, cpu); vtime_init_idle(idle, cpu); #ifdef CONFIG_SMP #ifdef CONFIG_SMP sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); #endif #endif } } #ifdef CONFIG_SMP #ifdef CONFIG_SMP int cpuset_cpumask_can_shrink(const struct cpumask *cur, int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial) const struct cpumask *trial) { { int ret = 1; int ret = 1; if (cpumask_empty(cur)) if (cpumask_empty(cur)) return ret; return ret; ret = dl_cpuset_cpumask_can_shrink(cur, trial); ret = dl_cpuset_cpumask_can_shrink(cur, trial); return ret; return ret; } } int task_can_attach(struct task_struct *p) int task_can_attach(struct task_struct *p) { { int ret = 0; int ret = 0; /* /* * Kthreads which disallow setaffinity shouldn't be m * Kthreads which disallow setaffinity shouldn't be m * to a new cpuset; we don't want to change their CPU * to a new cpuset; we don't want to change their CPU * affinity and isolating such threads by their set o * affinity and isolating such threads by their set o * allowed nodes is unnecessary. Thus, cpusets are n * allowed nodes is unnecessary. Thus, cpusets are n * applicable for such threads. This prevents checki * applicable for such threads. 
This prevents checki * success of set_cpus_allowed_ptr() on all attached * success of set_cpus_allowed_ptr() on all attached * before cpus_mask may be changed. * before cpus_mask may be changed. */ */ if (p->flags & PF_NO_SETAFFINITY) if (p->flags & PF_NO_SETAFFINITY) ret = -EINVAL; ret = -EINVAL; return ret; return ret; } } bool sched_smp_initialized __read_mostly; bool sched_smp_initialized __read_mostly; #ifdef CONFIG_NUMA_BALANCING #ifdef CONFIG_NUMA_BALANCING /* Migrate current task p to target_cpu */ /* Migrate current task p to target_cpu */ int migrate_task_to(struct task_struct *p, int target_cpu) int migrate_task_to(struct task_struct *p, int target_cpu) { { struct migration_arg arg = { p, target_cpu }; struct migration_arg arg = { p, target_cpu }; int curr_cpu = task_cpu(p); int curr_cpu = task_cpu(p); if (curr_cpu == target_cpu) if (curr_cpu == target_cpu) return 0; return 0; if (!cpumask_test_cpu(target_cpu, p->cpus_ptr)) if (!cpumask_test_cpu(target_cpu, p->cpus_ptr)) return -EINVAL; return -EINVAL; /* TODO: This is not properly updating schedstats */ /* TODO: This is not properly updating schedstats */ trace_sched_move_numa(p, curr_cpu, target_cpu); trace_sched_move_numa(p, curr_cpu, target_cpu); return stop_one_cpu(curr_cpu, migration_cpu_stop, &ar return stop_one_cpu(curr_cpu, migration_cpu_stop, &ar } } /* /* * Requeue a task on a given node and accurately track the nu * Requeue a task on a given node and accurately track the nu * tasks on the runqueues * tasks on the runqueues */ */ void sched_setnuma(struct task_struct *p, int nid) void sched_setnuma(struct task_struct *p, int nid) { { bool queued, running; bool queued, running; struct rq_flags rf; struct rq_flags rf; struct rq *rq; struct rq *rq; rq = task_rq_lock(p, &rf); rq = task_rq_lock(p, &rf); queued = task_on_rq_queued(p); queued = task_on_rq_queued(p); running = task_current(rq, p); running = task_current(rq, p); if (queued) if (queued) dequeue_task(rq, p, DEQUEUE_SAVE); dequeue_task(rq, p, DEQUEUE_SAVE); if (running) if (running) put_prev_task(rq, p); put_prev_task(rq, p); p->numa_preferred_nid = nid; p->numa_preferred_nid = nid; if (queued) if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE if (running) if (running) set_next_task(rq, p); set_next_task(rq, p); task_rq_unlock(rq, p, &rf); task_rq_unlock(rq, p, &rf); } } #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_HOTPLUG_CPU #ifdef CONFIG_HOTPLUG_CPU /* /* * Ensure that the idle task is using init_mm right before it * Ensure that the idle task is using init_mm right before it * offline. * offline. 
*/ */ void idle_task_exit(void) void idle_task_exit(void) { { struct mm_struct *mm = current->active_mm; struct mm_struct *mm = current->active_mm; BUG_ON(cpu_online(smp_processor_id())); BUG_ON(cpu_online(smp_processor_id())); BUG_ON(current != this_rq()->idle); BUG_ON(current != this_rq()->idle); if (mm != &init_mm) { if (mm != &init_mm) { switch_mm(mm, &init_mm, current); switch_mm(mm, &init_mm, current); finish_arch_post_lock_switch(); finish_arch_post_lock_switch(); } } /* finish_cpu(), as ran on the BP, will clean up the /* finish_cpu(), as ran on the BP, will clean up the } } static int __balance_push_cpu_stop(void *arg) static int __balance_push_cpu_stop(void *arg) { { struct task_struct *p = arg; struct task_struct *p = arg; struct rq *rq = this_rq(); struct rq *rq = this_rq(); struct rq_flags rf; struct rq_flags rf; int cpu; int cpu; raw_spin_lock_irq(&p->pi_lock); raw_spin_lock_irq(&p->pi_lock); rq_lock(rq, &rf); rq_lock(rq, &rf); update_rq_clock(rq); update_rq_clock(rq); if (task_rq(p) == rq && task_on_rq_queued(p)) { if (task_rq(p) == rq && task_on_rq_queued(p)) { cpu = select_fallback_rq(rq->cpu, p); cpu = select_fallback_rq(rq->cpu, p); rq = __migrate_task(rq, &rf, p, cpu); rq = __migrate_task(rq, &rf, p, cpu); } } rq_unlock(rq, &rf); rq_unlock(rq, &rf); raw_spin_unlock_irq(&p->pi_lock); raw_spin_unlock_irq(&p->pi_lock); put_task_struct(p); put_task_struct(p); return 0; return 0; } } static DEFINE_PER_CPU(struct cpu_stop_work, push_work); static DEFINE_PER_CPU(struct cpu_stop_work, push_work); /* /* * Ensure we only run per-cpu kthreads once the CPU goes !act * Ensure we only run per-cpu kthreads once the CPU goes !act * * * This is enabled below SCHED_AP_ACTIVE; when !cpu_active(), * This is enabled below SCHED_AP_ACTIVE; when !cpu_active(), * effective when the hotplug motion is down. * effective when the hotplug motion is down. */ */ static void balance_push(struct rq *rq) static void balance_push(struct rq *rq) { { struct task_struct *push_task = rq->curr; struct task_struct *push_task = rq->curr; lockdep_assert_rq_held(rq); lockdep_assert_rq_held(rq); /* /* * Ensure the thing is persistent until balance_push_ * Ensure the thing is persistent until balance_push_ */ */ rq->balance_callback = &balance_push_callback; rq->balance_callback = &balance_push_callback; /* /* * Only active while going offline and when invoked o * Only active while going offline and when invoked o * CPU. * CPU. */ */ if (!cpu_dying(rq->cpu) || rq != this_rq()) if (!cpu_dying(rq->cpu) || rq != this_rq()) return; return; /* /* * Both the cpu-hotplug and stop task are in this cas * Both the cpu-hotplug and stop task are in this cas * required to complete the hotplug process. * required to complete the hotplug process. */ */ if (kthread_is_per_cpu(push_task) || if (kthread_is_per_cpu(push_task) || is_migration_disabled(push_task)) { is_migration_disabled(push_task)) { /* /* * If this is the idle task on the outgoing C * If this is the idle task on the outgoing C * up the hotplug control thread which might * up the hotplug control thread which might * last task to vanish. The rcuwait_active() * last task to vanish. 
The rcuwait_active() * accurate here because the waiter is pinned * accurate here because the waiter is pinned * and can't obviously be running in parallel * and can't obviously be running in parallel * * * On RT kernels this also has to check wheth * On RT kernels this also has to check wheth * pinned and scheduled out tasks on the runq * pinned and scheduled out tasks on the runq * need to leave the migrate disabled section * need to leave the migrate disabled section */ */ if (!rq->nr_running && !rq_has_pinned_tasks(r if (!rq->nr_running && !rq_has_pinned_tasks(r rcuwait_active(&rq->hotplug_wait)) { rcuwait_active(&rq->hotplug_wait)) { raw_spin_rq_unlock(rq); raw_spin_rq_unlock(rq); rcuwait_wake_up(&rq->hotplug_wait); rcuwait_wake_up(&rq->hotplug_wait); raw_spin_rq_lock(rq); raw_spin_rq_lock(rq); } } return; return; } } get_task_struct(push_task); get_task_struct(push_task); /* /* * Temporarily drop rq->lock such that we can wake-up * Temporarily drop rq->lock such that we can wake-up * Both preemption and IRQs are still disabled. * Both preemption and IRQs are still disabled. */ */ preempt_disable(); preempt_disable(); raw_spin_rq_unlock(rq); raw_spin_rq_unlock(rq); stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, this_cpu_ptr(&push_work)); this_cpu_ptr(&push_work)); preempt_enable(); preempt_enable(); /* /* * At this point need_resched() is true and we'll tak * At this point need_resched() is true and we'll tak * schedule(). The next pick is obviously going to be * schedule(). The next pick is obviously going to be * which kthread_is_per_cpu() and will push this task * which kthread_is_per_cpu() and will push this task */ */ raw_spin_rq_lock(rq); raw_spin_rq_lock(rq); } } static void balance_push_set(int cpu, bool on) static void balance_push_set(int cpu, bool on) { { struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu); struct rq_flags rf; struct rq_flags rf; rq_lock_irqsave(rq, &rf); rq_lock_irqsave(rq, &rf); if (on) { if (on) { WARN_ON_ONCE(rq->balance_callback); WARN_ON_ONCE(rq->balance_callback); rq->balance_callback = &balance_push_callback rq->balance_callback = &balance_push_callback } else if (rq->balance_callback == &balance_push_call } else if (rq->balance_callback == &balance_push_call rq->balance_callback = NULL; rq->balance_callback = NULL; } } rq_unlock_irqrestore(rq, &rf); rq_unlock_irqrestore(rq, &rf); } } /* /* * Invoked from a CPUs hotplug control thread after the CPU h * Invoked from a CPUs hotplug control thread after the CPU h * inactive. All tasks which are not per CPU kernel threads a * inactive. All tasks which are not per CPU kernel threads a * pushed off this CPU now via balance_push() or placed on a * pushed off this CPU now via balance_push() or placed on a * during wakeup. Wait until the CPU is quiescent. * during wakeup. Wait until the CPU is quiescent. 
*/ */ static void balance_hotplug_wait(void) static void balance_hotplug_wait(void) { { struct rq *rq = this_rq(); struct rq *rq = this_rq(); rcuwait_wait_event(&rq->hotplug_wait, rcuwait_wait_event(&rq->hotplug_wait, rq->nr_running == 1 && !rq_has_pin rq->nr_running == 1 && !rq_has_pin TASK_UNINTERRUPTIBLE); TASK_UNINTERRUPTIBLE); } } #else #else static inline void balance_push(struct rq *rq) static inline void balance_push(struct rq *rq) { { } } static inline void balance_push_set(int cpu, bool on) static inline void balance_push_set(int cpu, bool on) { { } } static inline void balance_hotplug_wait(void) static inline void balance_hotplug_wait(void) { { } } #endif /* CONFIG_HOTPLUG_CPU */ #endif /* CONFIG_HOTPLUG_CPU */ void set_rq_online(struct rq *rq) void set_rq_online(struct rq *rq) { { if (!rq->online) { if (!rq->online) { const struct sched_class *class; const struct sched_class *class; cpumask_set_cpu(rq->cpu, rq->rd->online); cpumask_set_cpu(rq->cpu, rq->rd->online); rq->online = 1; rq->online = 1; for_each_class(class) { for_each_class(class) { if (class->rq_online) if (class->rq_online) class->rq_online(rq); class->rq_online(rq); } } } } } } void set_rq_offline(struct rq *rq) void set_rq_offline(struct rq *rq) { { if (rq->online) { if (rq->online) { const struct sched_class *class; const struct sched_class *class; update_rq_clock(rq); update_rq_clock(rq); for_each_class(class) { for_each_class(class) { if (class->rq_offline) if (class->rq_offline) class->rq_offline(rq); class->rq_offline(rq); } } cpumask_clear_cpu(rq->cpu, rq->rd->online); cpumask_clear_cpu(rq->cpu, rq->rd->online); rq->online = 0; rq->online = 0; } } } } /* /* * used to mark begin/end of suspend/resume: * used to mark begin/end of suspend/resume: */ */ static int num_cpus_frozen; static int num_cpus_frozen; /* /* * Update cpusets according to cpu_active mask. If cpusets a * Update cpusets according to cpu_active mask. If cpusets a * disabled, cpuset_update_active_cpus() becomes a simple wra * disabled, cpuset_update_active_cpus() becomes a simple wra * around partition_sched_domains(). * around partition_sched_domains(). * * * If we come here as part of a suspend/resume, don't touch c * If we come here as part of a suspend/resume, don't touch c * want to restore it back to its original state upon resume * want to restore it back to its original state upon resume */ */ static void cpuset_cpu_active(void) static void cpuset_cpu_active(void) { { if (cpuhp_tasks_frozen) { if (cpuhp_tasks_frozen) { /* /* * num_cpus_frozen tracks how many CPUs are i * num_cpus_frozen tracks how many CPUs are i * resume sequence. As long as this is not th * resume sequence. As long as this is not th * operation in the resume sequence, just bui * operation in the resume sequence, just bui * domain, ignoring cpusets. * domain, ignoring cpusets. */ */ partition_sched_domains(1, NULL, NULL); partition_sched_domains(1, NULL, NULL); if (--num_cpus_frozen) if (--num_cpus_frozen) return; return; /* /* * This is the last CPU online operation. So * This is the last CPU online operation. So * restore the original sched domains by cons * restore the original sched domains by cons * cpuset configurations. * cpuset configurations. 
*/ */ cpuset_force_rebuild(); cpuset_force_rebuild(); } } cpuset_update_active_cpus(); cpuset_update_active_cpus(); } } static int cpuset_cpu_inactive(unsigned int cpu) static int cpuset_cpu_inactive(unsigned int cpu) { { if (!cpuhp_tasks_frozen) { if (!cpuhp_tasks_frozen) { int ret = dl_bw_check_overflow(cpu); int ret = dl_bw_check_overflow(cpu); if (ret) if (ret) return ret; return ret; cpuset_update_active_cpus(); cpuset_update_active_cpus(); } else { } else { num_cpus_frozen++; num_cpus_frozen++; partition_sched_domains(1, NULL, NULL); partition_sched_domains(1, NULL, NULL); } } return 0; return 0; } } int sched_cpu_activate(unsigned int cpu) int sched_cpu_activate(unsigned int cpu) { { struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu); struct rq_flags rf; struct rq_flags rf; /* /* * Clear the balance_push callback and prepare to sch * Clear the balance_push callback and prepare to sch * regular tasks. * regular tasks. */ */ balance_push_set(cpu, false); balance_push_set(cpu, false); #ifdef CONFIG_SCHED_SMT #ifdef CONFIG_SCHED_SMT /* /* * When going up, increment the number of cores with * When going up, increment the number of cores with */ */ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) if (cpumask_weight(cpu_smt_mask(cpu)) == 2) static_branch_inc_cpuslocked(&sched_smt_prese static_branch_inc_cpuslocked(&sched_smt_prese #endif #endif set_cpu_active(cpu, true); set_cpu_active(cpu, true); if (sched_smp_initialized) { if (sched_smp_initialized) { sched_update_numa(cpu, true); sched_update_numa(cpu, true); sched_domains_numa_masks_set(cpu); sched_domains_numa_masks_set(cpu); cpuset_cpu_active(); cpuset_cpu_active(); } } /* /* * Put the rq online, if not already. This happens: * Put the rq online, if not already. This happens: * * * 1) In the early boot process, because we build the * 1) In the early boot process, because we build the * after all CPUs have been brought up. * after all CPUs have been brought up. * * * 2) At runtime, if cpuset_cpu_active() fails to reb * 2) At runtime, if cpuset_cpu_active() fails to reb * domains. * domains. */ */ rq_lock_irqsave(rq, &rf); rq_lock_irqsave(rq, &rf); if (rq->rd) { if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_online(rq); set_rq_online(rq); } } rq_unlock_irqrestore(rq, &rf); rq_unlock_irqrestore(rq, &rf); return 0; return 0; } } int sched_cpu_deactivate(unsigned int cpu) int sched_cpu_deactivate(unsigned int cpu) { { struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu); struct rq_flags rf; struct rq_flags rf; int ret; int ret; /* /* * Remove CPU from nohz.idle_cpus_mask to prevent par * Remove CPU from nohz.idle_cpus_mask to prevent par * load balancing when not active * load balancing when not active */ */ nohz_balance_exit_idle(rq); nohz_balance_exit_idle(rq); set_cpu_active(cpu, false); set_cpu_active(cpu, false); /* /* * From this point forward, this CPU will refuse to r * From this point forward, this CPU will refuse to r * is not: migrate_disable() or KTHREAD_IS_PER_CPU, a * is not: migrate_disable() or KTHREAD_IS_PER_CPU, a * push those tasks away until this gets cleared, see * push those tasks away until this gets cleared, see * sched_cpu_dying(). * sched_cpu_dying(). 
*/ */ balance_push_set(cpu, true); balance_push_set(cpu, true); /* /* * We've cleared cpu_active_mask / set balance_push, * We've cleared cpu_active_mask / set balance_push, * preempt-disabled and RCU users of this state to go * preempt-disabled and RCU users of this state to go * all new such users will observe it. * all new such users will observe it. * * * Specifically, we rely on ttwu to no longer target * Specifically, we rely on ttwu to no longer target * ttwu_queue_cond() and is_cpu_allowed(). * ttwu_queue_cond() and is_cpu_allowed(). * * * Do sync before park smpboot threads to take care t * Do sync before park smpboot threads to take care t */ */ synchronize_rcu(); synchronize_rcu(); rq_lock_irqsave(rq, &rf); rq_lock_irqsave(rq, &rf); if (rq->rd) { if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); set_rq_offline(rq); } } rq_unlock_irqrestore(rq, &rf); rq_unlock_irqrestore(rq, &rf); #ifdef CONFIG_SCHED_SMT #ifdef CONFIG_SCHED_SMT /* /* * When going down, decrement the number of cores wit * When going down, decrement the number of cores wit */ */ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) if (cpumask_weight(cpu_smt_mask(cpu)) == 2) static_branch_dec_cpuslocked(&sched_smt_prese static_branch_dec_cpuslocked(&sched_smt_prese sched_core_cpu_deactivate(cpu); sched_core_cpu_deactivate(cpu); #endif #endif if (!sched_smp_initialized) if (!sched_smp_initialized) return 0; return 0; sched_update_numa(cpu, false); sched_update_numa(cpu, false); ret = cpuset_cpu_inactive(cpu); ret = cpuset_cpu_inactive(cpu); if (ret) { if (ret) { balance_push_set(cpu, false); balance_push_set(cpu, false); set_cpu_active(cpu, true); set_cpu_active(cpu, true); sched_update_numa(cpu, true); sched_update_numa(cpu, true); return ret; return ret; } } sched_domains_numa_masks_clear(cpu); sched_domains_numa_masks_clear(cpu); return 0; return 0; } } static void sched_rq_cpu_starting(unsigned int cpu) static void sched_rq_cpu_starting(unsigned int cpu) { { struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu); rq->calc_load_update = calc_load_update; rq->calc_load_update = calc_load_update; update_max_interval(); update_max_interval(); } } int sched_cpu_starting(unsigned int cpu) int sched_cpu_starting(unsigned int cpu) { { sched_core_cpu_starting(cpu); sched_core_cpu_starting(cpu); sched_rq_cpu_starting(cpu); sched_rq_cpu_starting(cpu); sched_tick_start(cpu); sched_tick_start(cpu); return 0; return 0; } } #ifdef CONFIG_HOTPLUG_CPU #ifdef CONFIG_HOTPLUG_CPU /* /* * Invoked immediately before the stopper thread is invoked t * Invoked immediately before the stopper thread is invoked t * CPU down completely. At this point all per CPU kthreads ex * CPU down completely. At this point all per CPU kthreads ex * hotplug thread (current) and the stopper thread (inactive) * hotplug thread (current) and the stopper thread (inactive) * either parked or have been unbound from the outgoing CPU. * either parked or have been unbound from the outgoing CPU. * any of those which might be on the way out are gone. * any of those which might be on the way out are gone. * * * If after this point a bound task is being woken on this CP * If after this point a bound task is being woken on this CP * responsible hotplug callback has failed to do it's job. * responsible hotplug callback has failed to do it's job. 
* sched_cpu_dying() will catch it with the appropriate firew * sched_cpu_dying() will catch it with the appropriate firew */ */ int sched_cpu_wait_empty(unsigned int cpu) int sched_cpu_wait_empty(unsigned int cpu) { { balance_hotplug_wait(); balance_hotplug_wait(); return 0; return 0; } } /* /* * Since this CPU is going 'away' for a while, fold any nr_ac * Since this CPU is going 'away' for a while, fold any nr_ac * might have. Called from the CPU stopper task after ensurin * might have. Called from the CPU stopper task after ensurin * stopper is the last running task on the CPU, so nr_active * stopper is the last running task on the CPU, so nr_active * stable. We need to take the teardown thread which is calli * stable. We need to take the teardown thread which is calli * account, so we hand in adjust = 1 to the load calculation. * account, so we hand in adjust = 1 to the load calculation. * * * Also see the comment "Global load-average calculations". * Also see the comment "Global load-average calculations". */ */ static void calc_load_migrate(struct rq *rq) static void calc_load_migrate(struct rq *rq) { { long delta = calc_load_fold_active(rq, 1); long delta = calc_load_fold_active(rq, 1); if (delta) if (delta) atomic_long_add(delta, &calc_load_tasks); atomic_long_add(delta, &calc_load_tasks); } } static void dump_rq_tasks(struct rq *rq, const char *loglvl) static void dump_rq_tasks(struct rq *rq, const char *loglvl) { { struct task_struct *g, *p; struct task_struct *g, *p; int cpu = cpu_of(rq); int cpu = cpu_of(rq); lockdep_assert_rq_held(rq); lockdep_assert_rq_held(rq); printk("%sCPU%d enqueued tasks (%u total):\n", loglvl printk("%sCPU%d enqueued tasks (%u total):\n", loglvl for_each_process_thread(g, p) { for_each_process_thread(g, p) { if (task_cpu(p) != cpu) if (task_cpu(p) != cpu) continue; continue; if (!task_on_rq_queued(p)) if (!task_on_rq_queued(p)) continue; continue; printk("%s\tpid: %d, name: %s\n", loglvl, p-> printk("%s\tpid: %d, name: %s\n", loglvl, p-> } } } } int sched_cpu_dying(unsigned int cpu) int sched_cpu_dying(unsigned int cpu) { { struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu); struct rq_flags rf; struct rq_flags rf; /* Handle pending wakeups and then migrate everything /* Handle pending wakeups and then migrate everything sched_tick_stop(cpu); sched_tick_stop(cpu); rq_lock_irqsave(rq, &rf); rq_lock_irqsave(rq, &rf); if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) { if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) { WARN(true, "Dying CPU not properly vacated!") WARN(true, "Dying CPU not properly vacated!") dump_rq_tasks(rq, KERN_WARNING); dump_rq_tasks(rq, KERN_WARNING); } } rq_unlock_irqrestore(rq, &rf); rq_unlock_irqrestore(rq, &rf); calc_load_migrate(rq); calc_load_migrate(rq); update_max_interval(); update_max_interval(); hrtick_clear(rq); hrtick_clear(rq); sched_core_cpu_dying(cpu); sched_core_cpu_dying(cpu); return 0; return 0; } } #endif #endif void __init sched_init_smp(void) void __init sched_init_smp(void) { { sched_init_numa(NUMA_NO_NODE); sched_init_numa(NUMA_NO_NODE); /* /* * There's no userspace yet to cause hotplug operatio * There's no userspace yet to cause hotplug operatio * CPU masks are stable and all blatant races in the * CPU masks are stable and all blatant races in the * happen. * happen. 
*/ */ mutex_lock(&sched_domains_mutex); mutex_lock(&sched_domains_mutex); sched_init_domains(cpu_active_mask); sched_init_domains(cpu_active_mask); mutex_unlock(&sched_domains_mutex); mutex_unlock(&sched_domains_mutex); /* Move init over to a non-isolated CPU */ /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, housekeeping_cpumas if (set_cpus_allowed_ptr(current, housekeeping_cpumas BUG(); BUG(); current->flags &= ~PF_NO_SETAFFINITY; current->flags &= ~PF_NO_SETAFFINITY; sched_init_granularity(); sched_init_granularity(); init_sched_rt_class(); init_sched_rt_class(); init_sched_dl_class(); init_sched_dl_class(); sched_smp_initialized = true; sched_smp_initialized = true; } } static int __init migration_init(void) static int __init migration_init(void) { { sched_cpu_starting(smp_processor_id()); sched_cpu_starting(smp_processor_id()); return 0; return 0; } } early_initcall(migration_init); early_initcall(migration_init); #else #else void __init sched_init_smp(void) void __init sched_init_smp(void) { { sched_init_granularity(); sched_init_granularity(); } } #endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */ int in_sched_functions(unsigned long addr) int in_sched_functions(unsigned long addr) { { return in_lock_functions(addr) || return in_lock_functions(addr) || (addr >= (unsigned long)__sched_text_start (addr >= (unsigned long)__sched_text_start && addr < (unsigned long)__sched_text_end); && addr < (unsigned long)__sched_text_end); } } #ifdef CONFIG_CGROUP_SCHED #ifdef CONFIG_CGROUP_SCHED /* /* * Default task group. * Default task group. * Every task in system belongs to this group at bootup. * Every task in system belongs to this group at bootup. */ */ struct task_group root_task_group; struct task_group root_task_group; LIST_HEAD(task_groups); LIST_HEAD(task_groups); /* Cacheline aligned slab cache for task_group */ /* Cacheline aligned slab cache for task_group */ static struct kmem_cache *task_group_cache __read_mostly; static struct kmem_cache *task_group_cache __read_mostly; #endif #endif void __init sched_init(void) void __init sched_init(void) { { unsigned long ptr = 0; unsigned long ptr = 0; int i; int i; /* Make sure the linker didn't screw up */ /* Make sure the linker didn't screw up */ BUG_ON(&idle_sched_class != &fair_sched_class + 1 || BUG_ON(&idle_sched_class != &fair_sched_class + 1 || &fair_sched_class != &rt_sched_class + 1 || &fair_sched_class != &rt_sched_class + 1 || &rt_sched_class != &dl_sched_class + 1); &rt_sched_class != &dl_sched_class + 1); #ifdef CONFIG_SMP #ifdef CONFIG_SMP BUG_ON(&dl_sched_class != &stop_sched_class + 1); BUG_ON(&dl_sched_class != &stop_sched_class + 1); #endif #endif wait_bit_init(); wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED ptr += 2 * nr_cpu_ids * sizeof(void **); ptr += 2 * nr_cpu_ids * sizeof(void **); #endif #endif #ifdef CONFIG_RT_GROUP_SCHED #ifdef CONFIG_RT_GROUP_SCHED ptr += 2 * nr_cpu_ids * sizeof(void **); ptr += 2 * nr_cpu_ids * sizeof(void **); #endif #endif if (ptr) { if (ptr) { ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT) ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT) #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.se = (struct sched_entity **) root_task_group.se = (struct sched_entity **) ptr += nr_cpu_ids * sizeof(void **); ptr += nr_cpu_ids * sizeof(void **); root_task_group.cfs_rq = (struct cfs_rq **)pt root_task_group.cfs_rq = (struct cfs_rq **)pt ptr += nr_cpu_ids * sizeof(void **); ptr += nr_cpu_ids * sizeof(void **); 
root_task_group.shares = ROOT_TASK_GROUP_LOAD root_task_group.shares = ROOT_TASK_GROUP_LOAD init_cfs_bandwidth(&root_task_group.cfs_bandw init_cfs_bandwidth(&root_task_group.cfs_bandw #endif /* CONFIG_FAIR_GROUP_SCHED */ #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED #ifdef CONFIG_RT_GROUP_SCHED root_task_group.rt_se = (struct sched_rt_enti root_task_group.rt_se = (struct sched_rt_enti ptr += nr_cpu_ids * sizeof(void **); ptr += nr_cpu_ids * sizeof(void **); root_task_group.rt_rq = (struct rt_rq **)ptr; root_task_group.rt_rq = (struct rt_rq **)ptr; ptr += nr_cpu_ids * sizeof(void **); ptr += nr_cpu_ids * sizeof(void **); #endif /* CONFIG_RT_GROUP_SCHED */ #endif /* CONFIG_RT_GROUP_SCHED */ } } init_rt_bandwidth(&def_rt_bandwidth, global_rt_period init_rt_bandwidth(&def_rt_bandwidth, global_rt_period #ifdef CONFIG_SMP #ifdef CONFIG_SMP init_defrootdomain(); init_defrootdomain(); #endif #endif #ifdef CONFIG_RT_GROUP_SCHED #ifdef CONFIG_RT_GROUP_SCHED init_rt_bandwidth(&root_task_group.rt_bandwidth, init_rt_bandwidth(&root_task_group.rt_bandwidth, global_rt_period(), global_rt_runtime global_rt_period(), global_rt_runtime #endif /* CONFIG_RT_GROUP_SCHED */ #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_CGROUP_SCHED #ifdef CONFIG_CGROUP_SCHED task_group_cache = KMEM_CACHE(task_group, 0); task_group_cache = KMEM_CACHE(task_group, 0); list_add(&root_task_group.list, &task_groups); list_add(&root_task_group.list, &task_groups); INIT_LIST_HEAD(&root_task_group.children); INIT_LIST_HEAD(&root_task_group.children); INIT_LIST_HEAD(&root_task_group.siblings); INIT_LIST_HEAD(&root_task_group.siblings); autogroup_init(&init_task); autogroup_init(&init_task); #endif /* CONFIG_CGROUP_SCHED */ #endif /* CONFIG_CGROUP_SCHED */ for_each_possible_cpu(i) { for_each_possible_cpu(i) { struct rq *rq; struct rq *rq; rq = cpu_rq(i); rq = cpu_rq(i); raw_spin_lock_init(&rq->__lock); raw_spin_lock_init(&rq->__lock); rq->nr_running = 0; rq->nr_running = 0; rq->calc_load_active = 0; rq->calc_load_active = 0; rq->calc_load_update = jiffies + LOAD_FREQ; rq->calc_load_update = jiffies + LOAD_FREQ; init_cfs_rq(&rq->cfs); init_cfs_rq(&rq->cfs); init_rt_rq(&rq->rt); init_rt_rq(&rq->rt); init_dl_rq(&rq->dl); init_dl_rq(&rq->dl); #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; /* /* * How much CPU bandwidth does root_task_grou * How much CPU bandwidth does root_task_grou * * * In case of task-groups formed thr' the cgr * In case of task-groups formed thr' the cgr * gets 100% of the CPU resources in the syst * gets 100% of the CPU resources in the syst * system CPU resource is divided among the t * system CPU resource is divided among the t * root_task_group and its child task-groups * root_task_group and its child task-groups * based on each entity's (task or task-group * based on each entity's (task or task-group * (se->load.weight). * (se->load.weight). 
* * * In other words, if root_task_group has 10 * In other words, if root_task_group has 10 * 1024) and two child groups A0 and A1 (of w * 1024) and two child groups A0 and A1 (of w * then A0's share of the CPU resource is: * then A0's share of the CPU resource is: * * * A0's bandwidth = 1024 / (10*1024 + 10 * A0's bandwidth = 1024 / (10*1024 + 10 * * * We achieve this by letting root_task_group * We achieve this by letting root_task_group * directly in rq->cfs (i.e root_task_group-> * directly in rq->cfs (i.e root_task_group-> */ */ init_tg_cfs_entry(&root_task_group, &rq->cfs, init_tg_cfs_entry(&root_task_group, &rq->cfs, #endif /* CONFIG_FAIR_GROUP_SCHED */ #endif /* CONFIG_FAIR_GROUP_SCHED */ rq->rt.rt_runtime = def_rt_bandwidth.rt_runti rq->rt.rt_runtime = def_rt_bandwidth.rt_runti #ifdef CONFIG_RT_GROUP_SCHED #ifdef CONFIG_RT_GROUP_SCHED init_tg_rt_entry(&root_task_group, &rq->rt, N init_tg_rt_entry(&root_task_group, &rq->rt, N #endif #endif #ifdef CONFIG_SMP #ifdef CONFIG_SMP rq->sd = NULL; rq->sd = NULL; rq->rd = NULL; rq->rd = NULL; rq->cpu_capacity = rq->cpu_capacity_orig = SC rq->cpu_capacity = rq->cpu_capacity_orig = SC rq->balance_callback = &balance_push_callback rq->balance_callback = &balance_push_callback rq->active_balance = 0; rq->active_balance = 0; rq->next_balance = jiffies; rq->next_balance = jiffies; rq->push_cpu = 0; rq->push_cpu = 0; rq->cpu = i; rq->cpu = i; rq->online = 0; rq->online = 0; rq->idle_stamp = 0; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; rq->avg_idle = 2*sysctl_sched_migration_cost; rq->wake_stamp = jiffies; rq->wake_stamp = jiffies; rq->wake_avg_idle = rq->avg_idle; rq->wake_avg_idle = rq->avg_idle; rq->max_idle_balance_cost = sysctl_sched_migr rq->max_idle_balance_cost = sysctl_sched_migr INIT_LIST_HEAD(&rq->cfs_tasks); INIT_LIST_HEAD(&rq->cfs_tasks); rq_attach_root(rq, &def_root_domain); rq_attach_root(rq, &def_root_domain); #ifdef CONFIG_NO_HZ_COMMON #ifdef CONFIG_NO_HZ_COMMON rq->last_blocked_load_update_tick = jiffies; rq->last_blocked_load_update_tick = jiffies; atomic_set(&rq->nohz_flags, 0); atomic_set(&rq->nohz_flags, 0); INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq); INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq); #endif #endif #ifdef CONFIG_HOTPLUG_CPU #ifdef CONFIG_HOTPLUG_CPU rcuwait_init(&rq->hotplug_wait); rcuwait_init(&rq->hotplug_wait); #endif #endif #endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */ hrtick_rq_init(rq); hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); atomic_set(&rq->nr_iowait, 0); #ifdef CONFIG_SCHED_CORE #ifdef CONFIG_SCHED_CORE rq->core = rq; rq->core = rq; rq->core_pick = NULL; rq->core_pick = NULL; rq->core_enabled = 0; rq->core_enabled = 0; rq->core_tree = RB_ROOT; rq->core_tree = RB_ROOT; rq->core_forceidle_count = 0; rq->core_forceidle_count = 0; rq->core_forceidle_occupation = 0; rq->core_forceidle_occupation = 0; rq->core_forceidle_start = 0; rq->core_forceidle_start = 0; rq->core_cookie = 0UL; rq->core_cookie = 0UL; #endif #endif zalloc_cpumask_var_node(&rq->scratch_mask, GF zalloc_cpumask_var_node(&rq->scratch_mask, GF } } set_load_weight(&init_task, false); set_load_weight(&init_task, false); /* /* * The boot idle thread does lazy MMU switching as we * The boot idle thread does lazy MMU switching as we */ */ mmgrab_lazy_tlb(&init_mm); mmgrab_lazy_tlb(&init_mm); enter_lazy_tlb(&init_mm, current); enter_lazy_tlb(&init_mm, current); /* /* * The idle task doesn't need the kthread struct to f * The idle task doesn't need the kthread struct to f * is dressed up as a per-CPU kthread and thus 
needs * is dressed up as a per-CPU kthread and thus needs * if we want to avoid special-casing it in code that * if we want to avoid special-casing it in code that * kthreads. * kthreads. */ */ WARN_ON(!set_kthread_struct(current)); WARN_ON(!set_kthread_struct(current)); /* /* * Make us the idle thread. Technically, schedule() s * Make us the idle thread. Technically, schedule() s * called from this thread, however somewhere below i * called from this thread, however somewhere below i * but because we are the idle thread, we just pick u * but because we are the idle thread, we just pick u * when this runqueue becomes "idle". * when this runqueue becomes "idle". */ */ init_idle(current, smp_processor_id()); init_idle(current, smp_processor_id()); calc_load_update = jiffies + LOAD_FREQ; calc_load_update = jiffies + LOAD_FREQ; #ifdef CONFIG_SMP #ifdef CONFIG_SMP idle_thread_set_boot_cpu(); idle_thread_set_boot_cpu(); balance_push_set(smp_processor_id(), false); balance_push_set(smp_processor_id(), false); #endif #endif init_sched_fair_class(); init_sched_fair_class(); psi_init(); psi_init(); init_uclamp(); init_uclamp(); preempt_dynamic_init(); preempt_dynamic_init(); scheduler_running = 1; scheduler_running = 1; } } #ifdef CONFIG_DEBUG_ATOMIC_SLEEP #ifdef CONFIG_DEBUG_ATOMIC_SLEEP void __might_sleep(const char *file, int line) void __might_sleep(const char *file, int line) { { unsigned int state = get_current_state(); unsigned int state = get_current_state(); /* /* * Blocking primitives will set (and therefore destro * Blocking primitives will set (and therefore destro * since we will exit with TASK_RUNNING make sure we * since we will exit with TASK_RUNNING make sure we * otherwise we will destroy state. * otherwise we will destroy state. */ */ WARN_ONCE(state != TASK_RUNNING && current->task_stat WARN_ONCE(state != TASK_RUNNING && current->task_stat "do not call blocking ops when !TASK_ "do not call blocking ops when !TASK_ "state=%x set at [<%p>] %pS\n", state "state=%x set at [<%p>] %pS\n", state (void *)current->task_state_change, (void *)current->task_state_change, (void *)current->task_state_change); (void *)current->task_state_change); __might_resched(file, line, 0); __might_resched(file, line, 0); } } EXPORT_SYMBOL(__might_sleep); EXPORT_SYMBOL(__might_sleep); static void print_preempt_disable_ip(int preempt_offset, unsi static void print_preempt_disable_ip(int preempt_offset, unsi { { if (!IS_ENABLED(CONFIG_DEBUG_PREEMPT)) if (!IS_ENABLED(CONFIG_DEBUG_PREEMPT)) return; return; if (preempt_count() == preempt_offset) if (preempt_count() == preempt_offset) return; return; pr_err("Preemption disabled at:"); pr_err("Preemption disabled at:"); print_ip_sym(KERN_ERR, ip); print_ip_sym(KERN_ERR, ip); } } static inline bool resched_offsets_ok(unsigned int offsets) static inline bool resched_offsets_ok(unsigned int offsets) { { unsigned int nested = preempt_count(); unsigned int nested = preempt_count(); nested += rcu_preempt_depth() << MIGHT_RESCHED_RCU_SH nested += rcu_preempt_depth() << MIGHT_RESCHED_RCU_SH return nested == offsets; return nested == offsets; } } void __might_resched(const char *file, int line, unsigned int void __might_resched(const char *file, int line, unsigned int { { /* Ratelimiting timestamp: */ /* Ratelimiting timestamp: */ static unsigned long prev_jiffy; static unsigned long prev_jiffy; unsigned long preempt_disable_ip; unsigned long preempt_disable_ip; /* WARN_ON_ONCE() by default, no rate limit required: /* WARN_ON_ONCE() by default, no rate limit required: 
rcu_sleep_check(); rcu_sleep_check(); if ((resched_offsets_ok(offsets) && !irqs_disabled() if ((resched_offsets_ok(offsets) && !irqs_disabled() !is_idle_task(current) && !current->non_block_co !is_idle_task(current) && !current->non_block_co system_state == SYSTEM_BOOTING || system_state > system_state == SYSTEM_BOOTING || system_state > oops_in_progress) oops_in_progress) return; return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jif if (time_before(jiffies, prev_jiffy + HZ) && prev_jif return; return; prev_jiffy = jiffies; prev_jiffy = jiffies; /* Save this before calling printk(), since that will /* Save this before calling printk(), since that will preempt_disable_ip = get_preempt_disable_ip(current); preempt_disable_ip = get_preempt_disable_ip(current); pr_err("BUG: sleeping function called from invalid co pr_err("BUG: sleeping function called from invalid co file, line); file, line); pr_err("in_atomic(): %d, irqs_disabled(): %d, non_blo pr_err("in_atomic(): %d, irqs_disabled(): %d, non_blo in_atomic(), irqs_disabled(), current->non_blo in_atomic(), irqs_disabled(), current->non_blo current->pid, current->comm); current->pid, current->comm); pr_err("preempt_count: %x, expected: %x\n", preempt_c pr_err("preempt_count: %x, expected: %x\n", preempt_c offsets & MIGHT_RESCHED_PREEMPT_MASK); offsets & MIGHT_RESCHED_PREEMPT_MASK); if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { pr_err("RCU nest depth: %d, expected: %u\n", pr_err("RCU nest depth: %d, expected: %u\n", rcu_preempt_depth(), offsets >> MIGHT_ rcu_preempt_depth(), offsets >> MIGHT_ } } if (task_stack_end_corrupted(current)) if (task_stack_end_corrupted(current)) pr_emerg("Thread overran stack, or stack corr pr_emerg("Thread overran stack, or stack corr debug_show_held_locks(current); debug_show_held_locks(current); if (irqs_disabled()) if (irqs_disabled()) print_irqtrace_events(current); print_irqtrace_events(current); print_preempt_disable_ip(offsets & MIGHT_RESCHED_PREE print_preempt_disable_ip(offsets & MIGHT_RESCHED_PREE preempt_disable_ip); preempt_disable_ip); dump_stack(); dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } } EXPORT_SYMBOL(__might_resched); EXPORT_SYMBOL(__might_resched); void __cant_sleep(const char *file, int line, int preempt_off void __cant_sleep(const char *file, int line, int preempt_off { { static unsigned long prev_jiffy; static unsigned long prev_jiffy; if (irqs_disabled()) if (irqs_disabled()) return; return; if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) return; return; if (preempt_count() > preempt_offset) if (preempt_count() > preempt_offset) return; return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jif if (time_before(jiffies, prev_jiffy + HZ) && prev_jif return; return; prev_jiffy = jiffies; prev_jiffy = jiffies; printk(KERN_ERR "BUG: assuming atomic context at %s:% printk(KERN_ERR "BUG: assuming atomic context at %s:% printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d in_atomic(), irqs_disabled(), in_atomic(), irqs_disabled(), current->pid, current->comm); current->pid, current->comm); debug_show_held_locks(current); debug_show_held_locks(current); dump_stack(); dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } } EXPORT_SYMBOL_GPL(__cant_sleep); EXPORT_SYMBOL_GPL(__cant_sleep); #ifdef CONFIG_SMP #ifdef CONFIG_SMP void __cant_migrate(const char *file, int line) void 
__cant_migrate(const char *file, int line) { { static unsigned long prev_jiffy; static unsigned long prev_jiffy; if (irqs_disabled()) if (irqs_disabled()) return; return; if (is_migration_disabled(current)) if (is_migration_disabled(current)) return; return; if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) return; return; if (preempt_count() > 0) if (preempt_count() > 0) return; return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jif if (time_before(jiffies, prev_jiffy + HZ) && prev_jif return; return; prev_jiffy = jiffies; prev_jiffy = jiffies; pr_err("BUG: assuming non migratable context at %s:%d pr_err("BUG: assuming non migratable context at %s:%d pr_err("in_atomic(): %d, irqs_disabled(): %d, migrati pr_err("in_atomic(): %d, irqs_disabled(): %d, migrati in_atomic(), irqs_disabled(), is_migration_dis in_atomic(), irqs_disabled(), is_migration_dis current->pid, current->comm); current->pid, current->comm); debug_show_held_locks(current); debug_show_held_locks(current); dump_stack(); dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } } EXPORT_SYMBOL_GPL(__cant_migrate); EXPORT_SYMBOL_GPL(__cant_migrate); #endif #endif #endif #endif #ifdef CONFIG_MAGIC_SYSRQ #ifdef CONFIG_MAGIC_SYSRQ void normalize_rt_tasks(void) void normalize_rt_tasks(void) { { struct task_struct *g, *p; struct task_struct *g, *p; struct sched_attr attr = { struct sched_attr attr = { .sched_policy = SCHED_NORMAL, .sched_policy = SCHED_NORMAL, }; }; read_lock(&tasklist_lock); read_lock(&tasklist_lock); for_each_process_thread(g, p) { for_each_process_thread(g, p) { /* /* * Only normalize user tasks: * Only normalize user tasks: */ */ if (p->flags & PF_KTHREAD) if (p->flags & PF_KTHREAD) continue; continue; p->se.exec_start = 0; p->se.exec_start = 0; schedstat_set(p->stats.wait_start, 0); schedstat_set(p->stats.wait_start, 0); schedstat_set(p->stats.sleep_start, 0); schedstat_set(p->stats.sleep_start, 0); schedstat_set(p->stats.block_start, 0); schedstat_set(p->stats.block_start, 0); if (!dl_task(p) && !rt_task(p)) { if (!dl_task(p) && !rt_task(p)) { /* /* * Renice negative nice level userspa * Renice negative nice level userspa * tasks back to 0: * tasks back to 0: */ */ if (task_nice(p) < 0) if (task_nice(p) < 0) set_user_nice(p, 0); set_user_nice(p, 0); continue; continue; } } __sched_setscheduler(p, &attr, false, false); __sched_setscheduler(p, &attr, false, false); } } read_unlock(&tasklist_lock); read_unlock(&tasklist_lock); } } #endif /* CONFIG_MAGIC_SYSRQ */ #endif /* CONFIG_MAGIC_SYSRQ */ #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) /* /* * These functions are only useful for the IA64 MCA handling, * These functions are only useful for the IA64 MCA handling, * * * They can only be called when the whole system has been * They can only be called when the whole system has been * stopped - every CPU needs to be quiescent, and no scheduli * stopped - every CPU needs to be quiescent, and no scheduli * activity can take place. Using them for anything else woul * activity can take place. Using them for anything else woul * be a serious bug, and as a result, they aren't even visibl * be a serious bug, and as a result, they aren't even visibl * under any other configuration. * under any other configuration. */ */ /** /** * curr_task - return the current task for a given CPU. * curr_task - return the current task for a given CPU. * @cpu: the processor in question. 
* @cpu: the processor in question. * * * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! * * * Return: The current task for @cpu. * Return: The current task for @cpu. */ */ struct task_struct *curr_task(int cpu) struct task_struct *curr_task(int cpu) { { return cpu_curr(cpu); return cpu_curr(cpu); } } #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ #ifdef CONFIG_IA64 #ifdef CONFIG_IA64 /** /** * ia64_set_curr_task - set the current task for a given CPU. * ia64_set_curr_task - set the current task for a given CPU. * @cpu: the processor in question. * @cpu: the processor in question. * @p: the task pointer to set. * @p: the task pointer to set. * * * Description: This function must only be used when non-mask * Description: This function must only be used when non-mask * are serviced on a separate stack. It allows the architectu * are serviced on a separate stack. It allows the architectu * notion of the current task on a CPU in a non-blocking mann * notion of the current task on a CPU in a non-blocking mann * must be called with all CPU's synchronized, and interrupts * must be called with all CPU's synchronized, and interrupts * and caller must save the original value of the current tas * and caller must save the original value of the current tas * curr_task() above) and restore that value before reenablin * curr_task() above) and restore that value before reenablin * re-starting the system. * re-starting the system. * * * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! */ */ void ia64_set_curr_task(int cpu, struct task_struct *p) void ia64_set_curr_task(int cpu, struct task_struct *p) { { cpu_curr(cpu) = p; cpu_curr(cpu) = p; } } #endif #endif #ifdef CONFIG_CGROUP_SCHED #ifdef CONFIG_CGROUP_SCHED /* task_group_lock serializes the addition/removal of task gr /* task_group_lock serializes the addition/removal of task gr static DEFINE_SPINLOCK(task_group_lock); static DEFINE_SPINLOCK(task_group_lock); static inline void alloc_uclamp_sched_group(struct task_group static inline void alloc_uclamp_sched_group(struct task_group struct task_group struct task_group { { #ifdef CONFIG_UCLAMP_TASK_GROUP #ifdef CONFIG_UCLAMP_TASK_GROUP enum uclamp_id clamp_id; enum uclamp_id clamp_id; for_each_clamp_id(clamp_id) { for_each_clamp_id(clamp_id) { uclamp_se_set(&tg->uclamp_req[clamp_id], uclamp_se_set(&tg->uclamp_req[clamp_id], uclamp_none(clamp_id), false); uclamp_none(clamp_id), false); tg->uclamp[clamp_id] = parent->uclamp[clamp_i tg->uclamp[clamp_id] = parent->uclamp[clamp_i } } #endif #endif } } static void sched_free_group(struct task_group *tg) static void sched_free_group(struct task_group *tg) { { free_fair_sched_group(tg); free_fair_sched_group(tg); free_rt_sched_group(tg); free_rt_sched_group(tg); autogroup_free(tg); autogroup_free(tg); kmem_cache_free(task_group_cache, tg); kmem_cache_free(task_group_cache, tg); } } static void sched_free_group_rcu(struct rcu_head *rcu) static void sched_free_group_rcu(struct rcu_head *rcu) { { sched_free_group(container_of(rcu, struct task_group, sched_free_group(container_of(rcu, struct task_group, } } static void sched_unregister_group(struct task_group *tg) static void sched_unregister_group(struct task_group *tg) { { unregister_fair_sched_group(tg); unregister_fair_sched_group(tg); unregister_rt_sched_group(tg); unregister_rt_sched_group(tg); /* /* * We have to wait for yet another RCU grace 
period t * We have to wait for yet another RCU grace period t * print_cfs_stats() might run concurrently. * print_cfs_stats() might run concurrently. */ */ call_rcu(&tg->rcu, sched_free_group_rcu); call_rcu(&tg->rcu, sched_free_group_rcu); } } /* allocate runqueue etc for a new task group */ /* allocate runqueue etc for a new task group */ struct task_group *sched_create_group(struct task_group *pare struct task_group *sched_create_group(struct task_group *pare { { struct task_group *tg; struct task_group *tg; tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | if (!tg) if (!tg) return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM); if (!alloc_fair_sched_group(tg, parent)) if (!alloc_fair_sched_group(tg, parent)) goto err; goto err; if (!alloc_rt_sched_group(tg, parent)) if (!alloc_rt_sched_group(tg, parent)) goto err; goto err; alloc_uclamp_sched_group(tg, parent); alloc_uclamp_sched_group(tg, parent); return tg; return tg; err: err: sched_free_group(tg); sched_free_group(tg); return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM); } } void sched_online_group(struct task_group *tg, struct task_gr void sched_online_group(struct task_group *tg, struct task_gr { { unsigned long flags; unsigned long flags; spin_lock_irqsave(&task_group_lock, flags); spin_lock_irqsave(&task_group_lock, flags); list_add_rcu(&tg->list, &task_groups); list_add_rcu(&tg->list, &task_groups); /* Root should already exist: */ /* Root should already exist: */ WARN_ON(!parent); WARN_ON(!parent); tg->parent = parent; tg->parent = parent; INIT_LIST_HEAD(&tg->children); INIT_LIST_HEAD(&tg->children); list_add_rcu(&tg->siblings, &parent->children); list_add_rcu(&tg->siblings, &parent->children); spin_unlock_irqrestore(&task_group_lock, flags); spin_unlock_irqrestore(&task_group_lock, flags); online_fair_sched_group(tg); online_fair_sched_group(tg); } } /* rcu callback to free various structures associated with a /* rcu callback to free various structures associated with a static void sched_unregister_group_rcu(struct rcu_head *rhp) static void sched_unregister_group_rcu(struct rcu_head *rhp) { { /* Now it should be safe to free those cfs_rqs: */ /* Now it should be safe to free those cfs_rqs: */ sched_unregister_group(container_of(rhp, struct task_ sched_unregister_group(container_of(rhp, struct task_ } } void sched_destroy_group(struct task_group *tg) void sched_destroy_group(struct task_group *tg) { { /* Wait for possible concurrent references to cfs_rqs /* Wait for possible concurrent references to cfs_rqs call_rcu(&tg->rcu, sched_unregister_group_rcu); call_rcu(&tg->rcu, sched_unregister_group_rcu); } } void sched_release_group(struct task_group *tg) void sched_release_group(struct task_group *tg) { { unsigned long flags; unsigned long flags; /* /* * Unlink first, to avoid walk_tg_tree_from() from fi * Unlink first, to avoid walk_tg_tree_from() from fi * sched_cfs_period_timer()). * sched_cfs_period_timer()). * * * For this to be effective, we have to wait for all * For this to be effective, we have to wait for all * this task group to leave their RCU critical sectio * this task group to leave their RCU critical sectio * user will see our dying task group any more. Speci * user will see our dying task group any more. 
Speci * that tg_unthrottle_up() won't add decayed cfs_rq's * that tg_unthrottle_up() won't add decayed cfs_rq's * * * We therefore defer calling unregister_fair_sched_g * We therefore defer calling unregister_fair_sched_g * sched_unregister_group() which is guarantied to ge * sched_unregister_group() which is guarantied to ge * current RCU grace period has expired. * current RCU grace period has expired. */ */ spin_lock_irqsave(&task_group_lock, flags); spin_lock_irqsave(&task_group_lock, flags); list_del_rcu(&tg->list); list_del_rcu(&tg->list); list_del_rcu(&tg->siblings); list_del_rcu(&tg->siblings); spin_unlock_irqrestore(&task_group_lock, flags); spin_unlock_irqrestore(&task_group_lock, flags); } } static struct task_group *sched_get_task_group(struct task_st static struct task_group *sched_get_task_group(struct task_st { { struct task_group *tg; struct task_group *tg; /* /* * All callers are synchronized by task_rq_lock(); we * All callers are synchronized by task_rq_lock(); we * which is pointless here. Thus, we pass "true" to t * which is pointless here. Thus, we pass "true" to t * to prevent lockdep warnings. * to prevent lockdep warnings. */ */ tg = container_of(task_css_check(tsk, cpu_cgrp_id, tr tg = container_of(task_css_check(tsk, cpu_cgrp_id, tr struct task_group, css); struct task_group, css); tg = autogroup_task_group(tsk, tg); tg = autogroup_task_group(tsk, tg); return tg; return tg; } } static void sched_change_group(struct task_struct *tsk, struc static void sched_change_group(struct task_struct *tsk, struc { { tsk->sched_task_group = group; tsk->sched_task_group = group; #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED if (tsk->sched_class->task_change_group) if (tsk->sched_class->task_change_group) tsk->sched_class->task_change_group(tsk); tsk->sched_class->task_change_group(tsk); else else #endif #endif set_task_rq(tsk, task_cpu(tsk)); set_task_rq(tsk, task_cpu(tsk)); } } /* /* * Change task's runqueue when it moves between groups. * Change task's runqueue when it moves between groups. * * * The caller of this function should have put the task in it * The caller of this function should have put the task in it * now. This function just updates tsk->se.cfs_rq and tsk->se * now. This function just updates tsk->se.cfs_rq and tsk->se * its new group. * its new group. */ */ void sched_move_task(struct task_struct *tsk) void sched_move_task(struct task_struct *tsk) { { int queued, running, queue_flags = int queued, running, queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK struct task_group *group; struct task_group *group; struct rq_flags rf; struct rq_flags rf; struct rq *rq; struct rq *rq; rq = task_rq_lock(tsk, &rf); rq = task_rq_lock(tsk, &rf); /* /* * Esp. with SCHED_AUTOGROUP enabled it is possible t * Esp. with SCHED_AUTOGROUP enabled it is possible t * group changes. * group changes. 
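	 *
	 * (Illustrative note: sched_get_task_group() folds autogroup into
	 * the lookup, so a move reported by the cgroup core may still
	 * resolve to the task's current effective task_group; the early-out
	 * below then avoids a pointless dequeue/re-enqueue cycle.)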
*/ */ group = sched_get_task_group(tsk); group = sched_get_task_group(tsk); if (group == tsk->sched_task_group) if (group == tsk->sched_task_group) goto unlock; goto unlock; update_rq_clock(rq); update_rq_clock(rq); running = task_current(rq, tsk); running = task_current(rq, tsk); queued = task_on_rq_queued(tsk); queued = task_on_rq_queued(tsk); if (queued) if (queued) dequeue_task(rq, tsk, queue_flags); dequeue_task(rq, tsk, queue_flags); if (running) if (running) put_prev_task(rq, tsk); put_prev_task(rq, tsk); sched_change_group(tsk, group); sched_change_group(tsk, group); if (queued) if (queued) enqueue_task(rq, tsk, queue_flags); enqueue_task(rq, tsk, queue_flags); if (running) { if (running) { set_next_task(rq, tsk); set_next_task(rq, tsk); /* /* * After changing group, the running task may * After changing group, the running task may * throttled one but it's still the running t * throttled one but it's still the running t * resched to make sure that task can still r * resched to make sure that task can still r */ */ resched_curr(rq); resched_curr(rq); } } unlock: unlock: task_rq_unlock(rq, tsk, &rf); task_rq_unlock(rq, tsk, &rf); } } static inline struct task_group *css_tg(struct cgroup_subsys_ static inline struct task_group *css_tg(struct cgroup_subsys_ { { return css ? container_of(css, struct task_group, css return css ? container_of(css, struct task_group, css } } static struct cgroup_subsys_state * static struct cgroup_subsys_state * cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { { struct task_group *parent = css_tg(parent_css); struct task_group *parent = css_tg(parent_css); struct task_group *tg; struct task_group *tg; if (!parent) { if (!parent) { /* This is early initialization for the top c /* This is early initialization for the top c return &root_task_group.css; return &root_task_group.css; } } tg = sched_create_group(parent); tg = sched_create_group(parent); if (IS_ERR(tg)) if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM); return &tg->css; return &tg->css; } } /* Expose task group only after completing cgroup initializat /* Expose task group only after completing cgroup initializat static int cpu_cgroup_css_online(struct cgroup_subsys_state * static int cpu_cgroup_css_online(struct cgroup_subsys_state * { { struct task_group *tg = css_tg(css); struct task_group *tg = css_tg(css); struct task_group *parent = css_tg(css->parent); struct task_group *parent = css_tg(css->parent); if (parent) if (parent) sched_online_group(tg, parent); sched_online_group(tg, parent); #ifdef CONFIG_UCLAMP_TASK_GROUP #ifdef CONFIG_UCLAMP_TASK_GROUP /* Propagate the effective uclamp value for the new g /* Propagate the effective uclamp value for the new g mutex_lock(&uclamp_mutex); mutex_lock(&uclamp_mutex); rcu_read_lock(); rcu_read_lock(); cpu_util_update_eff(css); cpu_util_update_eff(css); rcu_read_unlock(); rcu_read_unlock(); mutex_unlock(&uclamp_mutex); mutex_unlock(&uclamp_mutex); #endif #endif return 0; return 0; } } static void cpu_cgroup_css_released(struct cgroup_subsys_stat static void cpu_cgroup_css_released(struct cgroup_subsys_stat { { struct task_group *tg = css_tg(css); struct task_group *tg = css_tg(css); sched_release_group(tg); sched_release_group(tg); } } static void cpu_cgroup_css_free(struct cgroup_subsys_state *c static void cpu_cgroup_css_free(struct cgroup_subsys_state *c { { struct task_group *tg = css_tg(css); struct task_group *tg = css_tg(css); /* /* * Relies on the RCU 
grace period between css_release * Relies on the RCU grace period between css_release */ */ sched_unregister_group(tg); sched_unregister_group(tg); } } #ifdef CONFIG_RT_GROUP_SCHED #ifdef CONFIG_RT_GROUP_SCHED static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { { struct task_struct *task; struct task_struct *task; struct cgroup_subsys_state *css; struct cgroup_subsys_state *css; cgroup_taskset_for_each(task, css, tset) { cgroup_taskset_for_each(task, css, tset) { if (!sched_rt_can_attach(css_tg(css), task)) if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; return -EINVAL; } } return 0; return 0; } } #endif #endif static void cpu_cgroup_attach(struct cgroup_taskset *tset) static void cpu_cgroup_attach(struct cgroup_taskset *tset) { { struct task_struct *task; struct task_struct *task; struct cgroup_subsys_state *css; struct cgroup_subsys_state *css; cgroup_taskset_for_each(task, css, tset) cgroup_taskset_for_each(task, css, tset) sched_move_task(task); sched_move_task(task); } } #ifdef CONFIG_UCLAMP_TASK_GROUP #ifdef CONFIG_UCLAMP_TASK_GROUP static void cpu_util_update_eff(struct cgroup_subsys_state *c static void cpu_util_update_eff(struct cgroup_subsys_state *c { { struct cgroup_subsys_state *top_css = css; struct cgroup_subsys_state *top_css = css; struct uclamp_se *uc_parent = NULL; struct uclamp_se *uc_parent = NULL; struct uclamp_se *uc_se = NULL; struct uclamp_se *uc_se = NULL; unsigned int eff[UCLAMP_CNT]; unsigned int eff[UCLAMP_CNT]; enum uclamp_id clamp_id; enum uclamp_id clamp_id; unsigned int clamps; unsigned int clamps; lockdep_assert_held(&uclamp_mutex); lockdep_assert_held(&uclamp_mutex); SCHED_WARN_ON(!rcu_read_lock_held()); SCHED_WARN_ON(!rcu_read_lock_held()); css_for_each_descendant_pre(css, top_css) { css_for_each_descendant_pre(css, top_css) { uc_parent = css_tg(css)->parent uc_parent = css_tg(css)->parent ? css_tg(css)->parent->uclamp : NULL; ? css_tg(css)->parent->uclamp : NULL; for_each_clamp_id(clamp_id) { for_each_clamp_id(clamp_id) { /* Assume effective clamps matches re /* Assume effective clamps matches re eff[clamp_id] = css_tg(css)->uclamp_r eff[clamp_id] = css_tg(css)->uclamp_r /* Cap effective clamps with parent's /* Cap effective clamps with parent's if (uc_parent && if (uc_parent && eff[clamp_id] > uc_parent[clamp_i eff[clamp_id] > uc_parent[clamp_i eff[clamp_id] = uc_parent[cla eff[clamp_id] = uc_parent[cla } } } } /* Ensure protection is always capped by limi /* Ensure protection is always capped by limi eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UC eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UC /* Propagate most restrictive effective clamp /* Propagate most restrictive effective clamp clamps = 0x0; clamps = 0x0; uc_se = css_tg(css)->uclamp; uc_se = css_tg(css)->uclamp; for_each_clamp_id(clamp_id) { for_each_clamp_id(clamp_id) { if (eff[clamp_id] == uc_se[clamp_id]. if (eff[clamp_id] == uc_se[clamp_id]. 
continue; continue; uc_se[clamp_id].value = eff[clamp_id] uc_se[clamp_id].value = eff[clamp_id] uc_se[clamp_id].bucket_id = uclamp_bu uc_se[clamp_id].bucket_id = uclamp_bu clamps |= (0x1 << clamp_id); clamps |= (0x1 << clamp_id); } } if (!clamps) { if (!clamps) { css = css_rightmost_descendant(css); css = css_rightmost_descendant(css); continue; continue; } } /* Immediately update descendants RUNNABLE ta /* Immediately update descendants RUNNABLE ta uclamp_update_active_tasks(css); uclamp_update_active_tasks(css); } } } } /* /* * Integer 10^N with a given N exponent by casting to integer * Integer 10^N with a given N exponent by casting to integer * C expression. Since there is no way to convert a macro arg * C expression. Since there is no way to convert a macro arg * character constant, use two levels of macros. * character constant, use two levels of macros. */ */ #define _POW10(exp) ((unsigned int)1e##exp) #define _POW10(exp) ((unsigned int)1e##exp) #define POW10(exp) _POW10(exp) #define POW10(exp) _POW10(exp) struct uclamp_request { struct uclamp_request { #define UCLAMP_PERCENT_SHIFT 2 #define UCLAMP_PERCENT_SHIFT 2 #define UCLAMP_PERCENT_SCALE (100 * POW10(UCLAMP_PERCENT_S #define UCLAMP_PERCENT_SCALE (100 * POW10(UCLAMP_PERCENT_S s64 percent; s64 percent; u64 util; u64 util; int ret; int ret; }; }; static inline struct uclamp_request static inline struct uclamp_request capacity_from_percent(char *buf) capacity_from_percent(char *buf) { { struct uclamp_request req = { struct uclamp_request req = { .percent = UCLAMP_PERCENT_SCALE, .percent = UCLAMP_PERCENT_SCALE, .util = SCHED_CAPACITY_SCALE, .util = SCHED_CAPACITY_SCALE, .ret = 0, .ret = 0, }; }; buf = strim(buf); buf = strim(buf); if (strcmp(buf, "max")) { if (strcmp(buf, "max")) { req.ret = cgroup_parse_float(buf, UCLAMP_PERC req.ret = cgroup_parse_float(buf, UCLAMP_PERC &req.percent); &req.percent); if (req.ret) if (req.ret) return req; return req; if ((u64)req.percent > UCLAMP_PERCENT_SCALE) if ((u64)req.percent > UCLAMP_PERCENT_SCALE) req.ret = -ERANGE; req.ret = -ERANGE; return req; return req; } } req.util = req.percent << SCHED_CAPACITY_SHIF req.util = req.percent << SCHED_CAPACITY_SHIF req.util = DIV_ROUND_CLOSEST_ULL(req.util, UC req.util = DIV_ROUND_CLOSEST_ULL(req.util, UC } } return req; return req; } } static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, size_t nbytes, loff_t off, size_t nbytes, loff_t off, enum uclamp_id clamp_id) enum uclamp_id clamp_id) { { struct uclamp_request req; struct uclamp_request req; struct task_group *tg; struct task_group *tg; req = capacity_from_percent(buf); req = capacity_from_percent(buf); if (req.ret) if (req.ret) return req.ret; return req.ret; static_branch_enable(&sched_uclamp_used); static_branch_enable(&sched_uclamp_used); mutex_lock(&uclamp_mutex); mutex_lock(&uclamp_mutex); rcu_read_lock(); rcu_read_lock(); tg = css_tg(of_css(of)); tg = css_tg(of_css(of)); if (tg->uclamp_req[clamp_id].value != req.util) if (tg->uclamp_req[clamp_id].value != req.util) uclamp_se_set(&tg->uclamp_req[clamp_id], req. uclamp_se_set(&tg->uclamp_req[clamp_id], req. 
/* /* * Because of not recoverable conversion rounding we * Because of not recoverable conversion rounding we * exact requested value * exact requested value */ */ tg->uclamp_pct[clamp_id] = req.percent; tg->uclamp_pct[clamp_id] = req.percent; /* Update effective clamps to track the most restrict /* Update effective clamps to track the most restrict cpu_util_update_eff(of_css(of)); cpu_util_update_eff(of_css(of)); rcu_read_unlock(); rcu_read_unlock(); mutex_unlock(&uclamp_mutex); mutex_unlock(&uclamp_mutex); return nbytes; return nbytes; } } static ssize_t cpu_uclamp_min_write(struct kernfs_open_file * static ssize_t cpu_uclamp_min_write(struct kernfs_open_file * char *buf, size_t nbytes, char *buf, size_t nbytes, loff_t off) loff_t off) { { return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_ return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_ } } static ssize_t cpu_uclamp_max_write(struct kernfs_open_file * static ssize_t cpu_uclamp_max_write(struct kernfs_open_file * char *buf, size_t nbytes, char *buf, size_t nbytes, loff_t off) loff_t off) { { return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_ return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_ } } static inline void cpu_uclamp_print(struct seq_file *sf, static inline void cpu_uclamp_print(struct seq_file *sf, enum uclamp_id clamp_id) enum uclamp_id clamp_id) { { struct task_group *tg; struct task_group *tg; u64 util_clamp; u64 util_clamp; u64 percent; u64 percent; u32 rem; u32 rem; rcu_read_lock(); rcu_read_lock(); tg = css_tg(seq_css(sf)); tg = css_tg(seq_css(sf)); util_clamp = tg->uclamp_req[clamp_id].value; util_clamp = tg->uclamp_req[clamp_id].value; rcu_read_unlock(); rcu_read_unlock(); if (util_clamp == SCHED_CAPACITY_SCALE) { if (util_clamp == SCHED_CAPACITY_SCALE) { seq_puts(sf, "max\n"); seq_puts(sf, "max\n"); return; return; } } percent = tg->uclamp_pct[clamp_id]; percent = tg->uclamp_pct[clamp_id]; percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_S percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_S seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT } } static int cpu_uclamp_min_show(struct seq_file *sf, void *v) static int cpu_uclamp_min_show(struct seq_file *sf, void *v) { { cpu_uclamp_print(sf, UCLAMP_MIN); cpu_uclamp_print(sf, UCLAMP_MIN); return 0; return 0; } } static int cpu_uclamp_max_show(struct seq_file *sf, void *v) static int cpu_uclamp_max_show(struct seq_file *sf, void *v) { { cpu_uclamp_print(sf, UCLAMP_MAX); cpu_uclamp_print(sf, UCLAMP_MAX); return 0; return 0; } } #endif /* CONFIG_UCLAMP_TASK_GROUP */ #endif /* CONFIG_UCLAMP_TASK_GROUP */ #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED static int cpu_shares_write_u64(struct cgroup_subsys_state *c static int cpu_shares_write_u64(struct cgroup_subsys_state *c struct cftype *cftype, u64 sh struct cftype *cftype, u64 sh { { if (shareval > scale_load_down(ULONG_MAX)) if (shareval > scale_load_down(ULONG_MAX)) shareval = MAX_SHARES; shareval = MAX_SHARES; return sched_group_set_shares(css_tg(css), scale_load return sched_group_set_shares(css_tg(css), scale_load } } static u64 cpu_shares_read_u64(struct cgroup_subsys_state *cs static u64 cpu_shares_read_u64(struct cgroup_subsys_state *cs struct cftype *cft) struct cftype *cft) { { struct task_group *tg = css_tg(css); struct task_group *tg = css_tg(css); return (u64) scale_load_down(tg->shares); return (u64) scale_load_down(tg->shares); } } #ifdef CONFIG_CFS_BANDWIDTH #ifdef CONFIG_CFS_BANDWIDTH static 
DEFINE_MUTEX(cfs_constraints_mutex);

const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
/* More than 203 days if BW_SHIFT equals 20. */
static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;

static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);

static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
				u64 burst)
{
	int i, ret = 0, runtime_enabled, runtime_was_enabled;
	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;

	if (tg == &root_task_group)
		return -EINVAL;

	/*
	 * Ensure we have at some amount of bandwidth every period.  This is
	 * to prevent reaching a state of large arrears when throttled via
	 * entity_tick() resulting in prolonged exit starvation.
	 */
	if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
		return -EINVAL;

	/*
	 * Likewise, bound things on the other side by preventing excessive
	 * periods.  This also allows us to normalize in computing quota
	 * feasibility.
	 */
	if (period > max_cfs_quota_period)
		return -EINVAL;

	/*
	 * Bound quota to defend quota against overflow during bandwidth shift.
	 */
	if (quota != RUNTIME_INF && quota > max_cfs_runtime)
		return -EINVAL;

	if (quota != RUNTIME_INF && (burst > quota ||
				     burst + quota > max_cfs_runtime))
		return -EINVAL;

	/*
	 * Prevent race between setting of cfs_rq->runtime_enabled and
	 * unthrottle_offline_cfs_rqs().
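	 *
	 * (Worked example, illustrative: with the checks above satisfied,
	 * quota = 200000us and period = 100000us let this group consume up
	 * to two CPUs worth of runtime per period, while quota = 50000us
	 * with the same period caps it at half a CPU; burst may temporarily
	 * add unused quota from earlier periods on top of that.)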
	 */
	guard(cpus_read_lock)();
	guard(mutex)(&cfs_constraints_mutex);

	ret = __cfs_schedulable(tg, period, quota);
	if (ret)
		return ret;

	runtime_enabled = quota != RUNTIME_INF;
	runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
	/*
	 * If we need to toggle cfs_bandwidth_used, off->on must occur
	 * before making related changes, and on->off must occur afterwards
	 */
	if (runtime_enabled && !runtime_was_enabled)
		cfs_bandwidth_usage_inc();

	scoped_guard (raw_spinlock_irq, &cfs_b->lock) {
		cfs_b->period = ns_to_ktime(period);
		cfs_b->quota = quota;
		cfs_b->burst = burst;

		__refill_cfs_bandwidth_runtime(cfs_b);

		/*
		 * Restart the period timer (if active) to handle new
		 * period expiry:
		 */
		if (runtime_enabled)
			start_cfs_bandwidth(cfs_b);
	}

	for_each_online_cpu(i) {
		struct cfs_rq *cfs_rq = tg->cfs_rq[i];
		struct rq *rq = cfs_rq->rq;

		guard(rq_lock_irq)(rq);
		cfs_rq->runtime_enabled = runtime_enabled;
		cfs_rq->runtime_remaining = 0;

		if (cfs_rq->throttled)
			unthrottle_cfs_rq(cfs_rq);
	}

	if (runtime_was_enabled && !runtime_enabled)
		cfs_bandwidth_usage_dec();

	return 0;
}

static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
{
	u64 quota, period, burst;

	period = ktime_to_ns(tg->cfs_bandwidth.period);
	burst = tg->cfs_bandwidth.burst;
	if (cfs_quota_us < 0)
		quota = RUNTIME_INF;
	else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC)
		quota = (u64)cfs_quota_us * NSEC_PER_USEC;
	else
		return -EINVAL;

	return tg_set_cfs_bandwidth(tg, period, quota, burst);
}

static long tg_get_cfs_quota(struct task_group *tg)
{
	u64 quota_us;

	if (tg->cfs_bandwidth.quota == RUNTIME_INF)
		return -1;

	quota_us = tg->cfs_bandwidth.quota;
	do_div(quota_us, NSEC_PER_USEC);

	return quota_us;
}

static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
{
	u64 quota, period, burst;

	if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC)
		return -EINVAL;

	period
= (u64)cfs_period_us * NSEC_PER_USEC; period = (u64)cfs_period_us * NSEC_PER_USEC; quota = tg->cfs_bandwidth.quota; quota = tg->cfs_bandwidth.quota; burst = tg->cfs_bandwidth.burst; burst = tg->cfs_bandwidth.burst; return tg_set_cfs_bandwidth(tg, period, quota, burst) return tg_set_cfs_bandwidth(tg, period, quota, burst) } } static long tg_get_cfs_period(struct task_group *tg) static long tg_get_cfs_period(struct task_group *tg) { { u64 cfs_period_us; u64 cfs_period_us; cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period) cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period) do_div(cfs_period_us, NSEC_PER_USEC); do_div(cfs_period_us, NSEC_PER_USEC); return cfs_period_us; return cfs_period_us; } } static int tg_set_cfs_burst(struct task_group *tg, long cfs_b static int tg_set_cfs_burst(struct task_group *tg, long cfs_b { { u64 quota, period, burst; u64 quota, period, burst; if ((u64)cfs_burst_us > U64_MAX / NSEC_PER_USEC) if ((u64)cfs_burst_us > U64_MAX / NSEC_PER_USEC) return -EINVAL; return -EINVAL; burst = (u64)cfs_burst_us * NSEC_PER_USEC; burst = (u64)cfs_burst_us * NSEC_PER_USEC; period = ktime_to_ns(tg->cfs_bandwidth.period); period = ktime_to_ns(tg->cfs_bandwidth.period); quota = tg->cfs_bandwidth.quota; quota = tg->cfs_bandwidth.quota; return tg_set_cfs_bandwidth(tg, period, quota, burst) return tg_set_cfs_bandwidth(tg, period, quota, burst) } } static long tg_get_cfs_burst(struct task_group *tg) static long tg_get_cfs_burst(struct task_group *tg) { { u64 burst_us; u64 burst_us; burst_us = tg->cfs_bandwidth.burst; burst_us = tg->cfs_bandwidth.burst; do_div(burst_us, NSEC_PER_USEC); do_div(burst_us, NSEC_PER_USEC); return burst_us; return burst_us; } } static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state struct cftype *cft) struct cftype *cft) { { return tg_get_cfs_quota(css_tg(css)); return tg_get_cfs_quota(css_tg(css)); } } static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state struct cftype *cftype, s64 struct cftype *cftype, s64 { { return tg_set_cfs_quota(css_tg(css), cfs_quota_us); return tg_set_cfs_quota(css_tg(css), cfs_quota_us); } } static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state struct cftype *cft) struct cftype *cft) { { return tg_get_cfs_period(css_tg(css)); return tg_get_cfs_period(css_tg(css)); } } static int cpu_cfs_period_write_u64(struct cgroup_subsys_stat static int cpu_cfs_period_write_u64(struct cgroup_subsys_stat struct cftype *cftype, u6 struct cftype *cftype, u6 { { return tg_set_cfs_period(css_tg(css), cfs_period_us); return tg_set_cfs_period(css_tg(css), cfs_period_us); } } static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state static u64 cpu_cfs_burst_read_u64(struct cgroup_subsys_state struct cftype *cft) struct cftype *cft) { { return tg_get_cfs_burst(css_tg(css)); return tg_get_cfs_burst(css_tg(css)); } } static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state static int cpu_cfs_burst_write_u64(struct cgroup_subsys_state struct cftype *cftype, u64 struct cftype *cftype, u64 { { return tg_set_cfs_burst(css_tg(css), cfs_burst_us); return tg_set_cfs_burst(css_tg(css), cfs_burst_us); } } struct cfs_schedulable_data { struct cfs_schedulable_data { struct task_group *tg; struct task_group *tg; u64 period, quota; u64 period, quota; }; }; /* /* * normalize group quota/period to be quota/max_period * normalize group quota/period to be 
quota/max_period
 * note: units are usecs
 */
static u64 normalize_cfs_quota(struct task_group *tg,
			       struct cfs_schedulable_data *d)
{
	u64 quota, period;

	if (tg == d->tg) {
		period = d->period;
		quota = d->quota;
	} else {
		period = tg_get_cfs_period(tg);
		quota = tg_get_cfs_quota(tg);
	}

	/* note: these should typically be equivalent */
	if (quota == RUNTIME_INF || quota == -1)
		return RUNTIME_INF;

	return to_ratio(period, quota);
}

static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
{
	struct cfs_schedulable_data *d = data;
	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
	s64 quota = 0, parent_quota = -1;

	if (!tg->parent) {
		quota = RUNTIME_INF;
	} else {
		struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;

		quota = normalize_cfs_quota(tg, d);
		parent_quota = parent_b->hierarchical_quota;

		/*
		 * Ensure max(child_quota) <= parent_quota.  On cgroup2,
		 * always take the non-RUNTIME_INF min.  On cgroup1, only
		 * inherit when no limit is set. In both cases this is used
		 * by the scheduler to determine if a given CFS task has a
		 * bandwidth constraint at some higher level.
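		 *
		 * (Illustrative example: on cgroup2 a child asking for the
		 * equivalent of four CPUs under a parent limited to two gets
		 * its hierarchical_quota clamped to the parent's value,
		 * whereas on cgroup1 the same over-commit is rejected with
		 * -EINVAL below.)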
*/ */ if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) { if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) { if (quota == RUNTIME_INF) if (quota == RUNTIME_INF) quota = parent_quota; quota = parent_quota; else if (parent_quota != RUNTIME_INF) else if (parent_quota != RUNTIME_INF) quota = min(quota, parent_quo quota = min(quota, parent_quo } else { } else { if (quota == RUNTIME_INF) if (quota == RUNTIME_INF) quota = parent_quota; quota = parent_quota; else if (parent_quota != RUNTIME_INF else if (parent_quota != RUNTIME_INF return -EINVAL; return -EINVAL; } } } } cfs_b->hierarchical_quota = quota; cfs_b->hierarchical_quota = quota; return 0; return 0; } } static int __cfs_schedulable(struct task_group *tg, u64 perio static int __cfs_schedulable(struct task_group *tg, u64 perio { { int ret; int ret; struct cfs_schedulable_data data = { struct cfs_schedulable_data data = { .tg = tg, .tg = tg, .period = period, .period = period, .quota = quota, .quota = quota, }; }; if (quota != RUNTIME_INF) { if (quota != RUNTIME_INF) { do_div(data.period, NSEC_PER_USEC); do_div(data.period, NSEC_PER_USEC); do_div(data.quota, NSEC_PER_USEC); do_div(data.quota, NSEC_PER_USEC); } } rcu_read_lock(); rcu_read_lock(); ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, & ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, & rcu_read_unlock(); rcu_read_unlock(); return ret; return ret; } } static int cpu_cfs_stat_show(struct seq_file *sf, void *v) static int cpu_cfs_stat_show(struct seq_file *sf, void *v) { { struct task_group *tg = css_tg(seq_css(sf)); struct task_group *tg = css_tg(seq_css(sf)); struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods); seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods); seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttl seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttl seq_printf(sf, "throttled_time %llu\n", cfs_b->thrott seq_printf(sf, "throttled_time %llu\n", cfs_b->thrott if (schedstat_enabled() && tg != &root_task_group) { if (schedstat_enabled() && tg != &root_task_group) { struct sched_statistics *stats; struct sched_statistics *stats; u64 ws = 0; u64 ws = 0; int i; int i; for_each_possible_cpu(i) { for_each_possible_cpu(i) { stats = __schedstats_from_se(tg->se[i stats = __schedstats_from_se(tg->se[i ws += schedstat_val(stats->wait_sum); ws += schedstat_val(stats->wait_sum); } } seq_printf(sf, "wait_sum %llu\n", ws); seq_printf(sf, "wait_sum %llu\n", ws); } } seq_printf(sf, "nr_bursts %d\n", cfs_b->nr_burst); seq_printf(sf, "nr_bursts %d\n", cfs_b->nr_burst); seq_printf(sf, "burst_time %llu\n", cfs_b->burst_time seq_printf(sf, "burst_time %llu\n", cfs_b->burst_time return 0; return 0; } } static u64 throttled_time_self(struct task_group *tg) static u64 throttled_time_self(struct task_group *tg) { { int i; int i; u64 total = 0; u64 total = 0; for_each_possible_cpu(i) { for_each_possible_cpu(i) { total += READ_ONCE(tg->cfs_rq[i]->throttled_c total += READ_ONCE(tg->cfs_rq[i]->throttled_c } } return total; return total; } } static int cpu_cfs_local_stat_show(struct seq_file *sf, void static int cpu_cfs_local_stat_show(struct seq_file *sf, void { { struct task_group *tg = css_tg(seq_css(sf)); struct task_group *tg = css_tg(seq_css(sf)); seq_printf(sf, "throttled_time %llu\n", throttled_tim seq_printf(sf, "throttled_time %llu\n", throttled_tim return 0; return 0; } } #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ #endif /* 
CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED #ifdef CONFIG_RT_GROUP_SCHED static int cpu_rt_runtime_write(struct cgroup_subsys_state *c static int cpu_rt_runtime_write(struct cgroup_subsys_state *c struct cftype *cft, s64 val) struct cftype *cft, s64 val) { { return sched_group_set_rt_runtime(css_tg(css), val); return sched_group_set_rt_runtime(css_tg(css), val); } } static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *cs static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *cs struct cftype *cft) struct cftype *cft) { { return sched_group_rt_runtime(css_tg(css)); return sched_group_rt_runtime(css_tg(css)); } } static int cpu_rt_period_write_uint(struct cgroup_subsys_stat static int cpu_rt_period_write_uint(struct cgroup_subsys_stat struct cftype *cftype, u6 struct cftype *cftype, u6 { { return sched_group_set_rt_period(css_tg(css), rt_peri return sched_group_set_rt_period(css_tg(css), rt_peri } } static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state struct cftype *cft) struct cftype *cft) { { return sched_group_rt_period(css_tg(css)); return sched_group_rt_period(css_tg(css)); } } #endif /* CONFIG_RT_GROUP_SCHED */ #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css, static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) struct cftype *cft) { { return css_tg(css)->idle; return css_tg(css)->idle; } } static int cpu_idle_write_s64(struct cgroup_subsys_state *css static int cpu_idle_write_s64(struct cgroup_subsys_state *css struct cftype *cft, s64 idle) struct cftype *cft, s64 idle) { { return sched_group_set_idle(css_tg(css), idle); return sched_group_set_idle(css_tg(css), idle); } } #endif #endif static struct cftype cpu_legacy_files[] = { static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED { { .name = "shares", .name = "shares", .read_u64 = cpu_shares_read_u64, .read_u64 = cpu_shares_read_u64, .write_u64 = cpu_shares_write_u64, .write_u64 = cpu_shares_write_u64, }, }, { { .name = "idle", .name = "idle", .read_s64 = cpu_idle_read_s64, .read_s64 = cpu_idle_read_s64, .write_s64 = cpu_idle_write_s64, .write_s64 = cpu_idle_write_s64, }, }, #endif #endif #ifdef CONFIG_CFS_BANDWIDTH #ifdef CONFIG_CFS_BANDWIDTH { { .name = "cfs_quota_us", .name = "cfs_quota_us", .read_s64 = cpu_cfs_quota_read_s64, .read_s64 = cpu_cfs_quota_read_s64, .write_s64 = cpu_cfs_quota_write_s64, .write_s64 = cpu_cfs_quota_write_s64, }, }, { { .name = "cfs_period_us", .name = "cfs_period_us", .read_u64 = cpu_cfs_period_read_u64, .read_u64 = cpu_cfs_period_read_u64, .write_u64 = cpu_cfs_period_write_u64, .write_u64 = cpu_cfs_period_write_u64, }, }, { { .name = "cfs_burst_us", .name = "cfs_burst_us", .read_u64 = cpu_cfs_burst_read_u64, .read_u64 = cpu_cfs_burst_read_u64, .write_u64 = cpu_cfs_burst_write_u64, .write_u64 = cpu_cfs_burst_write_u64, }, }, { { .name = "stat", .name = "stat", .seq_show = cpu_cfs_stat_show, .seq_show = cpu_cfs_stat_show, }, }, { { .name = "stat.local", .name = "stat.local", .seq_show = cpu_cfs_local_stat_show, .seq_show = cpu_cfs_local_stat_show, }, }, #endif #endif #ifdef CONFIG_RT_GROUP_SCHED #ifdef CONFIG_RT_GROUP_SCHED { { .name = "rt_runtime_us", .name = "rt_runtime_us", .read_s64 = cpu_rt_runtime_read, .read_s64 = cpu_rt_runtime_read, .write_s64 = cpu_rt_runtime_write, .write_s64 = cpu_rt_runtime_write, }, }, { { .name 
= "rt_period_us", .name = "rt_period_us", .read_u64 = cpu_rt_period_read_uint, .read_u64 = cpu_rt_period_read_uint, .write_u64 = cpu_rt_period_write_uint, .write_u64 = cpu_rt_period_write_uint, }, }, #endif #endif #ifdef CONFIG_UCLAMP_TASK_GROUP #ifdef CONFIG_UCLAMP_TASK_GROUP { { .name = "uclamp.min", .name = "uclamp.min", .flags = CFTYPE_NOT_ON_ROOT, .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_uclamp_min_show, .seq_show = cpu_uclamp_min_show, .write = cpu_uclamp_min_write, .write = cpu_uclamp_min_write, }, }, { { .name = "uclamp.max", .name = "uclamp.max", .flags = CFTYPE_NOT_ON_ROOT, .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_uclamp_max_show, .seq_show = cpu_uclamp_max_show, .write = cpu_uclamp_max_write, .write = cpu_uclamp_max_write, }, }, #endif #endif { } /* Terminate */ { } /* Terminate */ }; }; static int cpu_extra_stat_show(struct seq_file *sf, static int cpu_extra_stat_show(struct seq_file *sf, struct cgroup_subsys_state *cs struct cgroup_subsys_state *cs { { #ifdef CONFIG_CFS_BANDWIDTH #ifdef CONFIG_CFS_BANDWIDTH { { struct task_group *tg = css_tg(css); struct task_group *tg = css_tg(css); struct cfs_bandwidth *cfs_b = &tg->cfs_bandwi struct cfs_bandwidth *cfs_b = &tg->cfs_bandwi u64 throttled_usec, burst_usec; u64 throttled_usec, burst_usec; throttled_usec = cfs_b->throttled_time; throttled_usec = cfs_b->throttled_time; do_div(throttled_usec, NSEC_PER_USEC); do_div(throttled_usec, NSEC_PER_USEC); burst_usec = cfs_b->burst_time; burst_usec = cfs_b->burst_time; do_div(burst_usec, NSEC_PER_USEC); do_div(burst_usec, NSEC_PER_USEC); seq_printf(sf, "nr_periods %d\n" seq_printf(sf, "nr_periods %d\n" "nr_throttled %d\n" "nr_throttled %d\n" "throttled_usec %llu\n" "throttled_usec %llu\n" "nr_bursts %d\n" "nr_bursts %d\n" "burst_usec %llu\n", "burst_usec %llu\n", cfs_b->nr_periods, cfs_b->nr_throt cfs_b->nr_periods, cfs_b->nr_throt throttled_usec, cfs_b->nr_burst, b throttled_usec, cfs_b->nr_burst, b } } #endif #endif return 0; return 0; } } static int cpu_local_stat_show(struct seq_file *sf, static int cpu_local_stat_show(struct seq_file *sf, struct cgroup_subsys_state *cs struct cgroup_subsys_state *cs { { #ifdef CONFIG_CFS_BANDWIDTH #ifdef CONFIG_CFS_BANDWIDTH { { struct task_group *tg = css_tg(css); struct task_group *tg = css_tg(css); u64 throttled_self_usec; u64 throttled_self_usec; throttled_self_usec = throttled_time_self(tg) throttled_self_usec = throttled_time_self(tg) do_div(throttled_self_usec, NSEC_PER_USEC); do_div(throttled_self_usec, NSEC_PER_USEC); seq_printf(sf, "throttled_usec %llu\n", seq_printf(sf, "throttled_usec %llu\n", throttled_self_usec); throttled_self_usec); } } #endif #endif return 0; return 0; } } #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED static u64 cpu_weight_read_u64(struct cgroup_subsys_state *cs static u64 cpu_weight_read_u64(struct cgroup_subsys_state *cs struct cftype *cft) struct cftype *cft) { { struct task_group *tg = css_tg(css); struct task_group *tg = css_tg(css); u64 weight = scale_load_down(tg->shares); u64 weight = scale_load_down(tg->shares); return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_D return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_D } } static int cpu_weight_write_u64(struct cgroup_subsys_state *c static int cpu_weight_write_u64(struct cgroup_subsys_state *c struct cftype *cft, u64 weigh struct cftype *cft, u64 weigh { { /* /* * cgroup weight knobs should use the common MIN, DFL * cgroup weight knobs should use the common MIN, DFL * values which are 1, 100 and 10000 respectively. 
	 * While it loses a bit of range on both ends, it maps pretty well
	 * onto the shares value used by scheduler and the round-trip
	 * conversions preserve the original value over the entire range.
	 */
	if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
		return -ERANGE;

	weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);

	return sched_group_set_shares(css_tg(css), scale_load(weight));
}

static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
				    struct cftype *cft)
{
	unsigned long weight = scale_load_down(css_tg(css)->shares);
	int last_delta = INT_MAX;
	int prio, delta;

	/* find the closest nice value to the current weight */
	for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
		delta = abs(sched_prio_to_weight[prio] - weight);
		if (delta >= last_delta)
			break;
		last_delta = delta;
	}

	return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO);
}

static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
				     struct cftype *cft, s64 nice)
{
	unsigned long weight;
	int idx;

	if (nice < MIN_NICE || nice > MAX_NICE)
		return -ERANGE;

	idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO;
	idx = array_index_nospec(idx, 40);
	weight = sched_prio_to_weight[idx];

	return sched_group_set_shares(css_tg(css), scale_load(weight));
}
#endif

static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
						  long period, long quota)
{
	if (quota < 0)
		seq_puts(sf, "max");
	else
		seq_printf(sf, "%ld", quota);

	seq_printf(sf, " %ld\n", period);
}

/* caller should put the current value in *@periodp before calling */
static int __maybe_unused cpu_period_quota_parse(char *buf,
						 u64 *periodp, u64 *quotap)
{
	char tok[21];	/* U64_MAX */

	if (sscanf(buf, "%20s %llu", tok, periodp) < 1)
		return -EINVAL;

	*periodp *= NSEC_PER_USEC;

	if (sscanf(tok, "%llu", quotap))
		*quotap *= NSEC_PER_USEC;
	else if (!strcmp(tok, "max"))
		*quotap = RUNTIME_INF;
	else
		return -EINVAL;

	return 0;
}

#ifdef CONFIG_CFS_BANDWIDTH
static int cpu_max_show(struct seq_file
*sf, void *v) static int cpu_max_show(struct seq_file *sf, void *v) { { struct task_group *tg = css_tg(seq_css(sf)); struct task_group *tg = css_tg(seq_css(sf)); cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_ cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_ return 0; return 0; } } static ssize_t cpu_max_write(struct kernfs_open_file *of, static ssize_t cpu_max_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t char *buf, size_t nbytes, loff_t { { struct task_group *tg = css_tg(of_css(of)); struct task_group *tg = css_tg(of_css(of)); u64 period = tg_get_cfs_period(tg); u64 period = tg_get_cfs_period(tg); u64 burst = tg_get_cfs_burst(tg); u64 burst = tg_get_cfs_burst(tg); u64 quota; u64 quota; int ret; int ret; ret = cpu_period_quota_parse(buf, &period, "a); ret = cpu_period_quota_parse(buf, &period, "a); if (!ret) if (!ret) ret = tg_set_cfs_bandwidth(tg, period, quota, ret = tg_set_cfs_bandwidth(tg, period, quota, return ret ?: nbytes; return ret ?: nbytes; } } #endif #endif static struct cftype cpu_files[] = { static struct cftype cpu_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED { { .name = "weight", .name = "weight", .flags = CFTYPE_NOT_ON_ROOT, .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = cpu_weight_read_u64, .read_u64 = cpu_weight_read_u64, .write_u64 = cpu_weight_write_u64, .write_u64 = cpu_weight_write_u64, }, }, { { .name = "weight.nice", .name = "weight.nice", .flags = CFTYPE_NOT_ON_ROOT, .flags = CFTYPE_NOT_ON_ROOT, .read_s64 = cpu_weight_nice_read_s64, .read_s64 = cpu_weight_nice_read_s64, .write_s64 = cpu_weight_nice_write_s64, .write_s64 = cpu_weight_nice_write_s64, }, }, { { .name = "idle", .name = "idle", .flags = CFTYPE_NOT_ON_ROOT, .flags = CFTYPE_NOT_ON_ROOT, .read_s64 = cpu_idle_read_s64, .read_s64 = cpu_idle_read_s64, .write_s64 = cpu_idle_write_s64, .write_s64 = cpu_idle_write_s64, }, }, #endif #endif #ifdef CONFIG_CFS_BANDWIDTH #ifdef CONFIG_CFS_BANDWIDTH { { .name = "max", .name = "max", .flags = CFTYPE_NOT_ON_ROOT, .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_max_show, .seq_show = cpu_max_show, .write = cpu_max_write, .write = cpu_max_write, }, }, { { .name = "max.burst", .name = "max.burst", .flags = CFTYPE_NOT_ON_ROOT, .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = cpu_cfs_burst_read_u64, .read_u64 = cpu_cfs_burst_read_u64, .write_u64 = cpu_cfs_burst_write_u64, .write_u64 = cpu_cfs_burst_write_u64, }, }, #endif #endif #ifdef CONFIG_UCLAMP_TASK_GROUP #ifdef CONFIG_UCLAMP_TASK_GROUP { { .name = "uclamp.min", .name = "uclamp.min", .flags = CFTYPE_NOT_ON_ROOT, .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_uclamp_min_show, .seq_show = cpu_uclamp_min_show, .write = cpu_uclamp_min_write, .write = cpu_uclamp_min_write, }, }, { { .name = "uclamp.max", .name = "uclamp.max", .flags = CFTYPE_NOT_ON_ROOT, .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_uclamp_max_show, .seq_show = cpu_uclamp_max_show, .write = cpu_uclamp_max_write, .write = cpu_uclamp_max_write, }, }, #endif #endif { } /* terminate */ { } /* terminate */ }; }; struct cgroup_subsys cpu_cgrp_subsys = { struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, .css_alloc = cpu_cgroup_css_alloc, .css_online = cpu_cgroup_css_online, .css_online = cpu_cgroup_css_online, .css_released = cpu_cgroup_css_released, .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, .css_free = cpu_cgroup_css_free, .css_extra_stat_show = cpu_extra_stat_show, .css_extra_stat_show = cpu_extra_stat_show, .css_local_stat_show = cpu_local_stat_show, 
#ifdef CONFIG_RT_GROUP_SCHED
	.can_attach	= cpu_cgroup_can_attach,
#endif
	.attach		= cpu_cgroup_attach,
	.legacy_cftypes	= cpu_legacy_files,
	.dfl_cftypes	= cpu_files,
	.early_init	= true,
	.threaded	= true,
};

#endif	/* CONFIG_CGROUP_SCHED */

void dump_cpu_task(int cpu)
{
	if (cpu == smp_processor_id() && in_hardirq()) {
		struct pt_regs *regs;

		regs = get_irq_regs();
		if (regs) {
			show_regs(regs);
			return;
		}
	}

	if (trigger_single_cpu_backtrace(cpu))
		return;

	pr_info("Task dump for CPU %d:\n", cpu);
	sched_show_task(cpu_curr(cpu));
}

/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
 * If a task goes up by ~10% and another task goes down by ~10% then
 * the relative distance between them is ~25%.)
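 *
 * (Worked example: weight(nice 0) = 1024 and weight(nice 1) = 820, see the
 * table below. Two CPU-bound tasks sharing one CPU therefore split it
 * roughly 1024/(1024+820) ~= 55% vs 45%, and 1024/820 ~= 1.25 is exactly
 * the multiplier mentioned above.)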
 */
const int sched_prio_to_weight[40] = {
 /* -20 */     88761,     71755,     56483,     46273,     36291,
 /* -15 */     29154,     23254,     18705,     14949,     11916,
 /* -10 */      9548,      7620,      6100,      4904,      3906,
 /*  -5 */      3121,      2501,      1991,      1586,      1277,
 /*   0 */      1024,       820,       655,       526,       423,
 /*   5 */       335,       272,       215,       172,       137,
 /*  10 */       110,        87,        70,        56,        45,
 /*  15 */        36,        29,        23,        18,        15,
};

/*
 * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
 *
 * In cases where the weight does not change often, we can use the
 * precalculated inverse to speed up arithmetics by turning divisions
 * into multiplications:
 */
const u32 sched_prio_to_wmult[40] = {
 /* -20 */     48388,     59856,     76040,     92818,    118348,
 /* -15 */    147320,    184698,    229616,    287308,    360437,
 /* -10 */    449829,    563644,    704093,    875809,   1099582,
 /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
 /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
 /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
 /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
 /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};

void call_trace_sched_update_nr_running(struct rq *rq, int count)
{
	trace_sched_update_nr_running_tp(rq, count);
}

#ifdef CONFIG_SCHED_MM_CID

/*
 * @cid_lock: Guarantee forward-progress of cid allocation.
 *
 * Concurrency ID allocation within a bitmap is mostly lock-free. The spinlock
 * is only used when contention is detected by the lock-free allocation so
 * forward progress can be guaranteed.
 */
DEFINE_RAW_SPINLOCK(cid_lock);

/*
 * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock.
 *
 * When @use_cid_lock is 0, the cid allocation is lock-free. When contention is
 * detected, it is set to 1 to ensure that all newly coming allocations are
 * serialized by @cid_lock until the allocation which detected contention
 * completes and sets @use_cid_lock back to 0. This guarantees the progress
 * of a cid allocation.
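 *
 * (Illustrative sketch of this fallback pattern, not the actual allocator;
 * the two helpers named here are stand-ins for the real bitmap operations:
 *
 *	if (!READ_ONCE(use_cid_lock)) {
 *		cid = try_cid_alloc_lockfree(mm);
 *		if (cid >= 0)
 *			return cid;
 *		WRITE_ONCE(use_cid_lock, 1);	// contention detected
 *	}
 *	raw_spin_lock(&cid_lock);
 *	cid = cid_alloc_locked(mm);
 *	raw_spin_unlock(&cid_lock);
 *	WRITE_ONCE(use_cid_lock, 0);
 *	return cid;
 * )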
*/ */ int use_cid_lock; int use_cid_lock; /* /* * mm_cid remote-clear implements a lock-free algorithm to cl * mm_cid remote-clear implements a lock-free algorithm to cl * concurrently with respect to the execution of the source r * concurrently with respect to the execution of the source r * switch. * switch. * * * There is one basic properties we want to guarantee here: * There is one basic properties we want to guarantee here: * * * (1) Remote-clear should _never_ mark a per-cpu cid UNSET w * (1) Remote-clear should _never_ mark a per-cpu cid UNSET w * used by a task. That would lead to concurrent allocation o * used by a task. That would lead to concurrent allocation o * userspace corruption. * userspace corruption. * * * Provide this guarantee by introducing a Dekker memory orde * Provide this guarantee by introducing a Dekker memory orde * that a pair of loads observe at least one of a pair of sto * that a pair of loads observe at least one of a pair of sto * shown as: * shown as: * * * X = Y = 0 * X = Y = 0 * * * w[X]=1 w[Y]=1 * w[X]=1 w[Y]=1 * MB MB * MB MB * r[Y]=y r[X]=x * r[Y]=y r[X]=x * * * Which guarantees that x==0 && y==0 is impossible. But rath * Which guarantees that x==0 && y==0 is impossible. But rath * values 0 and 1, this algorithm cares about specific state * values 0 and 1, this algorithm cares about specific state * runqueue current task (as updated by the scheduler context * runqueue current task (as updated by the scheduler context * per-mm/cpu cid value. * per-mm/cpu cid value. * * * Let's introduce task (Y) which has task->mm == mm and task * Let's introduce task (Y) which has task->mm == mm and task * task->mm != mm for the rest of the discussion. There are t * task->mm != mm for the rest of the discussion. There are t * transitions on context switch we care about: * transitions on context switch we care about: * * * (TSA) Store to rq->curr with transition from (N) to (Y) * (TSA) Store to rq->curr with transition from (N) to (Y) * * * (TSB) Store to rq->curr with transition from (Y) to (N) * (TSB) Store to rq->curr with transition from (Y) to (N) * * * On the remote-clear side, there is one transition we care * On the remote-clear side, there is one transition we care * * * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag * * * There is also a transition to UNSET state which can be per * There is also a transition to UNSET state which can be per * sides (scheduler, remote-clear). It is always performed wi * sides (scheduler, remote-clear). It is always performed wi * guarantees that only a single thread will succeed: * guarantees that only a single thread will succeed: * * * (TMB) cmpxchg to *pcpu_cid to mark UNSET * (TMB) cmpxchg to *pcpu_cid to mark UNSET * * * Just to be clear, what we do _not_ want to happen is a tra * Just to be clear, what we do _not_ want to happen is a tra * when a thread is actively using the cid (property (1)). * when a thread is actively using the cid (property (1)). 
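 *
 * (Concretely for this algorithm, an illustrative mapping: the two stores
 * are the rq->curr transition (TSA/TSB) and the cmpxchg that sets the LAZY
 * flag (TMA); the two loads are the remote side's rcu_dereference(rq->curr)
 * and the scheduler side's READ_ONCE(*pcpu_cid). The barriers guarantee
 * that at least one side observes the other, as scenario A below works
 * through.)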
/*
 * mm_cid remote-clear implements a lock-free algorithm to clear per-mm/cpu cid
 * concurrently with respect to the execution of the source runqueue context
 * switch.
 *
 * There is one basic property we want to guarantee here:
 *
 * (1) Remote-clear should _never_ mark a per-cpu cid UNSET when it is actively
 * used by a task. That would lead to concurrent allocation of the cid and
 * userspace corruption.
 *
 * Provide this guarantee by introducing a Dekker memory ordering to guarantee
 * that a pair of loads observe at least one of a pair of stores, which can be
 * shown as:
 *
 *      X = Y = 0
 *
 *      w[X]=1          w[Y]=1
 *      MB              MB
 *      r[Y]=y          r[X]=x
 *
 * Which guarantees that x==0 && y==0 is impossible. But rather than using
 * values 0 and 1, this algorithm cares about specific state transitions of the
 * runqueue current task (as updated by the scheduler context switch), and the
 * per-mm/cpu cid value.
 *
 * Let's introduce task (Y) which has task->mm == mm and task (N) which has
 * task->mm != mm for the rest of the discussion. There are two scheduler state
 * transitions on context switch we care about:
 *
 * (TSA) Store to rq->curr with transition from (N) to (Y)
 *
 * (TSB) Store to rq->curr with transition from (Y) to (N)
 *
 * On the remote-clear side, there is one transition we care about:
 *
 * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag
 *
 * There is also a transition to UNSET state which can be performed from both
 * sides (scheduler, remote-clear). It is always performed with a cmpxchg which
 * guarantees that only a single thread will succeed:
 *
 * (TMB) cmpxchg to *pcpu_cid to mark UNSET
 *
 * Just to be clear, what we do _not_ want to happen is a transition to UNSET
 * when a thread is actively using the cid (property (1)).
 *
 * Let's look at the relevant combinations of TSA/TSB, and TMA transitions.
 *
 * Scenario A) (TSA)+(TMA) (from next task perspective)
 *
 * CPU0                                      CPU1
 *
 * Context switch CS-1                       Remote-clear
 *   - store to rq->curr: (N)->(Y) (TSA)     - cmpxchg to *pcpu_cid to LAZY (TMA)
 *                                             (implied barrier after cmpxchg)
 *   - switch_mm_cid()
 *     - memory barrier (see switch_mm_cid()
 *       comment explaining how this barrier
 *       is combined with other scheduler
 *       barriers)
 *     - mm_cid_get (next)
 *       - READ_ONCE(*pcpu_cid)              - rcu_dereference(src_rq->curr)
 *
 * This Dekker ensures that either task (Y) is observed by the
 * rcu_dereference() or the LAZY flag is observed by READ_ONCE(), or both are
 * observed.
 *
 * If task (Y) store is observed by rcu_dereference(), it means that there is
 * still an active task on the cpu. Remote-clear will therefore not transition
 * to UNSET, which fulfills property (1).
 *
 * If task (Y) is not observed, but the lazy flag is observed by READ_ONCE(),
 * it will move its state to UNSET, which clears the percpu cid perhaps
 * uselessly (which is not an issue for correctness). Because task (Y) is not
 * observed, CPU1 can move ahead to set the state to UNSET. Because moving
 * state to UNSET is done with a cmpxchg expecting that the old state has the
 * LAZY flag set, only one thread will successfully UNSET.
 *
 * If both states (LAZY flag and task (Y)) are observed, the scheduler side
 * will observe the LAZY flag and transition to UNSET (perhaps uselessly), and
 * CPU1 will observe task (Y) and do nothing more, which is fine.
 *
 * What we are effectively preventing with this Dekker is a scenario where
 * neither LAZY flag nor store (Y) are observed, which would fail property (1)
 * because this would UNSET a cid which is actively used.
 */
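
/*
 * Minimal sketch of the store-buffering (Dekker) pattern described above,
 * reduced to plain integer flags (the "example_" identifier is not part of
 * the mm_cid machinery). If two CPUs each run example_dekker_publish()
 * concurrently on a pair of flags that start at 0, with the arguments
 * swapped, the full barrier between the store and the load guarantees that
 * at least one of the two calls returns 1: both returning 0 is impossible.
 */
static inline int example_dekker_publish(int *mine, int *theirs)
{
	WRITE_ONCE(*mine, 1);		/* w[X]=1 (resp. w[Y]=1) */
	smp_mb();			/* MB */
	return READ_ONCE(*theirs);	/* r[Y]   (resp. r[X])   */
}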
void sched_mm_cid_migrate_from(struct task_struct *t)
{
	t->migrate_from_cpu = task_cpu(t);
}

static
int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq,
					  struct task_struct *t,
					  struct mm_cid *src_pcpu_cid)
{
	struct mm_struct *mm = t->mm;
	struct task_struct *src_task;
	int src_cid, last_mm_cid;

	if (!mm)
		return -1;

	last_mm_cid = t->last_mm_cid;
	/*
	 * If the migrated task has no last cid, or if the current
	 * task on src rq uses the cid, it means the source cid does not need
	 * to be moved to the destination cpu.
	 */
	if (last_mm_cid == -1)
		return -1;
	src_cid = READ_ONCE(src_pcpu_cid->cid);
	if (!mm_cid_is_valid(src_cid) || last_mm_cid != src_cid)
		return -1;

	/*
	 * If we observe an active task using the mm on this rq, it means we
	 * are not the last task to be migrated from this cpu for this mm, so
	 * there is no need to move src_cid to the destination cpu.
	 */
	rcu_read_lock();
	src_task = rcu_dereference(src_rq->curr);
	if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
		rcu_read_unlock();
		t->last_mm_cid = -1;
		return -1;
	}
	rcu_read_unlock();

	return src_cid;
}

static
int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq,
					      struct task_struct *t,
					      struct mm_cid *src_pcpu_cid,
					      int src_cid)
{
	struct task_struct *src_task;
	struct mm_struct *mm = t->mm;
	int lazy_cid;

	if (src_cid == -1)
		return -1;

	/*
	 * Attempt to clear the source cpu cid to move it to the destination
	 * cpu.
	 */
	lazy_cid = mm_cid_set_lazy_put(src_cid);
	if (!try_cmpxchg(&src_pcpu_cid->cid, &src_cid, lazy_cid))
		return -1;

	/*
	 * The implicit barrier after cmpxchg per-mm/cpu cid before loading
	 * rq->curr->mm matches the scheduler barrier in context_switch()
	 * between store to rq->curr and load of prev and next task's
	 * per-mm/cpu cid.
	 *
	 * The implicit barrier after cmpxchg per-mm/cpu cid before loading
	 * rq->curr->mm_cid_active matches the barrier in
	 * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve() and
	 * sched_mm_cid_after_execve() between store to t->mm_cid_active and
	 * load of per-mm/cpu cid.
	 */

	/*
	 * If we observe an active task using the mm on this rq after setting
	 * the lazy-put flag, this task will be responsible for transitioning
	 * from lazy-put flag set to MM_CID_UNSET.
	 */
	rcu_read_lock();
	src_task = rcu_dereference(src_rq->curr);
	if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
		rcu_read_unlock();
		/*
		 * We observed an active task for this mm, there is therefore
		 * no point in moving this cid to the destination cpu.
		 */
		t->last_mm_cid = -1;
		return -1;
	}
	rcu_read_unlock();

	/*
	 * The src_cid is unused, so it can be unset.
	 */
	if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
		return -1;
	return src_cid;
}
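
/*
 * Illustrative sketch of the two-step lazy-put protocol used by
 * __sched_mm_cid_migrate_from_try_steal_cid() above and by
 * sched_mm_cid_remote_clear() below. The EXAMPLE_* constants and the
 * "example_" identifier are hypothetical; the real encodings are the
 * MM_CID_* helpers used by those functions, and the real code checks
 * rq->curr in between the two steps and may skip step 2. The point shown
 * here: because step 2 is a cmpxchg that expects the lazy-tagged value,
 * at most one of the racing sides performs the final transition to UNSET.
 */
#define EXAMPLE_CID_LAZY	(1U << 31)
#define EXAMPLE_CID_UNSET	(~0U)

static inline bool example_lazy_put(unsigned int *slot, unsigned int cid)
{
	unsigned int lazy = cid | EXAMPLE_CID_LAZY;

	/* Step 1: publish the intent to clear; fails if the cid changed. */
	if (!try_cmpxchg(slot, &cid, lazy))
		return false;
	/* Step 2: only one racer can win this transition to UNSET. */
	return try_cmpxchg(slot, &lazy, EXAMPLE_CID_UNSET);
}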
/*
 * Migration to dst cpu. Called with dst_rq lock held.
 * Interrupts are disabled, which keeps the window of cid ownership without the
 * source rq lock held small.
 */
void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
{
	struct mm_cid *src_pcpu_cid, *dst_pcpu_cid;
	struct mm_struct *mm = t->mm;
	int src_cid, dst_cid, src_cpu;
	struct rq *src_rq;

	lockdep_assert_rq_held(dst_rq);

	if (!mm)
		return;
	src_cpu = t->migrate_from_cpu;
	if (src_cpu == -1) {
		t->last_mm_cid = -1;
		return;
	}
	/*
	 * Move the src cid if the dst cid is unset. This keeps id
	 * allocation closest to 0 in cases where few threads migrate around
	 * many cpus.
	 *
	 * If destination cid is already set, we may have to just clear
	 * the src cid to ensure compactness in frequent migrations
	 * scenarios.
	 *
	 * It is not useful to clear the src cid when the number of threads is
	 * greater or equal to the number of allowed cpus, because user-space
	 * can expect that the number of allowed cids can reach the number of
	 * allowed cpus.
	 */
	dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq));
	dst_cid = READ_ONCE(dst_pcpu_cid->cid);
	if (!mm_cid_is_unset(dst_cid) &&
	    atomic_read(&mm->mm_users) >= t->nr_cpus_allowed)
		return;
	src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu);
	src_rq = cpu_rq(src_cpu);
	src_cid = __sched_mm_cid_migrate_from_fetch_cid(src_rq, t, src_pcpu_cid);
	if (src_cid == -1)
		return;
	src_cid = __sched_mm_cid_migrate_from_try_steal_cid(src_rq, t, src_pcpu_cid,
							    src_cid);
	if (src_cid == -1)
		return;
	if (!mm_cid_is_unset(dst_cid)) {
		__mm_cid_put(mm, src_cid);
		return;
	}
	/* Move src_cid to dst cpu. */
	mm_cid_snapshot_time(dst_rq, mm);
	WRITE_ONCE(dst_pcpu_cid->cid, src_cid);
}
static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid,
				      int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct task_struct *t;
	unsigned long flags;
	int cid, lazy_cid;

	cid = READ_ONCE(pcpu_cid->cid);
	if (!mm_cid_is_valid(cid))
		return;

	/*
	 * Clear the cpu cid if it is set to keep cid allocation compact. If
	 * there happens to be other tasks left on the source cpu using this
	 * mm, the next task using this mm will reallocate its cid on context
	 * switch.
	 */
	lazy_cid = mm_cid_set_lazy_put(cid);
	if (!try_cmpxchg(&pcpu_cid->cid, &cid, lazy_cid))
		return;

	/*
	 * The implicit barrier after cmpxchg per-mm/cpu cid before loading
	 * rq->curr->mm matches the scheduler barrier in context_switch()
	 * between store to rq->curr and load of prev and next task's
	 * per-mm/cpu cid.
	 *
	 * The implicit barrier after cmpxchg per-mm/cpu cid before loading
	 * rq->curr->mm_cid_active matches the barrier in
	 * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve() and
	 * sched_mm_cid_after_execve() between store to t->mm_cid_active and
	 * load of per-mm/cpu cid.
	 */

	/*
	 * If we observe an active task using the mm on this rq after setting
	 * the lazy-put flag, that task will be responsible for transitioning
	 * from lazy-put flag set to MM_CID_UNSET.
	 */
	rcu_read_lock();
	t = rcu_dereference(rq->curr);
	if (READ_ONCE(t->mm_cid_active) && t->mm == mm) {
		rcu_read_unlock();
		return;
	}
	rcu_read_unlock();

	/*
	 * The cid is unused, so it can be unset.
	 * Disable interrupts to keep the window of cid ownership without rq
	 * lock small.
	 */
	local_irq_save(flags);
	if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
		__mm_cid_put(mm, cid);
	local_irq_restore(flags);
}
static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct mm_cid *pcpu_cid;
	struct task_struct *curr;
	u64 rq_clock;

	/*
	 * rq->clock load is racy on 32-bit but one spurious clear once in a
	 * while is irrelevant.
	 */
	rq_clock = READ_ONCE(rq->clock);
	pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);

	/*
	 * In order to take care of infrequently scheduled tasks, bump the time
	 * snapshot associated with this cid if an active task using the mm is
	 * observed on this rq.
	 */
	rcu_read_lock();
	curr = rcu_dereference(rq->curr);
	if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
		WRITE_ONCE(pcpu_cid->time, rq_clock);
		rcu_read_unlock();
		return;
	}
	rcu_read_unlock();

	if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS)
		return;
	sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
}

static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
					     int weight)
{
	struct mm_cid *pcpu_cid;
	int cid;

	pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
	cid = READ_ONCE(pcpu_cid->cid);
	if (!mm_cid_is_valid(cid) || cid < weight)
		return;
	sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
}
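
/*
 * Worked example for the weight-based compaction performed by
 * sched_mm_cid_remote_clear_weight() above and driven by task_mm_cid_work()
 * below (illustrative only; the "example_" identifier is not part of the
 * kernel): if the mm's cidmask weight is currently 3, only cids 0..2 are
 * needed going forward, so a per-cpu slot still caching cid 5 from an
 * earlier, busier period is eligible for clearing, while cid 1 is left alone.
 */
static inline bool example_cid_needs_compaction(int cid, int weight)
{
	return cid >= weight;	/* mirrors the "cid < weight" early return above */
}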
static void task_mm_cid_work(struct callback_head *work)
{
	unsigned long now = jiffies, old_scan, next_scan;
	struct task_struct *t = current;
	struct cpumask *cidmask;
	struct mm_struct *mm;
	int weight, cpu;

	SCHED_WARN_ON(t != container_of(work, struct task_struct, cid_work));

	work->next = work;	/* Prevent double-add */
	if (t->flags & PF_EXITING)
		return;
	mm = t->mm;
	if (!mm)
		return;
	old_scan = READ_ONCE(mm->mm_cid_next_scan);
	next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
	if (!old_scan) {
		unsigned long res;

		res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan);
		if (res != old_scan)
			old_scan = res;
		else
			old_scan = next_scan;
	}
	if (time_before(now, old_scan))
		return;
	if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan))
		return;
	cidmask = mm_cidmask(mm);
	/* Clear cids that were not recently used. */
	for_each_possible_cpu(cpu)
		sched_mm_cid_remote_clear_old(mm, cpu);
	weight = cpumask_weight(cidmask);
	/*
	 * Clear cids that are greater or equal to the cidmask weight to
	 * recompact it.
	 */
	for_each_possible_cpu(cpu)
		sched_mm_cid_remote_clear_weight(mm, cpu, weight);
}

void init_sched_mm_cid(struct task_struct *t)
{
	struct mm_struct *mm = t->mm;
	int mm_users = 0;

	if (mm) {
		mm_users = atomic_read(&mm->mm_users);
		if (mm_users == 1)
			mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY);
	}
	t->cid_work.next = &t->cid_work;	/* Protect against double add */
	init_task_work(&t->cid_work, task_mm_cid_work);
}

void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
{
	struct callback_head *work = &curr->cid_work;
	unsigned long now = jiffies;

	if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) ||
	    work->next != work)
		return;
	if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
		return;
	task_work_add(curr, work, TWA_RESUME);
}
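
/*
 * Sketch of the deferred-work pattern used by init_sched_mm_cid(),
 * task_tick_mm_cid() and task_mm_cid_work() above (illustrative only; the
 * "example_" identifiers are not part of the kernel). A callback_head
 * embedded in the task is marked idle by pointing its ->next field at
 * itself; the tick queues it with task_work_add(TWA_RESUME) so the callback
 * runs in task context on the way back to userspace, and the callback
 * re-marks the head idle before doing its periodic work.
 */
static void example_task_work_fn(struct callback_head *work)
{
	work->next = work;	/* Mark idle again so the tick can re-queue it. */
	/* ... periodic, task-context work would go here ... */
}

static inline void example_init_task_work(struct callback_head *work)
{
	work->next = work;			/* Idle: protects against double add. */
	init_task_work(work, example_task_work_fn);
}

static inline void example_queue_from_tick(struct task_struct *curr,
					   struct callback_head *work)
{
	if (work->next != work)
		return;				/* Already queued. */
	task_work_add(curr, work, TWA_RESUME);	/* Runs on return to userspace. */
}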
void sched_mm_cid_exit_signals(struct task_struct *t)
{
	struct mm_struct *mm = t->mm;
	struct rq_flags rf;
	struct rq *rq;

	if (!mm)
		return;

	preempt_disable();
	rq = this_rq();
	rq_lock_irqsave(rq, &rf);
	preempt_enable_no_resched();	/* holding spinlock */
	WRITE_ONCE(t->mm_cid_active, 0);
	/*
	 * Store t->mm_cid_active before loading per-mm/cpu cid.
	 * Matches barrier in sched_mm_cid_remote_clear_old().
	 */
	smp_mb();
	mm_cid_put(mm);
	t->last_mm_cid = t->mm_cid = -1;
	rq_unlock_irqrestore(rq, &rf);
}

void sched_mm_cid_before_execve(struct task_struct *t)
{
	struct mm_struct *mm = t->mm;
	struct rq_flags rf;
	struct rq *rq;

	if (!mm)
		return;

	preempt_disable();
	rq = this_rq();
	rq_lock_irqsave(rq, &rf);
	preempt_enable_no_resched();	/* holding spinlock */
	WRITE_ONCE(t->mm_cid_active, 0);
	/*
	 * Store t->mm_cid_active before loading per-mm/cpu cid.
	 * Matches barrier in sched_mm_cid_remote_clear_old().
	 */
	smp_mb();
	mm_cid_put(mm);
	t->last_mm_cid = t->mm_cid = -1;
	rq_unlock_irqrestore(rq, &rf);
}

void sched_mm_cid_after_execve(struct task_struct *t)
{
	struct mm_struct *mm = t->mm;
	struct rq_flags rf;
	struct rq *rq;

	if (!mm)
		return;

	preempt_disable();
	rq = this_rq();
	rq_lock_irqsave(rq, &rf);
	preempt_enable_no_resched();	/* holding spinlock */
	WRITE_ONCE(t->mm_cid_active, 1);
	/*
	 * Store t->mm_cid_active before loading per-mm/cpu cid.
	 * Matches barrier in sched_mm_cid_remote_clear_old().
	 */
	smp_mb();
	t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm);
	rq_unlock_irqrestore(rq, &rf);
	rseq_set_notify_resume(t);
}

void sched_mm_cid_fork(struct task_struct *t)
{
	WARN_ON_ONCE(!t->mm || t->mm_cid != -1);
	t->mm_cid_active = 1;
}
#endif /* CONFIG_SCHED_MM_CID */