1. Overview
When the system enters suspend, or when processes are placed into a cgroup freezer group to be frozen or thawed, user processes and some kernel threads are frozen: they are taken off the CPU and no longer execute. Once thawed (or the system resumes), they run normally again.
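To make the cgroup case concrete, here is a minimal user-space sketch of driving the cgroup v1 freezer from C. The mount point /sys/fs/cgroup/freezer, the group name "demo", and the pid are assumptions for illustration; the freezer.state values FROZEN/THAWED are the real v1 freezer interface.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical helper: write a string to a cgroupfs control file. */
static void write_str(const char *path, const char *val)
{
    FILE *f = fopen(path, "w");
    if (!f) {
        perror(path);
        exit(1);
    }
    fputs(val, f);
    fclose(f);
}

int main(void)
{
    /* Assumes a v1 freezer group "demo" already exists under this mount. */
    write_str("/sys/fs/cgroup/freezer/demo/cgroup.procs", "1234");    /* move pid 1234 into the group */
    write_str("/sys/fs/cgroup/freezer/demo/freezer.state", "FROZEN"); /* freeze every task in the group */
    /* ... the group's tasks are now off the CPU ... */
    write_str("/sys/fs/cgroup/freezer/demo/freezer.state", "THAWED"); /* thaw them again */
    return 0;
}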
2. How Process Freezing and Thawing Work
2.1 Process freezing
The basic flow for freezing user processes and kernel threads:
A kernel thread can simply be woken up to freeze itself directly. A user process, however, can only be frozen when it returns to user space: if it were frozen while executing in the kernel, it might be holding locks at that moment, which could lead to deadlock. The kernel therefore implements freezing with a rather elegant trick. It first sets the TIF_SIGPENDING flag on the task without actually queuing any signal, then wakes the task up (kicking it with an IPI if it is currently running on another CPU). After the task finishes its in-kernel work, it returns to user space through ret_to_user, which checks whether any signal is pending. Because the pending flag was set earlier, the signal-handling path runs; there the task finds that the freezer's global freezing state is set, and it enters the freeze flow.
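Putting the pieces walked through below into call-graph form (ret_to_user and do_notify_resume are the arm64 names for the return-to-user path; other architectures differ):

    freeze_task(p)                               // freezer core
      -> fake_signal_wake_up(p)                  // user task: set TIF_SIGPENDING, no real signal queued
           -> signal_wake_up_state(p, 0)
                -> wake_up_state() / kick_process()  // wake the task, or IPI it if running on another CPU
    ... the task finishes its in-kernel work ...
    ret_to_user                                  // arch return-to-user path
      -> do_notify_resume()                      // sees TIF_SIGPENDING
           -> get_signal()
                -> try_to_freeze()
                     -> __refrigerator()         // the task parks here until thawed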
2.1.1 Freezing user processes
int freeze_processes(void)
{
    int error;

    error = __usermodehelper_disable(UMH_FREEZING);
    if (error)
        return error;

    /* Make sure this task doesn't get frozen */
    current->flags |= PF_SUSPEND_TASK;

    if (!pm_freezing)
        atomic_inc(&system_freezing_cnt);

    pm_wakeup_clear(0);
    pr_info("%s:Freezing user space processes ... ", STR_KERNEL_LOG_ENTER);
    pm_freezing = true;
    // true means only user processes are frozen in this pass
    error = try_to_freeze_tasks(true);
    if (!error) {
        __usermodehelper_set_disable_depth(UMH_DISABLED);
        pr_cont("done.");
    }
    pr_cont("\n");
    BUG_ON(in_atomic());

    /*
     * Now that the whole userspace is frozen we need to disable
     * the OOM killer to disallow any further interference with
     * killable tasks. There is no guarantee oom victims will
     * ever reach a point they go away we have to wait with a timeout.
     */
    if (!error && !oom_killer_disable(msecs_to_jiffies(freeze_timeout_msecs)))
        error = -EBUSY;

    if (error)
        thaw_processes();
    return error;
}
static int try_to_freeze_tasks(bool user_only)
{
    if (!user_only)
        freeze_workqueues_begin();

    while (true) {
        todo = 0;
        read_lock(&tasklist_lock);
        for_each_process_thread(g, p) {
            if (p == current || !freeze_task(p))
                continue;
            ......
        }
        ......
    }
    ......
}
bool freeze_task(struct task_struct *p)
{
    unsigned long flags;

    /*
     * This check can race with freezer_do_not_count, but worst case that
     * will result in an extra wakeup being sent to the task. It does not
     * race with freezer_count(), the barriers in freezer_count() and
     * freezer_should_skip() ensure that either freezer_count() sees
     * freezing == true in try_to_freeze() and freezes, or
     * freezer_should_skip() sees !PF_FREEZE_SKIP and freezes the task
     * normally.
     */
    // A task that has PF_FREEZER_SKIP set must not be frozen
    if (freezer_should_skip(p))
        return false;

    spin_lock_irqsave(&freezer_lock, flags);
    if (!freezing(p) || frozen(p)) {
        spin_unlock_irqrestore(&freezer_lock, flags);
        return false;
    }

    // A user process gets a fake signal first and freezes itself while handling
    // it on the way back to user space; freezing it in kernel mode could
    // deadlock if it happens to hold locks.
    // A kernel thread is simply woken from TASK_INTERRUPTIBLE so that it can
    // freeze itself directly.
    if (!(p->flags & PF_KTHREAD))
        fake_signal_wake_up(p);
    else
        wake_up_state(p, TASK_INTERRUPTIBLE);

    spin_unlock_irqrestore(&freezer_lock, flags);
    return true;
}
// kernel/kernel/signal.c
void signal_wake_up_state(struct task_struct *t, unsigned int state)
{
    // Set the TIF_SIGPENDING flag; it is picked up and handled in get_signal()
    set_tsk_thread_flag(t, TIF_SIGPENDING);
    /*
     * TASK_WAKEKILL also means wake it up in the stopped/traced/killable
     * case. We don't check t->state here because there is a race with it
     * executing another processor and just now entering stopped state.
     * By using wake_up_state, we ensure the process will wake up and
     * handle its death signal.
     */
    if (!wake_up_state(t, state | TASK_INTERRUPTIBLE))
        kick_process(t);
}
bool get_signal(struct ksignal *ksig)
{
    struct sighand_struct *sighand = current->sighand;
    struct signal_struct *signal = current->signal;
    int signr;

    if (unlikely(uprobe_deny_signal()))
        return false;

    /*
     * Do this once, we can't return to user-mode if freezing() == T.
     * do_signal_stop() and ptrace_stop() do freezable_schedule() and
     * thus do not need another check after return.
     */
    try_to_freeze();
    ......
}
static inline bool try_to_freeze(void)
{
    if (!(current->flags & PF_NOFREEZE))
        debug_check_no_locks_held();
    return try_to_freeze_unsafe();
}

static inline bool try_to_freeze_unsafe(void)
{
    might_sleep();
    if (likely(!freezing(current)))
        return false;
    return __refrigerator(false);
}
2.1.2 Freezing kernel threads
int freeze_kernel_threads(void)
{
    int error;

    pr_info("%s:Freezing remaining freezable tasks ... ", STR_KERNEL_LOG_ENTER);

    pm_nosig_freezing = true;
    // false means kernel threads are frozen as well; the flow is the same
    // try_to_freeze_tasks() shown in 2.1.1
    error = try_to_freeze_tasks(false);
    if (!error)
        pr_cont("done.");
    pr_cont("\n");
    BUG_ON(in_atomic());

    if (error)
        thaw_kernel_threads();
    return error;
}
A freezable kernel thread ultimately freezes itself via kthread_freezable_should_stop():
bool kthread_freezable_should_stop(bool *was_frozen)
{
    …
    if (unlikely(freezing(current)))
        frozen = __refrigerator(true);

    if (was_frozen)
        *was_frozen = frozen;

    return kthread_should_stop();
}
2.1.3 Summary
Freezing a task boils down to the following steps:
1) Set the task state to TASK_UNINTERRUPTIBLE, so the scheduler will no longer pick it to run
2) Set PF_FROZEN in task->flags to mark the task as frozen
3) Call schedule() to switch the task off the CPU; its register and stack context is saved in thread_info->cpu_context
4) When the task is thawed, it is scheduled again, breaks out of the for loop, continues executing, and its saved state (normally TASK_RUNNING) is restored
bool __refrigerator(bool check_kthr_stop)
{
    /* Hmm, should we be allowed to suspend when there are realtime
       processes around? */
    bool was_frozen = false;
    long save = current->state;

    pr_debug("%s entered refrigerator\n", current->comm);

    for (;;) {
        // Set the current task's state to TASK_UNINTERRUPTIBLE
        set_current_state(TASK_UNINTERRUPTIBLE);

        spin_lock_irq(&freezer_lock);
        // Set PF_FROZEN in current->flags to mark the task as frozen
        current->flags |= PF_FROZEN;
        if (!freezing(current) ||
            (check_kthr_stop && kthread_should_stop()))
            current->flags &= ~PF_FROZEN;
        trace_android_rvh_refrigerator(pm_nosig_freezing);
        spin_unlock_irq(&freezer_lock);

        if (!(current->flags & PF_FROZEN))
            break;
        was_frozen = true;
        // schedule() switches the task off the CPU so it no longer runs; the
        // register and stack context is saved in thread_info->cpu_context.
        // Execution parks here: when the task is later woken, it is rescheduled,
        // breaks out of the for loop, and continues below.
        schedule();
    }

    pr_debug("%s left refrigerator\n", current->comm);

    /*
     * Restore saved task state before returning. The mb'd version
     * needs to be used; otherwise, it might silently break
     * synchronization which depends on ordered task state change.
     */
    // On wakeup, restore the task's saved state
    set_current_state(save);

    return was_frozen;
}
EXPORT_SYMBOL(__refrigerator);
2.2 Thawing (waking) a process
Thawing goes through the scheduler's wakeup path: the task is made runnable again and its state is set back to TASK_RUNNING.
void __thaw_task(struct task_struct *p)
{
    unsigned long flags;
    const struct cpumask *mask = task_cpu_possible_mask(p);

    spin_lock_irqsave(&freezer_lock, flags);
    /*
     * Wake up frozen tasks. On asymmetric systems where tasks cannot
     * run on all CPUs, ttwu() may have deferred a wakeup generated
     * before thaw_secondary_cpus() had completed so we generate
     * additional wakeups here for tasks in the PF_FREEZER_SKIP state.
     */
    if (frozen(p) || (frozen_or_skipped(p) && mask != cpu_possible_mask))
        // Hand the task to the scheduler to wake it up
        wake_up_process(p);
    spin_unlock_irqrestore(&freezer_lock, flags);
}
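For context, __thaw_task() is invoked for every task from thaw_processes() on resume. An abridged sketch of that caller, based on mainline kernel/power/process.c (elided parts marked with ......):

void thaw_processes(void)
{
    struct task_struct *g, *p;

    if (pm_freezing)
        atomic_dec(&system_freezing_cnt);
    pm_freezing = false;
    pm_nosig_freezing = false;

    oom_killer_enable();
    ......
    read_lock(&tasklist_lock);
    for_each_process_thread(g, p) {
        ......
        __thaw_task(p);    /* wake each frozen task as shown above */
    }
    read_unlock(&tasklist_lock);
    ......
    schedule();
}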
wake_up_process() enqueues the task, marks it runnable by setting its state to TASK_RUNNING, and performs a wakeup-preemption check.
int wake_up_process(struct task_struct *p)
{
    WARN_ON(task_is_stopped_or_traced(p));
    return try_to_wake_up(p, TASK_NORMAL, 0);
}
static int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
    unsigned long flags;
    int cpu, success = 0;

    /*
     * If we are going to wake up a thread waiting for CONDITION we
     * need to ensure that CONDITION=1 done by the caller can not be
     * reordered with p->state check below. This pairs with mb() in
     * set_current_state() the waiting thread does.
     */
    smp_mb__before_spinlock();
    raw_spin_lock_irqsave(&p->pi_lock, flags);
    if (!(p->state & state))
        goto out;

    success = 1; /* we're going to change ->state */
    cpu = task_cpu(p);

    /* A memory barrier guarantees we read an up-to-date p->on_rq. If the task
       is already on a runqueue (i.e. already runnable/running) and was never
       fully descheduled, ttwu_remote() only needs to flip its state back to
       TASK_RUNNING, so it simply stays on the runqueue. In that case we skip
       the rest of the wakeup and just update the scheduler statistics. */
    if (p->on_rq && ttwu_remote(p, wake_flags))
        goto stat;

#ifdef CONFIG_SMP
    /* Wait until the task has finished being scheduled out on its old CPU */
    while (p->on_cpu)
        cpu_relax();
    /*
     * Pairs with the smp_wmb() in finish_lock_switch().
     */
    smp_rmb();

    p->sched_contributes_to_load = !!task_contributes_to_load(p);
    p->state = TASK_WAKING;

    /* Invoke the task_waking callback of the task's scheduling class, if any */
    if (p->sched_class->task_waking)
        p->sched_class->task_waking(p);

    /* Pick a suitable CPU for p based on its parameters and the system state */
    cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
    /* If the chosen CPU differs from the one p is currently on, mark the
       wakeup as a migration and move p to the new CPU */
    if (task_cpu(p) != cpu) {
        wake_flags |= WF_MIGRATED;
        set_task_cpu(p, cpu);
    }
#endif /* CONFIG_SMP */

    /* Enqueue p, mark it runnable, and check for wakeup preemption */
    ttwu_queue(p, cpu);
stat:
    /* Update scheduler statistics */
    ttwu_stat(p, cpu, wake_flags);
out:
    raw_spin_unlock_irqrestore(&p->pi_lock, flags);

    return success;
}
static void ttwu_queue(struct task_struct *p, int cpu)
{
    struct rq *rq = cpu_rq(cpu);

#if defined(CONFIG_SMP)
    if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
        sched_clock_cpu(cpu); /* sync clocks x-cpu */
        ttwu_queue_remote(p, cpu);
        return;
    }
#endif

    raw_spin_lock(&rq->lock);
    ttwu_do_activate(rq, p, 0);
    raw_spin_unlock(&rq->lock);
}
static void ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
{
#ifdef CONFIG_SMP
    if (p->sched_contributes_to_load)
        rq->nr_uninterruptible--;
#endif

    // Put p on the runqueue rq
    ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
    // Mark the task runnable and handle wakeup preemption
    ttwu_do_wakeup(rq, p, wake_flags);
}
static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
{
    activate_task(rq, p, en_flags);
    p->on_rq = TASK_ON_RQ_QUEUED;

    /* if a worker is waking up, notify workqueue */
    if (p->flags & PF_WQ_WORKER)
        wq_worker_waking_up(p, cpu_of(rq));
}

static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
    update_rq_clock(rq);
    sched_info_queued(rq, p);
    p->sched_class->enqueue_task(rq, p, flags);
}

void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
    if (task_contributes_to_load(p))
        rq->nr_uninterruptible--;

    enqueue_task(rq, p, flags);
}
// Mark the task runnable and perform the wakeup-preemption check
static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
    check_preempt_curr(rq, p, wake_flags);
    trace_sched_wakeup(p, true);

    // Set p's state to TASK_RUNNING
    p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
    if (p->sched_class->task_woken)
        p->sched_class->task_woken(rq, p);

    if (rq->idle_stamp) {
        u64 delta = rq_clock(rq) - rq->idle_stamp;
        u64 max = 2*rq->max_idle_balance_cost;

        update_avg(&rq->avg_idle, delta);

        if (rq->avg_idle > max)
            rq->avg_idle = max;

        rq->idle_stamp = 0;
    }
#endif
}
/* The enqueue_task method is called before nr_running is incremented. Here we
   update the fair-scheduling statistics and then put p's scheduling entity
   into the rbtree. */
static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
    struct cfs_rq *cfs_rq;
    struct sched_entity *se = &p->se;

    for_each_sched_entity(se) {
        if (se->on_rq)
            break;
        cfs_rq = cfs_rq_of(se);
        // Enqueue the scheduling entity on its cfs_rq
        enqueue_entity(cfs_rq, se, flags);

        /*
         * end evaluation on encountering a throttled cfs_rq
         *
         * note: in the case of encountering a throttled cfs_rq we will
         * post the final h_nr_running increment below.
         */
        if (cfs_rq_throttled(cfs_rq))
            break;
        cfs_rq->h_nr_running++;

        flags = ENQUEUE_WAKEUP;
    }

    for_each_sched_entity(se) {
        cfs_rq = cfs_rq_of(se);
        cfs_rq->h_nr_running++;

        if (cfs_rq_throttled(cfs_rq))
            break;

        // Update the cfs_rq's group weight (shares)
        update_cfs_shares(cfs_rq);
        // Update the scheduling entity's load average
        update_entity_load_avg(se, 1);
    }

    if (!se) {
        update_rq_runnable_avg(rq, rq->nr_running);
        add_nr_running(rq, 1);
    }
    hrtick_update(rq);
}