sst-linux/include/linux/sched/task.h
Shakeel Butt 28e51dd4f2 cgroup: fix race between fork and cgroup.kill
commit b69bb476dee99d564d65d418e9a20acca6f32c3f upstream.

Tejun reported the following race between fork() and cgroup.kill at [1].

Tejun:
  I was looking at cgroup.kill implementation and wondering whether there
  could be a race window. So, __cgroup_kill() does the following:

   k1. Set CGRP_KILL.
   k2. Iterate tasks and deliver SIGKILL.
   k3. Clear CGRP_KILL.

  The copy_process() does the following:

   c1. Copy a bunch of stuff.
   c2. Grab siglock.
   c3. Check fatal_signal_pending().
   c4. Commit to forking.
   c5. Release siglock.
   c6. Call cgroup_post_fork() which puts the task on the css_set and tests
       CGRP_KILL.

  The intention seems to be that either a forking task gets SIGKILL and
  terminates on c3 or it sees CGRP_KILL on c6 and kills the child. However, I
  don't see what guarantees that k3 can't happen before c6. ie. After a
  forking task passes c5, k2 can take place and then before the forking task
  reaches c6, k3 can happen. Then, nobody would send SIGKILL to the child.
  What am I missing?

This is indeed a race. One way to close it would be to take
cgroup_threadgroup_rwsem in write mode in __cgroup_kill(), since the
fork() side takes cgroup_threadgroup_rwsem in read mode from
cgroup_can_fork() to cgroup_post_fork(). However, that would be
heavy-handed, as it adds one more potential stall scenario for
cgroup.kill, which is usually invoked in extreme situations such as
memory pressure.

To fix this race, let's maintain a per-cgroup sequence number which
gets incremented on each __cgroup_kill() call. On the fork() side,
cgroup_can_fork() caches the sequence number locally and rechecks it
against the cgroup's sequence number in cgroup_post_fork(). If the
numbers differ, __cgroup_kill() has been called in between and we
should send SIGKILL to the newly created task.
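
In short, the transient CGRP_KILL check at c6 is replaced by a
monotonic counter that a concurrent k3 cannot reset. A minimal sketch
of the pattern (simplified; the exact locking in the patch differs):

  /* __cgroup_kill(): k1 becomes an increment, so k3 cannot undo it. */
  cgrp->kill_seq++;
  /* ... iterate tasks and deliver SIGKILL (k2) ... */

  /* cgroup_can_fork() path: snapshot the counter before committing (c4). */
  kargs->kill_seq = cgrp->kill_seq;

  /* cgroup_post_fork() (c6): a mismatch means a kill ran in between. */
  if (cgrp->kill_seq != kargs->kill_seq)
          do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID);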

Reported-by: Tejun Heo <tj@kernel.org>
Closes: https://lore.kernel.org/all/Z5QHE2Qn-QZ6M-KW@slm.duckdns.org/ [1]
Fixes: 661ee62809 ("cgroup: introduce cgroup.kill")
Cc: stable@vger.kernel.org # v5.14+
Signed-off-by: Shakeel Butt <shakeel.butt@linux.dev>
Reviewed-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2025-02-21 13:50:04 +01:00

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_TASK_H
#define _LINUX_SCHED_TASK_H
/*
 * Interface between the scheduler and various task lifetime (fork()/exit())
 * functionality:
 */
#include <linux/sched.h>
#include <linux/uaccess.h>

struct task_struct;
struct rusage;
union thread_union;
struct css_set;

/* All the bits taken by the old clone syscall. */
#define CLONE_LEGACY_FLAGS 0xffffffffULL
struct kernel_clone_args {
	u64 flags;
	int __user *pidfd;
	int __user *child_tid;
	int __user *parent_tid;
	int exit_signal;
	unsigned long stack;
	unsigned long stack_size;
	unsigned long tls;
	pid_t *set_tid;
	/* Number of elements in *set_tid */
	size_t set_tid_size;
	int cgroup;
	int io_thread;
	int kthread;
	int idle;
	int (*fn)(void *);
	void *fn_arg;
	struct cgroup *cgrp;
	struct css_set *cset;
	unsigned int kill_seq;
};
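
/*
 * Illustrative example (not part of this header): in-kernel users fill
 * kernel_clone_args with designated initializers and pass it to
 * kernel_clone(), roughly as kernel_thread() does in kernel/fork.c:
 *
 *	struct kernel_clone_args args = {
 *		.flags		= ((lower_32_bits(flags) | CLONE_VM |
 *				    CLONE_UNTRACED) & ~CSIGNAL),
 *		.exit_signal	= (lower_32_bits(flags) & CSIGNAL),
 *		.fn		= fn,
 *		.fn_arg		= arg,
 *		.kthread	= 1,
 *	};
 *
 *	return kernel_clone(&args);
 */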
/*
 * This serializes "schedule()" and also protects
 * the run-queue from deletions/modifications (but
 * _adding_ to the beginning of the run-queue has
 * a separate lock).
 */
extern rwlock_t tasklist_lock;
extern spinlock_t mmlist_lock;
extern union thread_union init_thread_union;
extern struct task_struct init_task;
extern int lockdep_tasklist_lock_is_held(void);
extern asmlinkage void schedule_tail(struct task_struct *prev);
extern void init_idle(struct task_struct *idle, int cpu);
extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs);
extern void sched_post_fork(struct task_struct *p);
extern void sched_dead(struct task_struct *p);
void __noreturn do_task_dead(void);
void __noreturn make_task_dead(int signr);
extern void mm_cache_init(void);
extern void proc_caches_init(void);
extern void fork_init(void);
extern void release_task(struct task_struct * p);
extern int copy_thread(struct task_struct *, const struct kernel_clone_args *);
extern void flush_thread(void);
#ifdef CONFIG_HAVE_EXIT_THREAD
extern void exit_thread(struct task_struct *tsk);
#else
static inline void exit_thread(struct task_struct *tsk)
{
}
#endif
extern __noreturn void do_group_exit(int);
extern void exit_files(struct task_struct *);
extern void exit_itimers(struct task_struct *);
extern pid_t kernel_clone(struct kernel_clone_args *kargs);
struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node);
struct task_struct *fork_idle(int);
extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
extern pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags);
extern long kernel_wait4(pid_t, int __user *, int, struct rusage *);
int kernel_wait(pid_t pid, int *stat);
extern void free_task(struct task_struct *tsk);
/* sched_exec is called by processes performing an exec */
#ifdef CONFIG_SMP
extern void sched_exec(void);
#else
#define sched_exec() {}
#endif
static inline struct task_struct *get_task_struct(struct task_struct *t)
{
	refcount_inc(&t->usage);
	return t;
}
extern void __put_task_struct(struct task_struct *t);
extern void __put_task_struct_rcu_cb(struct rcu_head *rhp);
static inline void put_task_struct(struct task_struct *t)
{
	if (!refcount_dec_and_test(&t->usage))
		return;

	/*
	 * under PREEMPT_RT, we can't call put_task_struct
	 * in atomic context because it will indirectly
	 * acquire sleeping locks.
	 *
	 * call_rcu() will schedule delayed_put_task_struct_rcu()
	 * to be called in process context.
	 *
	 * __put_task_struct() is called when
	 * refcount_dec_and_test(&t->usage) succeeds.
	 *
	 * This means that it can't "conflict" with
	 * put_task_struct_rcu_user() which abuses ->rcu the same
	 * way; rcu_users has a reference so task->usage can't be
	 * zero after rcu_users 1 -> 0 transition.
	 *
	 * delayed_free_task() also uses ->rcu, but it is only called
	 * when it fails to fork a process. Therefore, there is no
	 * way it can conflict with put_task_struct().
	 */
	if (IS_ENABLED(CONFIG_PREEMPT_RT) && !preemptible())
		call_rcu(&t->rcu, __put_task_struct_rcu_cb);
	else
		__put_task_struct(t);
}
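
/*
 * Illustrative usage (not part of this header): every get_task_struct()
 * must be balanced by a put_task_struct() once the reference is no
 * longer needed, e.g.:
 *
 *	struct task_struct *t = get_task_struct(current);
 *	... use t, possibly after dropping the locks that pinned it ...
 *	put_task_struct(t);
 *
 * As the code above shows, on PREEMPT_RT the final put may be deferred
 * to process context via call_rcu() when the caller is not preemptible.
 */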
DEFINE_FREE(put_task, struct task_struct *, if (_T) put_task_struct(_T))
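
/*
 * Illustrative usage of the DEFINE_FREE() helper above (not part of
 * this header): with the __free() attribute from <linux/cleanup.h>,
 * the reference is dropped automatically when the variable goes out
 * of scope, including on early returns:
 *
 *	struct task_struct *t __free(put_task) = get_task_struct(p);
 *	if (!target_ok(t))		// target_ok() is hypothetical
 *		return -EINVAL;		// put_task_struct(t) runs here
 */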
static inline void put_task_struct_many(struct task_struct *t, int nr)
{
	if (refcount_sub_and_test(nr, &t->usage))
		__put_task_struct(t);
}
void put_task_struct_rcu_user(struct task_struct *task);
/* Free all architecture-specific resources held by a thread. */
void release_thread(struct task_struct *dead_task);
#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
extern int arch_task_struct_size __read_mostly;
#else
# define arch_task_struct_size (sizeof(struct task_struct))
#endif
#ifndef CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST
/*
 * If an architecture has not declared a thread_struct whitelist we
 * must assume something there may need to be copied to userspace.
 */
static inline void arch_thread_struct_whitelist(unsigned long *offset,
						unsigned long *size)
{
	*offset = 0;
	/* Handle dynamically sized thread_struct. */
	*size = arch_task_struct_size - offsetof(struct task_struct, thread);
}
#endif
#ifdef CONFIG_VMAP_STACK
static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
{
	return t->stack_vm_area;
}
#else
static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t)
{
	return NULL;
}
#endif
/*
 * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
 * subscriptions and synchronises with wait4(). Also used in procfs. Also
 * pins the final release of task.io_context. Also protects ->cpuset and
 * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist.
 *
 * Nests both inside and outside of read_lock(&tasklist_lock).
 * It must not be nested with write_lock_irq(&tasklist_lock),
 * neither inside nor outside.
 */
static inline void task_lock(struct task_struct *p)
{
	spin_lock(&p->alloc_lock);
}
static inline void task_unlock(struct task_struct *p)
{
	spin_unlock(&p->alloc_lock);
}
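
/*
 * Illustrative usage (not part of this header): task_lock() stabilizes
 * the fields listed above; e.g. taking a reference on another task's mm
 * follows roughly the pattern of get_task_mm() in kernel/fork.c:
 *
 *	task_lock(task);
 *	mm = task->mm;
 *	if (mm)
 *		mmget(mm);
 *	task_unlock(task);
 */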
#endif /* _LINUX_SCHED_TASK_H */