From f9c82a4ea89c384d49ce03768ba88d049ed3f1f0 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 22 Apr 2021 14:27:08 +0200 Subject: Increase size of ucounts to atomic_long_t RLIMIT_MSGQUEUE and RLIMIT_MEMLOCK use unsigned long to store their counters. As a preparation for moving rlimits based on ucounts, we need to increase the size of the variable to long. Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/257aa5fb1a7d81cf0f4c34f39ada2320c4284771.1619094428.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- kernel/ucount.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/ucount.c b/kernel/ucount.c index 11b1596e2542..04c561751af1 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -175,14 +175,14 @@ static void put_ucounts(struct ucounts *ucounts) kfree(ucounts); } -static inline bool atomic_inc_below(atomic_t *v, int u) +static inline bool atomic_long_inc_below(atomic_long_t *v, int u) { - int c, old; - c = atomic_read(v); + long c, old; + c = atomic_long_read(v); for (;;) { if (unlikely(c >= u)) return false; - old = atomic_cmpxchg(v, c, c+1); + old = atomic_long_cmpxchg(v, c, c+1); if (likely(old == c)) return true; c = old; @@ -196,17 +196,17 @@ struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, struct user_namespace *tns; ucounts = get_ucounts(ns, uid); for (iter = ucounts; iter; iter = tns->ucounts) { - int max; + long max; tns = iter->ns; max = READ_ONCE(tns->ucount_max[type]); - if (!atomic_inc_below(&iter->ucount[type], max)) + if (!atomic_long_inc_below(&iter->ucount[type], max)) goto fail; } return ucounts; fail: bad = iter; for (iter = ucounts; iter != bad; iter = iter->ns->ucounts) - atomic_dec(&iter->ucount[type]); + atomic_long_dec(&iter->ucount[type]); put_ucounts(ucounts); return NULL; @@ -216,7 +216,7 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type) { struct ucounts *iter; for (iter = ucounts; iter; iter = iter->ns->ucounts) { - int dec = atomic_dec_if_positive(&iter->ucount[type]); + long dec = atomic_long_dec_if_positive(&iter->ucount[type]); WARN_ON_ONCE(dec < 0); } put_ucounts(ucounts); -- cgit v1.2.3 From 905ae01c4ae2ae3df05bb141801b1db4b7d83c61 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 22 Apr 2021 14:27:09 +0200 Subject: Add a reference to ucounts for each cred For RLIMIT_NPROC and some other rlimits the user_struct that holds the global limit is kept alive for the lifetime of a process by keeping it in struct cred. Adding a pointer to ucounts in the struct cred will allow to track RLIMIT_NPROC not only for user in the system, but for user in the user_namespace. Updating ucounts may require memory allocation which may fail. So, we cannot change cred.ucounts in the commit_creds() because this function cannot fail and it should always return 0. For this reason, we modify cred.ucounts before calling the commit_creds(). Changelog v6: * Fix null-ptr-deref in is_ucounts_overlimit() detected by trinity. This error was caused by the fact that cred_alloc_blank() left the ucounts pointer empty. Reported-by: kernel test robot Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/b37aaef28d8b9b0d757e07ba6dd27281bbe39259.1619094428.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- kernel/cred.c | 40 ++++++++++++++++++++++++++++++++++++++++ kernel/fork.c | 6 ++++++ kernel/sys.c | 12 ++++++++++++ kernel/ucount.c | 40 +++++++++++++++++++++++++++++++++++++--- kernel/user_namespace.c | 3 +++ 5 files changed, 98 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 421b1149c651..58a8a9e24347 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -60,6 +60,7 @@ struct cred init_cred = { .user = INIT_USER, .user_ns = &init_user_ns, .group_info = &init_groups, + .ucounts = &init_ucounts, }; static inline void set_cred_subscribers(struct cred *cred, int n) @@ -119,6 +120,8 @@ static void put_cred_rcu(struct rcu_head *rcu) if (cred->group_info) put_group_info(cred->group_info); free_uid(cred->user); + if (cred->ucounts) + put_ucounts(cred->ucounts); put_user_ns(cred->user_ns); kmem_cache_free(cred_jar, cred); } @@ -222,6 +225,7 @@ struct cred *cred_alloc_blank(void) #ifdef CONFIG_DEBUG_CREDENTIALS new->magic = CRED_MAGIC; #endif + new->ucounts = get_ucounts(&init_ucounts); if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0) goto error; @@ -284,6 +288,11 @@ struct cred *prepare_creds(void) if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) goto error; + + new->ucounts = get_ucounts(new->ucounts); + if (!new->ucounts) + goto error; + validate_creds(new); return new; @@ -363,6 +372,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) ret = create_user_ns(new); if (ret < 0) goto error_put; + if (set_cred_ucounts(new) < 0) + goto error_put; } #ifdef CONFIG_KEYS @@ -653,6 +664,31 @@ int cred_fscmp(const struct cred *a, const struct cred *b) } EXPORT_SYMBOL(cred_fscmp); +int set_cred_ucounts(struct cred *new) +{ + struct task_struct *task = current; + const struct cred *old = task->real_cred; + struct ucounts *old_ucounts = new->ucounts; + + if (new->user == old->user && new->user_ns == old->user_ns) + return 0; + + /* + * This optimization is needed because alloc_ucounts() uses locks + * for table lookups. + */ + if (old_ucounts && old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->euid)) + return 0; + + if (!(new->ucounts = alloc_ucounts(new->user_ns, new->euid))) + return -EAGAIN; + + if (old_ucounts) + put_ucounts(old_ucounts); + + return 0; +} + /* * initialise the credentials stuff */ @@ -719,6 +755,10 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) goto error; + new->ucounts = get_ucounts(new->ucounts); + if (!new->ucounts) + goto error; + put_cred(old); validate_creds(new); return new; diff --git a/kernel/fork.c b/kernel/fork.c index 426cd0c51f9e..321a5e31d817 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2995,6 +2995,12 @@ int ksys_unshare(unsigned long unshare_flags) if (err) goto bad_unshare_cleanup_cred; + if (new_cred) { + err = set_cred_ucounts(new_cred); + if (err) + goto bad_unshare_cleanup_cred; + } + if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) { if (do_sysvsem) { /* diff --git a/kernel/sys.c b/kernel/sys.c index 2e2e3f378d97..cabfc5b86175 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -552,6 +552,10 @@ long __sys_setreuid(uid_t ruid, uid_t euid) if (retval < 0) goto error; + retval = set_cred_ucounts(new); + if (retval < 0) + goto error; + return commit_creds(new); error: @@ -610,6 +614,10 @@ long __sys_setuid(uid_t uid) if (retval < 0) goto error; + retval = set_cred_ucounts(new); + if (retval < 0) + goto error; + return commit_creds(new); error: @@ -685,6 +693,10 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) if (retval < 0) goto error; + retval = set_cred_ucounts(new); + if (retval < 0) + goto error; + return commit_creds(new); error: diff --git a/kernel/ucount.c b/kernel/ucount.c index 04c561751af1..50cc1dfb7d28 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -8,6 +8,12 @@ #include #include +struct ucounts init_ucounts = { + .ns = &init_user_ns, + .uid = GLOBAL_ROOT_UID, + .count = 1, +}; + #define UCOUNTS_HASHTABLE_BITS 10 static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)]; static DEFINE_SPINLOCK(ucounts_lock); @@ -125,7 +131,15 @@ static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struc return NULL; } -static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid) +static void hlist_add_ucounts(struct ucounts *ucounts) +{ + struct hlist_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid); + spin_lock_irq(&ucounts_lock); + hlist_add_head(&ucounts->node, hashent); + spin_unlock_irq(&ucounts_lock); +} + +struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) { struct hlist_head *hashent = ucounts_hashentry(ns, uid); struct ucounts *ucounts, *new; @@ -160,7 +174,26 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid) return ucounts; } -static void put_ucounts(struct ucounts *ucounts) +struct ucounts *get_ucounts(struct ucounts *ucounts) +{ + unsigned long flags; + + if (!ucounts) + return NULL; + + spin_lock_irqsave(&ucounts_lock, flags); + if (ucounts->count == INT_MAX) { + WARN_ONCE(1, "ucounts: counter has reached its maximum value"); + ucounts = NULL; + } else { + ucounts->count += 1; + } + spin_unlock_irqrestore(&ucounts_lock, flags); + + return ucounts; +} + +void put_ucounts(struct ucounts *ucounts) { unsigned long flags; @@ -194,7 +227,7 @@ struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, { struct ucounts *ucounts, *iter, *bad; struct user_namespace *tns; - ucounts = get_ucounts(ns, uid); + ucounts = alloc_ucounts(ns, uid); for (iter = ucounts; iter; iter = tns->ucounts) { long max; tns = iter->ns; @@ -237,6 +270,7 @@ static __init int user_namespace_sysctl_init(void) BUG_ON(!user_header); BUG_ON(!setup_userns_sysctls(&init_user_ns)); #endif + hlist_add_ucounts(&init_ucounts); return 0; } subsys_initcall(user_namespace_sysctl_init); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 9a4b980d695b..f1b7b4b8ffa2 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -1340,6 +1340,9 @@ static int userns_install(struct nsset *nsset, struct ns_common *ns) put_user_ns(cred->user_ns); set_cred_user_ns(cred, get_user_ns(user_ns)); + if (set_cred_ucounts(cred) < 0) + return -EINVAL; + return 0; } -- cgit v1.2.3 From b6c336528926ef73b0f70260f2636de2c3b94c14 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 22 Apr 2021 14:27:10 +0200 Subject: Use atomic_t for ucounts reference counting The current implementation of the ucounts reference counter requires the use of spin_lock. We're going to use get_ucounts() in more performance critical areas like a handling of RLIMIT_SIGPENDING. Now we need to use spin_lock only if we want to change the hashtable. v10: * Always try to put ucounts in case we cannot increase ucounts->count. This will allow to cover the case when all consumers will return ucounts at once. v9: * Use a negative value to check that the ucounts->count is close to overflow. Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/94d1dbecab060a6b116b0a2d1accd8ca1bbb4f5f.1619094428.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- kernel/ucount.c | 53 +++++++++++++++++++---------------------------------- 1 file changed, 19 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/ucount.c b/kernel/ucount.c index 50cc1dfb7d28..365865f368ec 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -11,7 +11,7 @@ struct ucounts init_ucounts = { .ns = &init_user_ns, .uid = GLOBAL_ROOT_UID, - .count = 1, + .count = ATOMIC_INIT(1), }; #define UCOUNTS_HASHTABLE_BITS 10 @@ -139,6 +139,15 @@ static void hlist_add_ucounts(struct ucounts *ucounts) spin_unlock_irq(&ucounts_lock); } +struct ucounts *get_ucounts(struct ucounts *ucounts) +{ + if (ucounts && atomic_add_negative(1, &ucounts->count)) { + put_ucounts(ucounts); + ucounts = NULL; + } + return ucounts; +} + struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) { struct hlist_head *hashent = ucounts_hashentry(ns, uid); @@ -155,7 +164,7 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) new->ns = ns; new->uid = uid; - new->count = 0; + atomic_set(&new->count, 1); spin_lock_irq(&ucounts_lock); ucounts = find_ucounts(ns, uid, hashent); @@ -163,33 +172,12 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid) kfree(new); } else { hlist_add_head(&new->node, hashent); - ucounts = new; + spin_unlock_irq(&ucounts_lock); + return new; } } - if (ucounts->count == INT_MAX) - ucounts = NULL; - else - ucounts->count += 1; spin_unlock_irq(&ucounts_lock); - return ucounts; -} - -struct ucounts *get_ucounts(struct ucounts *ucounts) -{ - unsigned long flags; - - if (!ucounts) - return NULL; - - spin_lock_irqsave(&ucounts_lock, flags); - if (ucounts->count == INT_MAX) { - WARN_ONCE(1, "ucounts: counter has reached its maximum value"); - ucounts = NULL; - } else { - ucounts->count += 1; - } - spin_unlock_irqrestore(&ucounts_lock, flags); - + ucounts = get_ucounts(ucounts); return ucounts; } @@ -197,15 +185,12 @@ void put_ucounts(struct ucounts *ucounts) { unsigned long flags; - spin_lock_irqsave(&ucounts_lock, flags); - ucounts->count -= 1; - if (!ucounts->count) + if (atomic_dec_and_test(&ucounts->count)) { + spin_lock_irqsave(&ucounts_lock, flags); hlist_del_init(&ucounts->node); - else - ucounts = NULL; - spin_unlock_irqrestore(&ucounts_lock, flags); - - kfree(ucounts); + spin_unlock_irqrestore(&ucounts_lock, flags); + kfree(ucounts); + } } static inline bool atomic_long_inc_below(atomic_long_t *v, int u) -- cgit v1.2.3 From 21d1c5e386bc751f1953b371d72cd5b7d9c9e270 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 22 Apr 2021 14:27:11 +0200 Subject: Reimplement RLIMIT_NPROC on top of ucounts The rlimit counter is tied to uid in the user_namespace. This allows rlimit values to be specified in userns even if they are already globally exceeded by the user. However, the value of the previous user_namespaces cannot be exceeded. To illustrate the impact of rlimits, let's say there is a program that does not fork. Some service-A wants to run this program as user X in multiple containers. Since the program never fork the service wants to set RLIMIT_NPROC=1. service-A \- program (uid=1000, container1, rlimit_nproc=1) \- program (uid=1000, container2, rlimit_nproc=1) The service-A sets RLIMIT_NPROC=1 and runs the program in container1. When the service-A tries to run a program with RLIMIT_NPROC=1 in container2 it fails since user X already has one running process. We cannot use existing inc_ucounts / dec_ucounts because they do not allow us to exceed the maximum for the counter. Some rlimits can be overlimited by root or if the user has the appropriate capability. Changelog v11: * Change inc_rlimit_ucounts() which now returns top value of ucounts. * Drop inc_rlimit_ucounts_and_test() because the return code of inc_rlimit_ucounts() can be checked. Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/c5286a8aa16d2d698c222f7532f3d735c82bc6bc.1619094428.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- kernel/cred.c | 10 +++++----- kernel/exit.c | 2 +- kernel/fork.c | 9 +++++---- kernel/sys.c | 2 +- kernel/ucount.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ kernel/user.c | 1 - kernel/user_namespace.c | 3 ++- 7 files changed, 58 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 58a8a9e24347..dcfa30b337c5 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -360,7 +360,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) kdebug("share_creds(%p{%d,%d})", p->cred, atomic_read(&p->cred->usage), read_cred_subscribers(p->cred)); - atomic_inc(&p->cred->user->processes); + inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); return 0; } @@ -395,8 +395,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) } #endif - atomic_inc(&new->user->processes); p->cred = p->real_cred = get_cred(new); + inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); alter_cred_subscribers(new, 2); validate_creds(new); return 0; @@ -496,12 +496,12 @@ int commit_creds(struct cred *new) * in set_user(). */ alter_cred_subscribers(new, 2); - if (new->user != old->user) - atomic_inc(&new->user->processes); + if (new->user != old->user || new->user_ns != old->user_ns) + inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1); rcu_assign_pointer(task->real_cred, new); rcu_assign_pointer(task->cred, new); if (new->user != old->user) - atomic_dec(&old->user->processes); + dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1); alter_cred_subscribers(old, -2); /* send notifications */ diff --git a/kernel/exit.c b/kernel/exit.c index 04029e35e69a..61c0fe902b50 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -188,7 +188,7 @@ repeat: /* don't need to get the RCU readlock here - the process is dead and * can't be modifying its own credentials. But shut RCU-lockdep up */ rcu_read_lock(); - atomic_dec(&__task_cred(p)->user->processes); + dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); rcu_read_unlock(); cgroup_release(p); diff --git a/kernel/fork.c b/kernel/fork.c index 321a5e31d817..ed7dfb07178d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -819,9 +819,11 @@ void __init fork_init(void) init_task.signal->rlim[RLIMIT_SIGPENDING] = init_task.signal->rlim[RLIMIT_NPROC]; - for (i = 0; i < UCOUNT_COUNTS; i++) + for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) init_user_ns.ucount_max[i] = max_threads/2; + init_user_ns.ucount_max[UCOUNT_RLIMIT_NPROC] = task_rlimit(&init_task, RLIMIT_NPROC); + #ifdef CONFIG_VMAP_STACK cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", NULL, free_vm_stack_cache); @@ -1978,8 +1980,7 @@ static __latent_entropy struct task_struct *copy_process( DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif retval = -EAGAIN; - if (atomic_read(&p->real_cred->user->processes) >= - task_rlimit(p, RLIMIT_NPROC)) { + if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { if (p->real_cred->user != INIT_USER && !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) goto bad_fork_free; @@ -2382,7 +2383,7 @@ bad_fork_cleanup_threadgroup_lock: #endif delayacct_tsk_free(p); bad_fork_cleanup_count: - atomic_dec(&p->cred->user->processes); + dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); exit_creds(p); bad_fork_free: p->state = TASK_DEAD; diff --git a/kernel/sys.c b/kernel/sys.c index cabfc5b86175..00266a65a000 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -473,7 +473,7 @@ static int set_user(struct cred *new) * for programs doing set*uid()+execve() by harmlessly deferring the * failure to the execve() stage. */ - if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) && + if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) && new_user != INIT_USER) current->flags |= PF_NPROC_EXCEEDED; else diff --git a/kernel/ucount.c b/kernel/ucount.c index 365865f368ec..6caa56f7dec8 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -80,6 +80,7 @@ static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_inotify_instances"), UCOUNT_ENTRY("max_inotify_watches"), #endif + { }, { } }; #endif /* CONFIG_SYSCTL */ @@ -240,6 +241,48 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type) put_ucounts(ucounts); } +long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v) +{ + struct ucounts *iter; + long ret = 0; + + for (iter = ucounts; iter; iter = iter->ns->ucounts) { + long max = READ_ONCE(iter->ns->ucount_max[type]); + long new = atomic_long_add_return(v, &iter->ucount[type]); + if (new < 0 || new > max) + ret = LONG_MAX; + else if (iter == ucounts) + ret = new; + } + return ret; +} + +bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v) +{ + struct ucounts *iter; + long new; + for (iter = ucounts; iter; iter = iter->ns->ucounts) { + long dec = atomic_long_add_return(-v, &iter->ucount[type]); + WARN_ON_ONCE(dec < 0); + if (iter == ucounts) + new = dec; + } + return (new == 0); +} + +bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max) +{ + struct ucounts *iter; + if (get_ucounts_value(ucounts, type) > max) + return true; + for (iter = ucounts; iter; iter = iter->ns->ucounts) { + max = READ_ONCE(iter->ns->ucount_max[type]); + if (get_ucounts_value(iter, type) > max) + return true; + } + return false; +} + static __init int user_namespace_sysctl_init(void) { #ifdef CONFIG_SYSCTL @@ -256,6 +299,7 @@ static __init int user_namespace_sysctl_init(void) BUG_ON(!setup_userns_sysctls(&init_user_ns)); #endif hlist_add_ucounts(&init_ucounts); + inc_rlimit_ucounts(&init_ucounts, UCOUNT_RLIMIT_NPROC, 1); return 0; } subsys_initcall(user_namespace_sysctl_init); diff --git a/kernel/user.c b/kernel/user.c index a2478cddf536..7f5ff498207a 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -98,7 +98,6 @@ static DEFINE_SPINLOCK(uidhash_lock); /* root_user.__count is 1, for init task cred */ struct user_struct root_user = { .__count = REFCOUNT_INIT(1), - .processes = ATOMIC_INIT(1), .sigpending = ATOMIC_INIT(0), .locked_shm = 0, .uid = GLOBAL_ROOT_UID, diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index f1b7b4b8ffa2..e6577c835072 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -119,9 +119,10 @@ int create_user_ns(struct cred *new) ns->owner = owner; ns->group = group; INIT_WORK(&ns->work, free_user_ns); - for (i = 0; i < UCOUNT_COUNTS; i++) { + for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) { ns->ucount_max[i] = INT_MAX; } + ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC); ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ -- cgit v1.2.3 From 6e52a9f0532f912af37bab4caf18b57d1b9845f4 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 22 Apr 2021 14:27:12 +0200 Subject: Reimplement RLIMIT_MSGQUEUE on top of ucounts The rlimit counter is tied to uid in the user_namespace. This allows rlimit values to be specified in userns even if they are already globally exceeded by the user. However, the value of the previous user_namespaces cannot be exceeded. Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/2531f42f7884bbfee56a978040b3e0d25cdf6cde.1619094428.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- kernel/fork.c | 1 + kernel/ucount.c | 1 + kernel/user_namespace.c | 1 + 3 files changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index ed7dfb07178d..a9c5097dfc86 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -823,6 +823,7 @@ void __init fork_init(void) init_user_ns.ucount_max[i] = max_threads/2; init_user_ns.ucount_max[UCOUNT_RLIMIT_NPROC] = task_rlimit(&init_task, RLIMIT_NPROC); + init_user_ns.ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = task_rlimit(&init_task, RLIMIT_MSGQUEUE); #ifdef CONFIG_VMAP_STACK cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", diff --git a/kernel/ucount.c b/kernel/ucount.c index 6caa56f7dec8..6e6f936a5963 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -80,6 +80,7 @@ static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_inotify_instances"), UCOUNT_ENTRY("max_inotify_watches"), #endif + { }, { }, { } }; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index e6577c835072..7eccc4f84549 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -123,6 +123,7 @@ int create_user_ns(struct cred *new) ns->ucount_max[i] = INT_MAX; } ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC); + ns->ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = rlimit(RLIMIT_MSGQUEUE); ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ -- cgit v1.2.3 From d64696905554e919321e31afc210606653b8f6a4 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 22 Apr 2021 14:27:13 +0200 Subject: Reimplement RLIMIT_SIGPENDING on top of ucounts The rlimit counter is tied to uid in the user_namespace. This allows rlimit values to be specified in userns even if they are already globally exceeded by the user. However, the value of the previous user_namespaces cannot be exceeded. Changelog v11: * Revert most of changes to fix performance issues. v10: * Fix memory leak on get_ucounts failure. Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/df9d7764dddd50f28616b7840de74ec0f81711a8.1619094428.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- kernel/fork.c | 1 + kernel/signal.c | 25 +++++++++++++------------ kernel/ucount.c | 1 + kernel/user.c | 1 - kernel/user_namespace.c | 1 + 5 files changed, 16 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a9c5097dfc86..03119926b27d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -824,6 +824,7 @@ void __init fork_init(void) init_user_ns.ucount_max[UCOUNT_RLIMIT_NPROC] = task_rlimit(&init_task, RLIMIT_NPROC); init_user_ns.ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = task_rlimit(&init_task, RLIMIT_MSGQUEUE); + init_user_ns.ucount_max[UCOUNT_RLIMIT_SIGPENDING] = task_rlimit(&init_task, RLIMIT_SIGPENDING); #ifdef CONFIG_VMAP_STACK cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", diff --git a/kernel/signal.c b/kernel/signal.c index f2718350bf4b..9a6dab712123 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -413,8 +413,8 @@ static struct sigqueue * __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) { struct sigqueue *q = NULL; - struct user_struct *user; - int sigpending; + struct ucounts *ucounts = NULL; + long sigpending; /* * Protect access to @t credentials. This can go away when all @@ -425,27 +425,26 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi * changes from/to zero. */ rcu_read_lock(); - user = __task_cred(t)->user; - sigpending = atomic_inc_return(&user->sigpending); + ucounts = task_ucounts(t); + sigpending = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1); if (sigpending == 1) - get_uid(user); + ucounts = get_ucounts(ucounts); rcu_read_unlock(); - if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) { + if (override_rlimit || (sigpending < LONG_MAX && sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) { q = kmem_cache_alloc(sigqueue_cachep, flags); } else { print_dropped_signal(sig); } if (unlikely(q == NULL)) { - if (atomic_dec_and_test(&user->sigpending)) - free_uid(user); + if (ucounts && dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1)) + put_ucounts(ucounts); } else { INIT_LIST_HEAD(&q->list); q->flags = 0; - q->user = user; + q->ucounts = ucounts; } - return q; } @@ -453,8 +452,10 @@ static void __sigqueue_free(struct sigqueue *q) { if (q->flags & SIGQUEUE_PREALLOC) return; - if (atomic_dec_and_test(&q->user->sigpending)) - free_uid(q->user); + if (q->ucounts && dec_rlimit_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING, 1)) { + put_ucounts(q->ucounts); + q->ucounts = NULL; + } kmem_cache_free(sigqueue_cachep, q); } diff --git a/kernel/ucount.c b/kernel/ucount.c index 6e6f936a5963..8ce62da6a62c 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -80,6 +80,7 @@ static struct ctl_table user_table[] = { UCOUNT_ENTRY("max_inotify_instances"), UCOUNT_ENTRY("max_inotify_watches"), #endif + { }, { }, { }, { } diff --git a/kernel/user.c b/kernel/user.c index 7f5ff498207a..6737327f83be 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -98,7 +98,6 @@ static DEFINE_SPINLOCK(uidhash_lock); /* root_user.__count is 1, for init task cred */ struct user_struct root_user = { .__count = REFCOUNT_INIT(1), - .sigpending = ATOMIC_INIT(0), .locked_shm = 0, .uid = GLOBAL_ROOT_UID, .ratelimit = RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0), diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 7eccc4f84549..822eacee4588 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -124,6 +124,7 @@ int create_user_ns(struct cred *new) } ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC); ns->ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = rlimit(RLIMIT_MSGQUEUE); + ns->ucount_max[UCOUNT_RLIMIT_SIGPENDING] = rlimit(RLIMIT_SIGPENDING); ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ -- cgit v1.2.3 From d7c9e99aee48e6bc0b427f3e3c658a6aba15001e Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 22 Apr 2021 14:27:14 +0200 Subject: Reimplement RLIMIT_MEMLOCK on top of ucounts The rlimit counter is tied to uid in the user_namespace. This allows rlimit values to be specified in userns even if they are already globally exceeded by the user. However, the value of the previous user_namespaces cannot be exceeded. Changelog v11: * Fix issue found by lkp robot. v8: * Fix issues found by lkp-tests project. v7: * Keep only ucounts for RLIMIT_MEMLOCK checks instead of struct cred. v6: * Fix bug in hugetlb_file_setup() detected by trinity. Reported-by: kernel test robot Reported-by: kernel test robot Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/970d50c70c71bfd4496e0e8d2a0a32feebebb350.1619094428.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- kernel/fork.c | 1 + kernel/ucount.c | 1 + kernel/user.c | 1 - kernel/user_namespace.c | 1 + 4 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 03119926b27d..610fd4de60d7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -825,6 +825,7 @@ void __init fork_init(void) init_user_ns.ucount_max[UCOUNT_RLIMIT_NPROC] = task_rlimit(&init_task, RLIMIT_NPROC); init_user_ns.ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = task_rlimit(&init_task, RLIMIT_MSGQUEUE); init_user_ns.ucount_max[UCOUNT_RLIMIT_SIGPENDING] = task_rlimit(&init_task, RLIMIT_SIGPENDING); + init_user_ns.ucount_max[UCOUNT_RLIMIT_MEMLOCK] = task_rlimit(&init_task, RLIMIT_MEMLOCK); #ifdef CONFIG_VMAP_STACK cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", diff --git a/kernel/ucount.c b/kernel/ucount.c index 8ce62da6a62c..d316bac3e520 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -83,6 +83,7 @@ static struct ctl_table user_table[] = { { }, { }, { }, + { }, { } }; #endif /* CONFIG_SYSCTL */ diff --git a/kernel/user.c b/kernel/user.c index 6737327f83be..c82399c1618a 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -98,7 +98,6 @@ static DEFINE_SPINLOCK(uidhash_lock); /* root_user.__count is 1, for init task cred */ struct user_struct root_user = { .__count = REFCOUNT_INIT(1), - .locked_shm = 0, .uid = GLOBAL_ROOT_UID, .ratelimit = RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0), }; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 822eacee4588..892da1360862 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -125,6 +125,7 @@ int create_user_ns(struct cred *new) ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC); ns->ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = rlimit(RLIMIT_MSGQUEUE); ns->ucount_max[UCOUNT_RLIMIT_SIGPENDING] = rlimit(RLIMIT_SIGPENDING); + ns->ucount_max[UCOUNT_RLIMIT_MEMLOCK] = rlimit(RLIMIT_MEMLOCK); ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ -- cgit v1.2.3 From c1ada3dc7219b02b3467aa906c2f5f8b098578d1 Mon Sep 17 00:00:00 2001 From: Alexey Gladkov Date: Thu, 22 Apr 2021 14:27:16 +0200 Subject: ucounts: Set ucount_max to the largest positive value the type can hold The ns->ucount_max[] is signed long which is less than the rlimit size. We have to protect ucount_max[] from overflow and only use the largest value that we can hold. On 32bit using "long" instead of "unsigned long" to hold the counts has the downside that RLIMIT_MSGQUEUE and RLIMIT_MEMLOCK are limited to 2GiB instead of 4GiB. I don't think anyone cares but it should be mentioned in case someone does. The RLIMIT_NPROC and RLIMIT_SIGPENDING used atomic_t so their maximum hasn't changed. Signed-off-by: Alexey Gladkov Link: https://lkml.kernel.org/r/1825a5dfa18bc5a570e79feb05e2bd07fd57e7e3.1619094428.git.legion@kernel.org Signed-off-by: Eric W. Biederman --- kernel/fork.c | 8 ++++---- kernel/user_namespace.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 610fd4de60d7..c41820481b2e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -822,10 +822,10 @@ void __init fork_init(void) for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) init_user_ns.ucount_max[i] = max_threads/2; - init_user_ns.ucount_max[UCOUNT_RLIMIT_NPROC] = task_rlimit(&init_task, RLIMIT_NPROC); - init_user_ns.ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = task_rlimit(&init_task, RLIMIT_MSGQUEUE); - init_user_ns.ucount_max[UCOUNT_RLIMIT_SIGPENDING] = task_rlimit(&init_task, RLIMIT_SIGPENDING); - init_user_ns.ucount_max[UCOUNT_RLIMIT_MEMLOCK] = task_rlimit(&init_task, RLIMIT_MEMLOCK); + set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, task_rlimit(&init_task, RLIMIT_NPROC)); + set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, task_rlimit(&init_task, RLIMIT_MSGQUEUE)); + set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, task_rlimit(&init_task, RLIMIT_SIGPENDING)); + set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, task_rlimit(&init_task, RLIMIT_MEMLOCK)); #ifdef CONFIG_VMAP_STACK cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 892da1360862..d4a545bbab7f 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -122,10 +122,10 @@ int create_user_ns(struct cred *new) for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) { ns->ucount_max[i] = INT_MAX; } - ns->ucount_max[UCOUNT_RLIMIT_NPROC] = rlimit(RLIMIT_NPROC); - ns->ucount_max[UCOUNT_RLIMIT_MSGQUEUE] = rlimit(RLIMIT_MSGQUEUE); - ns->ucount_max[UCOUNT_RLIMIT_SIGPENDING] = rlimit(RLIMIT_SIGPENDING); - ns->ucount_max[UCOUNT_RLIMIT_MEMLOCK] = rlimit(RLIMIT_MEMLOCK); + set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)); + set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE)); + set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING)); + set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK)); ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ -- cgit v1.2.3 From f928ef685db5d9b82c1c1e24e229c167426c5a1f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 30 Apr 2021 13:00:26 -0500 Subject: ucounts: Silence warning in dec_rlimit_ucounts Dan Carpenter wrote: > > url: https://github.com/0day-ci/linux/commits/legion-kernel-org/Count-rlimits-in-each-user-namespace/20210427-162857 > base: https://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git next > config: arc-randconfig-m031-20210426 (attached as .config) > compiler: arceb-elf-gcc (GCC) 9.3.0 > > If you fix the issue, kindly add following tag as appropriate > Reported-by: kernel test robot > Reported-by: Dan Carpenter > > smatch warnings: > kernel/ucount.c:270 dec_rlimit_ucounts() error: uninitialized symbol 'new'. > > vim +/new +270 kernel/ucount.c > > 176ec2b092cc22 Alexey Gladkov 2021-04-22 260 bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v) > 176ec2b092cc22 Alexey Gladkov 2021-04-22 261 { > 176ec2b092cc22 Alexey Gladkov 2021-04-22 262 struct ucounts *iter; > 176ec2b092cc22 Alexey Gladkov 2021-04-22 263 long new; > ^^^^^^^^ > > 176ec2b092cc22 Alexey Gladkov 2021-04-22 264 for (iter = ucounts; iter; iter = iter->ns->ucounts) { > 176ec2b092cc22 Alexey Gladkov 2021-04-22 265 long dec = atomic_long_add_return(-v, &iter->ucount[type]); > 176ec2b092cc22 Alexey Gladkov 2021-04-22 266 WARN_ON_ONCE(dec < 0); > 176ec2b092cc22 Alexey Gladkov 2021-04-22 267 if (iter == ucounts) > 176ec2b092cc22 Alexey Gladkov 2021-04-22 268 new = dec; > 176ec2b092cc22 Alexey Gladkov 2021-04-22 269 } > 176ec2b092cc22 Alexey Gladkov 2021-04-22 @270 return (new == 0); > ^^^^^^^^ > I don't know if this is a bug or not, but I can definitely tell why the > static checker complains about it. > > 176ec2b092cc22 Alexey Gladkov 2021-04-22 271 } In the only two cases that care about the return value of dec_rlimit_ucounts the code first tests to see that ucounts is not NULL. In those cases it is guaranteed at least one iteration of the loop will execute guaranteeing the variable new will be initialized. Initialize new to -1 so that the return value is well defined even when the loop does not execute and the static checker is silenced. Link: https://lkml.kernel.org/r/m1tunny77w.fsf@fess.ebiederm.org Signed-off-by: "Eric W. Biederman" --- kernel/ucount.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/ucount.c b/kernel/ucount.c index d316bac3e520..df84a2a63926 100644 --- a/kernel/ucount.c +++ b/kernel/ucount.c @@ -263,7 +263,7 @@ long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v) bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v) { struct ucounts *iter; - long new; + long new = -1; /* Silence compiler warning */ for (iter = ucounts; iter; iter = iter->ns->ucounts) { long dec = atomic_long_add_return(-v, &iter->ucount[type]); WARN_ON_ONCE(dec < 0); -- cgit v1.2.3 From 5e6b8a50a7cec5686ee2c4bda1d49899c79a7eae Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Wed, 26 May 2021 22:38:05 +0800 Subject: cred: add missing return error code when set_cred_ucounts() failed If set_cred_ucounts() failed, we need return the error code. Fixes: 905ae01c4ae2 ("Add a reference to ucounts for each cred") Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Link: https://lkml.kernel.org/r/20210526143805.2549649-1-yangyingliang@huawei.com Reviewed-by: Alexey Gladkov Signed-off-by: Eric W. Biederman --- kernel/cred.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index dcfa30b337c5..5a1d9702658e 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -372,7 +372,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) ret = create_user_ns(new); if (ret < 0) goto error_put; - if (set_cred_ucounts(new) < 0) + ret = set_cred_ucounts(new); + if (ret < 0) goto error_put; } -- cgit v1.2.3