about | summary | refs | log | tree | commit | diff
path: root/kernel/bpf/syscall.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/bpf/syscall.c')
-rw-r--r-- kernel/bpf/syscall.c | 250
1 files changed, 168 insertions, 82 deletions
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 4ff82144f885..a3c0214ca934 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -9,6 +9,7 @@
#include <linux/bpf_verifier.h>
#include <linux/bsearch.h>
#include <linux/btf.h>
+#include <linux/hex.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/sched/signal.h>
@@ -133,12 +134,14 @@ bool bpf_map_write_active(const struct bpf_map *map)
return atomic64_read(&map->writecnt) != 0;
}
-static u32 bpf_map_value_size(const struct bpf_map *map)
+static u32 bpf_map_value_size(const struct bpf_map *map, u64 flags)
{
- if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
- map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
+ if (flags & (BPF_F_CPU | BPF_F_ALL_CPUS))
+ return map->value_size;
+ else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
return round_up(map->value_size, 8) * num_possible_cpus();
else if (IS_FD_MAP(map))
return sizeof(u32);
@@ -314,11 +317,11 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
bpf_disable_instrumentation();
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
- err = bpf_percpu_hash_copy(map, key, value);
+ err = bpf_percpu_hash_copy(map, key, value, flags);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
- err = bpf_percpu_array_copy(map, key, value);
+ err = bpf_percpu_array_copy(map, key, value, flags);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
- err = bpf_percpu_cgroup_storage_copy(map, key, value);
+ err = bpf_percpu_cgroup_storage_copy(map, key, value, flags);
} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
err = bpf_stackmap_extract(map, key, value, false);
} else if (IS_FD_ARRAY(map) || IS_FD_PROG_ARRAY(map)) {
@@ -505,17 +508,29 @@ static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map)
return root_mem_cgroup;
}
+void bpf_map_memcg_enter(const struct bpf_map *map, struct mem_cgroup **old_memcg,
+ struct mem_cgroup **new_memcg)
+{ /* Make @map's memcg the active one; pair every call with bpf_map_memcg_exit(). */
+ *new_memcg = bpf_map_get_memcg(map); /* handed back so the caller can pass it to the exit call */
+ *old_memcg = set_active_memcg(*new_memcg); /* value returned here is what the exit call reinstates */
+}
+
+void bpf_map_memcg_exit(struct mem_cgroup *old_memcg,
+ struct mem_cgroup *new_memcg)
+{ /* Counterpart of bpf_map_memcg_enter(); takes the two pointers that call filled in. */
+ set_active_memcg(old_memcg); /* reinstate the memcg saved on enter */
+ mem_cgroup_put(new_memcg); /* put the memcg that enter got from bpf_map_get_memcg() */
+}
+
void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
int node)
{
struct mem_cgroup *memcg, *old_memcg;
void *ptr;
- memcg = bpf_map_get_memcg(map);
- old_memcg = set_active_memcg(memcg);
+ bpf_map_memcg_enter(map, &old_memcg, &memcg);
ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
- set_active_memcg(old_memcg);
- mem_cgroup_put(memcg);
+ bpf_map_memcg_exit(old_memcg, memcg);
return ptr;
}
@@ -526,11 +541,9 @@ void *bpf_map_kmalloc_nolock(const struct bpf_map *map, size_t size, gfp_t flags
struct mem_cgroup *memcg, *old_memcg;
void *ptr;
- memcg = bpf_map_get_memcg(map);
- old_memcg = set_active_memcg(memcg);
+ bpf_map_memcg_enter(map, &old_memcg, &memcg);
ptr = kmalloc_nolock(size, flags | __GFP_ACCOUNT, node);
- set_active_memcg(old_memcg);
- mem_cgroup_put(memcg);
+ bpf_map_memcg_exit(old_memcg, memcg);
return ptr;
}
@@ -540,11 +553,9 @@ void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
struct mem_cgroup *memcg, *old_memcg;
void *ptr;
- memcg = bpf_map_get_memcg(map);
- old_memcg = set_active_memcg(memcg);
+ bpf_map_memcg_enter(map, &old_memcg, &memcg);
ptr = kzalloc(size, flags | __GFP_ACCOUNT);
- set_active_memcg(old_memcg);
- mem_cgroup_put(memcg);
+ bpf_map_memcg_exit(old_memcg, memcg);
return ptr;
}
@@ -555,11 +566,9 @@ void *bpf_map_kvcalloc(struct bpf_map *map, size_t n, size_t size,
struct mem_cgroup *memcg, *old_memcg;
void *ptr;
- memcg = bpf_map_get_memcg(map);
- old_memcg = set_active_memcg(memcg);
+ bpf_map_memcg_enter(map, &old_memcg, &memcg);
ptr = kvcalloc(n, size, flags | __GFP_ACCOUNT);
- set_active_memcg(old_memcg);
- mem_cgroup_put(memcg);
+ bpf_map_memcg_exit(old_memcg, memcg);
return ptr;
}
@@ -570,11 +579,9 @@ void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
struct mem_cgroup *memcg, *old_memcg;
void __percpu *ptr;
- memcg = bpf_map_get_memcg(map);
- old_memcg = set_active_memcg(memcg);
+ bpf_map_memcg_enter(map, &old_memcg, &memcg);
ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT);
- set_active_memcg(old_memcg);
- mem_cgroup_put(memcg);
+ bpf_map_memcg_exit(old_memcg, memcg);
return ptr;
}
@@ -612,12 +619,7 @@ int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
unsigned long i, j;
struct page *pg;
int ret = 0;
-#ifdef CONFIG_MEMCG
- struct mem_cgroup *memcg, *old_memcg;
- memcg = bpf_map_get_memcg(map);
- old_memcg = set_active_memcg(memcg);
-#endif
for (i = 0; i < nr_pages; i++) {
pg = __bpf_alloc_page(nid);
@@ -631,10 +633,6 @@ int bpf_map_alloc_pages(const struct bpf_map *map, int nid,
break;
}
-#ifdef CONFIG_MEMCG
- set_active_memcg(old_memcg);
- mem_cgroup_put(memcg);
-#endif
return ret;
}
@@ -943,14 +941,6 @@ static void bpf_map_free_rcu_gp(struct rcu_head *rcu)
bpf_map_free_in_work(container_of(rcu, struct bpf_map, rcu));
}
-static void bpf_map_free_mult_rcu_gp(struct rcu_head *rcu)
-{
- if (rcu_trace_implies_rcu_gp())
- bpf_map_free_rcu_gp(rcu);
- else
- call_rcu(rcu, bpf_map_free_rcu_gp);
-}
-
/* decrement map refcnt and schedule it for freeing via workqueue
* (underlying map implementation ops->map_free() might sleep)
*/
@@ -961,8 +951,9 @@ void bpf_map_put(struct bpf_map *map)
bpf_map_free_id(map);
WARN_ON_ONCE(atomic64_read(&map->sleepable_refcnt));
+ /* RCU tasks trace grace period implies RCU grace period. */
if (READ_ONCE(map->free_after_mult_rcu_gp))
- call_rcu_tasks_trace(&map->rcu, bpf_map_free_mult_rcu_gp);
+ call_rcu_tasks_trace(&map->rcu, bpf_map_free_rcu_gp);
else if (READ_ONCE(map->free_after_rcu_gp))
call_rcu(&map->rcu, bpf_map_free_rcu_gp);
else
@@ -1236,7 +1227,7 @@ int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size)
}
EXPORT_SYMBOL_GPL(bpf_obj_name_cpy);
-int map_check_no_btf(const struct bpf_map *map,
+int map_check_no_btf(struct bpf_map *map,
const struct btf *btf,
const struct btf_type *key_type,
const struct btf_type *value_type)
@@ -1366,11 +1357,6 @@ free_map_tab:
return ret;
}
-static bool bpf_net_capable(void)
-{
- return capable(CAP_NET_ADMIN) || capable(CAP_SYS_ADMIN);
-}
-
#define BPF_MAP_CREATE_LAST_FIELD excl_prog_hash_size
/* called via syscall */
static int map_create(union bpf_attr *attr, bpfptr_t uattr)
@@ -1734,7 +1720,7 @@ static int map_lookup_elem(union bpf_attr *attr)
if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ))
return -EPERM;
- err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK);
+ err = bpf_map_check_op_flags(map, attr->flags, BPF_F_LOCK | BPF_F_CPU);
if (err)
return err;
@@ -1742,7 +1728,7 @@ static int map_lookup_elem(union bpf_attr *attr)
if (IS_ERR(key))
return PTR_ERR(key);
- value_size = bpf_map_value_size(map);
+ value_size = bpf_map_value_size(map, attr->flags);
err = -ENOMEM;
value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
@@ -1809,7 +1795,7 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
goto err_put;
}
- value_size = bpf_map_value_size(map);
+ value_size = bpf_map_value_size(map, attr->flags);
value = kvmemdup_bpfptr(uvalue, value_size);
if (IS_ERR(value)) {
err = PTR_ERR(value);
@@ -2005,11 +1991,12 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
void *key, *value;
int err = 0;
- err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK);
+ err = bpf_map_check_op_flags(map, attr->batch.elem_flags,
+ BPF_F_LOCK | BPF_F_CPU | BPF_F_ALL_CPUS);
if (err)
return err;
- value_size = bpf_map_value_size(map);
+ value_size = bpf_map_value_size(map, attr->batch.elem_flags);
max_count = attr->batch.count;
if (!max_count)
@@ -2064,11 +2051,11 @@ int generic_map_lookup_batch(struct bpf_map *map,
u32 value_size, cp, max_count;
int err;
- err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK);
+ err = bpf_map_check_op_flags(map, attr->batch.elem_flags, BPF_F_LOCK | BPF_F_CPU);
if (err)
return err;
- value_size = bpf_map_value_size(map);
+ value_size = bpf_map_value_size(map, attr->batch.elem_flags);
max_count = attr->batch.count;
if (!max_count)
@@ -2190,7 +2177,7 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr)
goto err_put;
}
- value_size = bpf_map_value_size(map);
+ value_size = bpf_map_value_size(map, 0);
err = -ENOMEM;
value = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
@@ -2820,6 +2807,13 @@ static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr
void *sig;
int err = 0;
+ /*
+ * Don't attempt to use kmalloc_large or vmalloc for signatures.
+ * Practical signatures for BPF programs should be below this limit.
+ */
+ if (attr->signature_size > KMALLOC_MAX_CACHE_SIZE)
+ return -EINVAL;
+
if (system_keyring_id_check(attr->keyring_id) == 0)
key = bpf_lookup_system_key(attr->keyring_id);
else
@@ -2831,7 +2825,7 @@ static int bpf_prog_verify_signature(struct bpf_prog *prog, union bpf_attr *attr
sig = kvmemdup_bpfptr(usig, attr->signature_size);
if (IS_ERR(sig)) {
bpf_key_put(key);
- return -ENOMEM;
+ return PTR_ERR(sig);
}
bpf_dynptr_init(&sig_ptr, sig, BPF_DYNPTR_TYPE_LOCAL, 0,
@@ -3089,10 +3083,6 @@ static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
if (err < 0)
goto free_used_maps;
- prog = bpf_prog_select_runtime(prog, &err);
- if (err < 0)
- goto free_used_maps;
-
err = bpf_prog_mark_insn_arrays_ready(prog);
if (err < 0)
goto free_used_maps;
@@ -3260,12 +3250,16 @@ static void bpf_link_defer_dealloc_rcu_gp(struct rcu_head *rcu)
bpf_link_dealloc(link);
}
-static void bpf_link_defer_dealloc_mult_rcu_gp(struct rcu_head *rcu)
+static bool bpf_link_is_tracepoint(struct bpf_link *link)
{
- if (rcu_trace_implies_rcu_gp())
- bpf_link_defer_dealloc_rcu_gp(rcu);
- else
- call_rcu(rcu, bpf_link_defer_dealloc_rcu_gp);
+ /*
+ * Only these combinations support a tracepoint bpf_link.
+ * BPF_LINK_TYPE_TRACING raw_tp progs are hardcoded to use
+ * bpf_raw_tp_link_lops and thus dealloc_deferred(), see
+ * bpf_raw_tp_link_attach().
+ */
+ return link->type == BPF_LINK_TYPE_RAW_TRACEPOINT ||
+ (link->type == BPF_LINK_TYPE_TRACING && link->attach_type == BPF_TRACE_RAW_TP);
}
/* bpf_link_free is guaranteed to be called from process context */
@@ -3278,16 +3272,26 @@ static void bpf_link_free(struct bpf_link *link)
if (link->prog)
ops->release(link);
if (ops->dealloc_deferred) {
- /* Schedule BPF link deallocation, which will only then
+ /*
+ * Schedule BPF link deallocation, which will only then
* trigger putting BPF program refcount.
* If underlying BPF program is sleepable or BPF link's target
* attach hookpoint is sleepable or otherwise requires RCU GPs
* to ensure link and its underlying BPF program is not
* reachable anymore, we need to first wait for RCU tasks
- * trace sync, and then go through "classic" RCU grace period
+ * trace sync, and then go through "classic" RCU grace period.
+ *
+ * Tracepoint BPF links that use a non-faultable tracepoint
+ * must wait for an SRCU grace period instead. The faultable
+ * case does not need a chained SRCU grace period wait, since
+ * it exclusively uses RCU Tasks Trace.
*/
if (link->sleepable || (link->prog && link->prog->sleepable))
- call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_mult_rcu_gp);
+ /* RCU Tasks Trace grace period implies RCU grace period. */
+ call_rcu_tasks_trace(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
+ /* We need to do an SRCU grace period wait for non-faultable tracepoint BPF links. */
+ else if (bpf_link_is_tracepoint(link))
+ call_tracepoint_unregister_atomic(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
else
call_rcu(&link->rcu, bpf_link_defer_dealloc_rcu_gp);
} else if (ops->dealloc) {
@@ -3579,6 +3583,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
case BPF_PROG_TYPE_TRACING:
if (prog->expected_attach_type != BPF_TRACE_FENTRY &&
prog->expected_attach_type != BPF_TRACE_FEXIT &&
+ prog->expected_attach_type != BPF_TRACE_FSESSION &&
prog->expected_attach_type != BPF_MODIFY_RETURN) {
err = -EINVAL;
goto out_put_prog;
@@ -3628,7 +3633,21 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id);
}
- link = kzalloc(sizeof(*link), GFP_USER);
+ if (prog->expected_attach_type == BPF_TRACE_FSESSION) {
+ struct bpf_fsession_link *fslink;
+
+ fslink = kzalloc_obj(*fslink, GFP_USER);
+ if (fslink) {
+ bpf_link_init(&fslink->fexit.link, BPF_LINK_TYPE_TRACING,
+ &bpf_tracing_link_lops, prog, attach_type);
+ fslink->fexit.cookie = bpf_cookie;
+ link = &fslink->link;
+ } else {
+ link = NULL;
+ }
+ } else {
+ link = kzalloc_obj(*link, GFP_USER);
+ }
if (!link) {
err = -ENOMEM;
goto out_put_prog;
@@ -3717,6 +3736,23 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog,
tr = prog->aux->dst_trampoline;
tgt_prog = prog->aux->dst_prog;
}
+ /*
+ * Reject a kprobe_write_ctx mismatch between an freplace prog and its
+ * target. Without this check, a kprobe_write_ctx=true freplace prog
+ * could attach to a kprobe_write_ctx=false kprobe prog and then
+ * modify the registers (struct pt_regs) of the kprobe prog's target
+ * kernel function.
+ *
+ * This also blocks the uprobe+freplace combination, because tgt_prog
+ * alone does not tell whether it is used as a uprobe or a kprobe:
+ * that is only recognized at attach time, from the target perf event
+ * flags in __perf_event_set_bpf_prog().
+ */
+ if (prog->type == BPF_PROG_TYPE_EXT &&
+ prog->aux->kprobe_write_ctx != tgt_prog->aux->kprobe_write_ctx) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
err = bpf_link_prime(&link->link.link, &link_primer);
if (err)
@@ -4167,7 +4203,7 @@ static int bpf_perf_link_attach(const union bpf_attr *attr, struct bpf_prog *pro
if (IS_ERR(perf_file))
return PTR_ERR(perf_file);
- link = kzalloc(sizeof(*link), GFP_USER);
+ link = kzalloc_obj(*link, GFP_USER);
if (!link) {
err = -ENOMEM;
goto out_put_file;
@@ -4245,7 +4281,7 @@ static int bpf_raw_tp_link_attach(struct bpf_prog *prog,
if (!btp)
return -ENOENT;
- link = kzalloc(sizeof(*link), GFP_USER);
+ link = kzalloc_obj(*link, GFP_USER);
if (!link) {
err = -ENOMEM;
goto out_put_btp;
@@ -4352,6 +4388,7 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
case BPF_TRACE_RAW_TP:
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
+ case BPF_TRACE_FSESSION:
case BPF_MODIFY_RETURN:
return BPF_PROG_TYPE_TRACING;
case BPF_LSM_MAC:
@@ -4565,6 +4602,8 @@ static int bpf_prog_detach(const union bpf_attr *attr)
prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype);
if (IS_ERR(prog))
return PTR_ERR(prog);
+ } else if (!bpf_mprog_detach_empty(ptype)) {
+ return -EPERM;
}
} else if (is_cgroup_prog_type(ptype, 0, false)) {
if (attr->attach_flags || attr->relative_fd)
@@ -5310,6 +5349,9 @@ static int bpf_map_get_info_by_fd(struct file *file,
if (info.hash_size != SHA256_DIGEST_SIZE)
return -EINVAL;
+ if (!READ_ONCE(map->frozen))
+ return -EPERM;
+
err = map->ops->map_get_hash(map, SHA256_DIGEST_SIZE, map->sha);
if (err != 0)
return err;
@@ -6054,9 +6096,8 @@ static int bpf_prog_bind_map(union bpf_attr *attr)
goto out_unlock;
}
- used_maps_new = kmalloc_array(prog->aux->used_map_cnt + 1,
- sizeof(used_maps_new[0]),
- GFP_KERNEL);
+ used_maps_new = kmalloc_objs(used_maps_new[0],
+ prog->aux->used_map_cnt + 1);
if (!used_maps_new) {
ret = -ENOMEM;
goto out_unlock;
@@ -6122,6 +6163,49 @@ static int prog_stream_read(union bpf_attr *attr)
return ret;
}
+#define BPF_PROG_ASSOC_STRUCT_OPS_LAST_FIELD prog_assoc_struct_ops.prog_fd
+
+static int prog_assoc_struct_ops(union bpf_attr *attr)
+{ /* BPF_PROG_ASSOC_STRUCT_OPS command handler. */
+ struct bpf_prog *prog;
+ struct bpf_map *map;
+ int ret;
+
+ if (CHECK_ATTR(BPF_PROG_ASSOC_STRUCT_OPS))
+ return -EINVAL;
+
+ if (attr->prog_assoc_struct_ops.flags) /* any non-zero flags value is rejected */
+ return -EINVAL;
+
+ prog = bpf_prog_get(attr->prog_assoc_struct_ops.prog_fd);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ if (prog->type == BPF_PROG_TYPE_STRUCT_OPS) { /* struct_ops progs are refused with -EINVAL */
+ ret = -EINVAL;
+ goto put_prog;
+ }
+
+ map = bpf_map_get(attr->prog_assoc_struct_ops.map_fd);
+ if (IS_ERR(map)) {
+ ret = PTR_ERR(map);
+ goto put_prog;
+ }
+
+ if (map->map_type != BPF_MAP_TYPE_STRUCT_OPS) { /* only struct_ops maps are accepted */
+ ret = -EINVAL;
+ goto put_map;
+ }
+
+ ret = bpf_prog_assoc_struct_ops(prog, map); /* its result is returned unchanged */
+
+put_map: /* unwind in reverse order of acquisition; also reached on success */
+ bpf_map_put(map);
+put_prog:
+ bpf_prog_put(prog);
+ return ret;
+}
+
static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
{
union bpf_attr attr;
@@ -6261,6 +6345,9 @@ static int __sys_bpf(enum bpf_cmd cmd, bpfptr_t uattr, unsigned int size)
case BPF_PROG_STREAM_READ_BY_FD:
err = prog_stream_read(&attr);
break;
+ case BPF_PROG_ASSOC_STRUCT_OPS:
+ err = prog_assoc_struct_ops(&attr);
+ break;
default:
err = -EINVAL;
break;
@@ -6281,8 +6368,7 @@ static bool syscall_prog_is_valid_access(int off, int size,
{
if (off < 0 || off >= U16_MAX)
return false;
- if (off % size != 0)
- return false;
+ /* No alignment requirements for syscall ctx accesses. */
return true;
}
@@ -6407,7 +6493,7 @@ static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
.func = bpf_kallsyms_lookup_name,
.gpl_only = false,
.ret_type = RET_INTEGER,
- .arg1_type = ARG_PTR_TO_MEM,
+ .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED,