From 5d5de3a431d87ac51d43da8d796891d014975ab7 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Thu, 16 Feb 2023 10:48:21 +0800 Subject: bpf: Only allocate one bpf_mem_cache for bpf_cpumask_ma The size of bpf_cpumask is fixed, so there is no need to allocate many bpf_mem_caches for bpf_cpumask_ma, just one bpf_mem_cache is enough. Also add comments for bpf_mem_alloc_init() in bpf_mem_alloc.h to prevent future miuse. Signed-off-by: Hou Tao Acked-by: Jiri Olsa Link: https://lore.kernel.org/r/20230216024821.2202916-1-houtao@huaweicloud.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_mem_alloc.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_mem_alloc.h b/include/linux/bpf_mem_alloc.h index 3e164b8efaa9..a7104af61ab4 100644 --- a/include/linux/bpf_mem_alloc.h +++ b/include/linux/bpf_mem_alloc.h @@ -14,6 +14,13 @@ struct bpf_mem_alloc { struct work_struct work; }; +/* 'size != 0' is for bpf_mem_alloc which manages fixed-size objects. + * Alloc and free are done with bpf_mem_cache_{alloc,free}(). + * + * 'size = 0' is for bpf_mem_alloc which manages many fixed-size objects. + * Alloc and free are done with bpf_mem_{alloc,free}() and the size of + * the returned object is given by the size argument of bpf_mem_alloc(). + */ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu); void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma); -- cgit v1.2.3 From 7e0dac2807e6c4ae8c56941d74971fdb0763b4f9 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 1 Mar 2023 07:49:45 -0800 Subject: bpf: Refactor process_dynptr_func This change cleans up process_dynptr_func's flow to be more intuitive and updates some comments with more context. Signed-off-by: Joanne Koong Link: https://lore.kernel.org/r/20230301154953.641654-3-joannelkoong@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index cf1bb1cf4a7b..b26ff2a8f63b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -616,9 +616,6 @@ int check_func_arg_reg_off(struct bpf_verifier_env *env, enum bpf_arg_type arg_type); int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size); -struct bpf_call_arg_meta; -int process_dynptr_func(struct bpf_verifier_env *env, int regno, - enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta); /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, -- cgit v1.2.3 From 8357b366cbb09b17c90e2cd758360a6bd2ea7507 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 1 Mar 2023 07:49:47 -0800 Subject: bpf: Define no-ops for externally called bpf dynptr functions Some bpf dynptr functions will be called from places where if CONFIG_BPF_SYSCALL is not set, then the dynptr function is undefined. For example, when skb type dynptrs are added in the next commit, dynptr functions are called from net/core/filter.c This patch defines no-op implementations of these dynptr functions so that they do not break compilation by being an undefined reference. Signed-off-by: Joanne Koong Link: https://lore.kernel.org/r/20230301154953.641654-5-joannelkoong@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 75 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 45 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 520b238abd5a..296841a31749 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1124,6 +1124,33 @@ static __always_inline __nocfi unsigned int bpf_dispatcher_nop_func( return bpf_func(ctx, insnsi); } +/* the implementation of the opaque uapi struct bpf_dynptr */ +struct bpf_dynptr_kern { + void *data; + /* Size represents the number of usable bytes of dynptr data. + * If for example the offset is at 4 for a local dynptr whose data is + * of type u64, the number of usable bytes is 4. + * + * The upper 8 bits are reserved. It is as follows: + * Bits 0 - 23 = size + * Bits 24 - 30 = dynptr type + * Bit 31 = whether dynptr is read-only + */ + u32 size; + u32 offset; +} __aligned(8); + +enum bpf_dynptr_type { + BPF_DYNPTR_TYPE_INVALID, + /* Points to memory that is local to the bpf program */ + BPF_DYNPTR_TYPE_LOCAL, + /* Underlying data is a ringbuf record */ + BPF_DYNPTR_TYPE_RINGBUF, +}; + +int bpf_dynptr_check_size(u32 size); +u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr); + #ifdef CONFIG_BPF_JIT int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr); int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr); @@ -2266,6 +2293,11 @@ static inline bool has_current_bpf_ctx(void) } void notrace bpf_prog_inc_misses_counter(struct bpf_prog *prog); + +void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, + enum bpf_dynptr_type type, u32 offset, u32 size); +void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); +void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -2495,6 +2527,19 @@ static inline void bpf_prog_inc_misses_counter(struct bpf_prog *prog) static inline void bpf_cgrp_storage_free(struct cgroup *cgroup) { } + +static inline void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, + enum bpf_dynptr_type type, u32 offset, u32 size) +{ +} + +static inline void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr) +{ +} + +static inline void bpf_dynptr_set_rdonly(struct bpf_dynptr_kern *ptr) +{ +} #endif /* CONFIG_BPF_SYSCALL */ void __bpf_free_used_btfs(struct bpf_prog_aux *aux, @@ -2913,36 +2958,6 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, u32 num_args, struct bpf_bprintf_data *data); void bpf_bprintf_cleanup(struct bpf_bprintf_data *data); -/* the implementation of the opaque uapi struct bpf_dynptr */ -struct bpf_dynptr_kern { - void *data; - /* Size represents the number of usable bytes of dynptr data. - * If for example the offset is at 4 for a local dynptr whose data is - * of type u64, the number of usable bytes is 4. - * - * The upper 8 bits are reserved. It is as follows: - * Bits 0 - 23 = size - * Bits 24 - 30 = dynptr type - * Bit 31 = whether dynptr is read-only - */ - u32 size; - u32 offset; -} __aligned(8); - -enum bpf_dynptr_type { - BPF_DYNPTR_TYPE_INVALID, - /* Points to memory that is local to the bpf program */ - BPF_DYNPTR_TYPE_LOCAL, - /* Underlying data is a kernel-produced ringbuf record */ - BPF_DYNPTR_TYPE_RINGBUF, -}; - -void bpf_dynptr_init(struct bpf_dynptr_kern *ptr, void *data, - enum bpf_dynptr_type type, u32 offset, u32 size); -void bpf_dynptr_set_null(struct bpf_dynptr_kern *ptr); -int bpf_dynptr_check_size(u32 size); -u32 bpf_dynptr_get_size(const struct bpf_dynptr_kern *ptr); - #ifdef CONFIG_BPF_LSM void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype); void bpf_cgroup_atype_put(int cgroup_atype); -- cgit v1.2.3 From b5964b968ac64c2ec2debee7518499113b27c34e Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 1 Mar 2023 07:49:50 -0800 Subject: bpf: Add skb dynptrs Add skb dynptrs, which are dynptrs whose underlying pointer points to a skb. The dynptr acts on skb data. skb dynptrs have two main benefits. One is that they allow operations on sizes that are not statically known at compile-time (eg variable-sized accesses). Another is that parsing the packet data through dynptrs (instead of through direct access of skb->data and skb->data_end) can be more ergonomic and less brittle (eg does not need manual if checking for being within bounds of data_end). For bpf prog types that don't support writes on skb data, the dynptr is read-only (bpf_dynptr_write() will return an error) For reads and writes through the bpf_dynptr_read() and bpf_dynptr_write() interfaces, reading and writing from/to data in the head as well as from/to non-linear paged buffers is supported. Data slices through the bpf_dynptr_data API are not supported; instead bpf_dynptr_slice() and bpf_dynptr_slice_rdwr() (added in subsequent commit) should be used. For examples of how skb dynptrs can be used, please see the attached selftests. Signed-off-by: Joanne Koong Link: https://lore.kernel.org/r/20230301154953.641654-8-joannelkoong@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 14 +++++++++++++- include/linux/filter.h | 18 ++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 296841a31749..e7436d7615b0 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -607,11 +607,14 @@ enum bpf_type_flag { */ NON_OWN_REF = BIT(14 + BPF_BASE_TYPE_BITS), + /* DYNPTR points to sk_buff */ + DYNPTR_TYPE_SKB = BIT(15 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; -#define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF) +#define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB) /* Max number of base types. */ #define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) @@ -1146,6 +1149,8 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_LOCAL, /* Underlying data is a ringbuf record */ BPF_DYNPTR_TYPE_RINGBUF, + /* Underlying data is a sk_buff */ + BPF_DYNPTR_TYPE_SKB, }; int bpf_dynptr_check_size(u32 size); @@ -2846,6 +2851,8 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size); +int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags, + struct bpf_dynptr_kern *ptr); #else static inline bool bpf_sock_common_is_valid_access(int off, int size, enum bpf_access_type type, @@ -2867,6 +2874,11 @@ static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, { return 0; } +static inline int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags, + struct bpf_dynptr_kern *ptr) +{ + return -EOPNOTSUPP; +} #endif #ifdef CONFIG_INET diff --git a/include/linux/filter.h b/include/linux/filter.h index 1727898f1641..de18e844d15e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1542,4 +1542,22 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u64 index return XDP_REDIRECT; } +#ifdef CONFIG_NET +int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len); +int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, + u32 len, u64 flags); +#else /* CONFIG_NET */ +static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, + void *to, u32 len) +{ + return -EOPNOTSUPP; +} + +static inline int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, + const void *from, u32 len, u64 flags) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_NET */ + #endif /* __LINUX_FILTER_H__ */ -- cgit v1.2.3 From 05421aecd4ed65da0dc17b0c3c13779ef334e9e5 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 1 Mar 2023 07:49:51 -0800 Subject: bpf: Add xdp dynptrs Add xdp dynptrs, which are dynptrs whose underlying pointer points to a xdp_buff. The dynptr acts on xdp data. xdp dynptrs have two main benefits. One is that they allow operations on sizes that are not statically known at compile-time (eg variable-sized accesses). Another is that parsing the packet data through dynptrs (instead of through direct access of xdp->data and xdp->data_end) can be more ergonomic and less brittle (eg does not need manual if checking for being within bounds of data_end). For reads and writes on the dynptr, this includes reading/writing from/to and across fragments. Data slices through the bpf_dynptr_data API are not supported; instead bpf_dynptr_slice() and bpf_dynptr_slice_rdwr() should be used. For examples of how xdp dynptrs can be used, please see the attached selftests. Signed-off-by: Joanne Koong Link: https://lore.kernel.org/r/20230301154953.641654-9-joannelkoong@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 +++++++- include/linux/filter.h | 14 ++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e7436d7615b0..23ec684e660d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -610,11 +610,15 @@ enum bpf_type_flag { /* DYNPTR points to sk_buff */ DYNPTR_TYPE_SKB = BIT(15 + BPF_BASE_TYPE_BITS), + /* DYNPTR points to xdp_buff */ + DYNPTR_TYPE_XDP = BIT(16 + BPF_BASE_TYPE_BITS), + __BPF_TYPE_FLAG_MAX, __BPF_TYPE_LAST_FLAG = __BPF_TYPE_FLAG_MAX - 1, }; -#define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB) +#define DYNPTR_TYPE_FLAG_MASK (DYNPTR_TYPE_LOCAL | DYNPTR_TYPE_RINGBUF | DYNPTR_TYPE_SKB \ + | DYNPTR_TYPE_XDP) /* Max number of base types. */ #define BPF_BASE_TYPE_LIMIT (1UL << BPF_BASE_TYPE_BITS) @@ -1151,6 +1155,8 @@ enum bpf_dynptr_type { BPF_DYNPTR_TYPE_RINGBUF, /* Underlying data is a sk_buff */ BPF_DYNPTR_TYPE_SKB, + /* Underlying data is a xdp_buff */ + BPF_DYNPTR_TYPE_XDP, }; int bpf_dynptr_check_size(u32 size); diff --git a/include/linux/filter.h b/include/linux/filter.h index de18e844d15e..3f6992261ec5 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1546,6 +1546,8 @@ static __always_inline int __bpf_xdp_redirect_map(struct bpf_map *map, u64 index int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len); int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags); +int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len); +int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len); #else /* CONFIG_NET */ static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) @@ -1558,6 +1560,18 @@ static inline int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, { return -EOPNOTSUPP; } + +static inline int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, + void *buf, u32 len) +{ + return -EOPNOTSUPP; +} + +static inline int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, + void *buf, u32 len) +{ + return -EOPNOTSUPP; +} #endif /* CONFIG_NET */ #endif /* __LINUX_FILTER_H__ */ -- cgit v1.2.3 From 66e3a13e7c2c44d0c9dd6bb244680ca7529a8845 Mon Sep 17 00:00:00 2001 From: Joanne Koong Date: Wed, 1 Mar 2023 07:49:52 -0800 Subject: bpf: Add bpf_dynptr_slice and bpf_dynptr_slice_rdwr Two new kfuncs are added, bpf_dynptr_slice and bpf_dynptr_slice_rdwr. The user must pass in a buffer to store the contents of the data slice if a direct pointer to the data cannot be obtained. For skb and xdp type dynptrs, these two APIs are the only way to obtain a data slice. However, for other types of dynptrs, there is no difference between bpf_dynptr_slice(_rdwr) and bpf_dynptr_data. For skb type dynptrs, the data is copied into the user provided buffer if any of the data is not in the linear portion of the skb. For xdp type dynptrs, the data is copied into the user provided buffer if the data is between xdp frags. If the skb is cloned and a call to bpf_dynptr_data_rdwr is made, then the skb will be uncloned (see bpf_unclone_prologue()). Please note that any bpf_dynptr_write() automatically invalidates any prior data slices of the skb dynptr. This is because the skb may be cloned or may need to pull its paged buffer into the head. As such, any bpf_dynptr_write() will automatically have its prior data slices invalidated, even if the write is to data in the skb head of an uncloned skb. Please note as well that any other helper calls that change the underlying packet buffer (eg bpf_skb_pull_data()) invalidates any data slices of the skb dynptr as well, for the same reasons. Signed-off-by: Joanne Koong Link: https://lore.kernel.org/r/20230301154953.641654-10-joannelkoong@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index 3f6992261ec5..efa5d4a1677e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1548,6 +1548,9 @@ int __bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags); int __bpf_xdp_load_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len); int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, void *buf, u32 len); +void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len); +void bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, + void *buf, unsigned long len, bool flush); #else /* CONFIG_NET */ static inline int __bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) @@ -1572,6 +1575,17 @@ static inline int __bpf_xdp_store_bytes(struct xdp_buff *xdp, u32 offset, { return -EOPNOTSUPP; } + +static inline void *bpf_xdp_pointer(struct xdp_buff *xdp, u32 offset, u32 len) +{ + return NULL; +} + +static inline void *bpf_xdp_copy_buf(struct xdp_buff *xdp, unsigned long off, void *buf, + unsigned long len, bool flush) +{ + return NULL; +} #endif /* CONFIG_NET */ #endif /* __LINUX_FILTER_H__ */ -- cgit v1.2.3 From 9db44fdd8105da00669d425acab887c668df75f6 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Sat, 25 Feb 2023 16:40:09 +0100 Subject: bpf: Support kptrs in local storage maps Enable support for kptrs in local storage maps by wiring up the freeing of these kptrs from map value. Freeing of bpf_local_storage_map is only delayed in case there are special fields, therefore bpf_selem_free_* path can also only dereference smap safely in that case. This is recorded using a bool utilizing a hole in bpF_local_storage_elem. It could have been tagged in the pointer value smap using the lowest bit (since alignment > 1), but since there was already a hole I went with the simpler option. Only the map structure freeing is delayed using RCU barriers, as the buckets aren't used when selem is being freed, so they can be freed once all readers of the bucket lists can no longer access it. Cc: Martin KaFai Lau Cc: KP Singh Cc: Paul E. McKenney Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230225154010.391965-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 6d37a40cd90e..0fe92986412b 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -74,6 +74,12 @@ struct bpf_local_storage_elem { struct hlist_node snode; /* Linked to bpf_local_storage */ struct bpf_local_storage __rcu *local_storage; struct rcu_head rcu; + bool can_use_smap; /* Is it safe to access smap in bpf_selem_free_* RCU + * callbacks? bpf_local_storage_map_free only + * executes rcu_barrier when there are special + * fields, this field remembers that to ensure we + * don't access already freed smap in sdata. + */ /* 8 bytes hole */ /* The data is stored in another cacheline to minimize * the number of cachelines access during a cache hit. -- cgit v1.2.3 From 20c09d92faeefb8536f705d3a4629e0dc314c8a1 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 2 Mar 2023 20:14:43 -0800 Subject: bpf: Introduce kptr_rcu. The life time of certain kernel structures like 'struct cgroup' is protected by RCU. Hence it's safe to dereference them directly from __kptr tagged pointers in bpf maps. The resulting pointer is MEM_RCU and can be passed to kfuncs that expect KF_RCU. Derefrence of other kptr-s returns PTR_UNTRUSTED. For example: struct map_value { struct cgroup __kptr *cgrp; }; SEC("tp_btf/cgroup_mkdir") int BPF_PROG(test_cgrp_get_ancestors, struct cgroup *cgrp_arg, const char *path) { struct cgroup *cg, *cg2; cg = bpf_cgroup_acquire(cgrp_arg); // cg is PTR_TRUSTED and ref_obj_id > 0 bpf_kptr_xchg(&v->cgrp, cg); cg2 = v->cgrp; // This is new feature introduced by this patch. // cg2 is PTR_MAYBE_NULL | MEM_RCU. // When cg2 != NULL, it's a valid cgroup, but its percpu_ref could be zero if (cg2) bpf_cgroup_ancestor(cg2, level); // safe to do. } Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Tejun Heo Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20230303041446.3630-4-alexei.starovoitov@gmail.com --- include/linux/btf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/btf.h b/include/linux/btf.h index 49e0fe6d8274..556b3e2e7471 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -70,7 +70,7 @@ #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ #define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ -#define KF_RCU (1 << 7) /* kfunc only takes rcu pointer arguments */ +#define KF_RCU (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */ /* * Tag marking a kernel function as a kfunc. This is meant to minimize the -- cgit v1.2.3 From 6fcd486b3a0a628c41f12b3a7329a18a2c74b351 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 2 Mar 2023 20:14:46 -0800 Subject: bpf: Refactor RCU enforcement in the verifier. bpf_rcu_read_lock/unlock() are only available in clang compiled kernels. Lack of such key mechanism makes it impossible for sleepable bpf programs to use RCU pointers. Allow bpf_rcu_read_lock/unlock() in GCC compiled kernels (though GCC doesn't support btf_type_tag yet) and allowlist certain field dereferences in important data structures like tast_struct, cgroup, socket that are used by sleepable programs either as RCU pointer or full trusted pointer (which is valid outside of RCU CS). Use BTF_TYPE_SAFE_RCU and BTF_TYPE_SAFE_TRUSTED macros for such tagging. They will be removed once GCC supports btf_type_tag. With that refactor check_ptr_to_btf_access(). Make it strict in enforcing PTR_TRUSTED and PTR_UNTRUSTED while deprecating old PTR_TO_BTF_ID without modifier flags. There is a chance that this strict enforcement might break existing programs (especially on GCC compiled kernels), but this cleanup has to start sooner than later. Note PTR_TO_CTX access still yields old deprecated PTR_TO_BTF_ID. Once it's converted to strict PTR_TRUSTED or PTR_UNTRUSTED the kfuncs and helpers will be able to default to KF_TRUSTED_ARGS. KF_RCU will remain as a weaker version of KF_TRUSTED_ARGS where obj refcnt could be 0. Adjust rcu_read_lock selftest to run on gcc and clang compiled kernels. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: David Vernet Link: https://lore.kernel.org/bpf/20230303041446.3630-7-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 2 +- include/linux/bpf_verifier.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 23ec684e660d..d3456804f7aa 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2279,7 +2279,7 @@ struct bpf_core_ctx { bool btf_nested_type_is_trusted(struct bpf_verifier_log *log, const struct bpf_reg_state *reg, - int off); + int off, const char *suffix); bool btf_type_ids_nocast_alias(struct bpf_verifier_log *log, const struct btf *reg_btf, u32 reg_id, diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index b26ff2a8f63b..18538bad2b8c 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -537,7 +537,6 @@ struct bpf_verifier_env { bool bypass_spec_v1; bool bypass_spec_v4; bool seen_direct_write; - bool rcu_tag_supported; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ const struct bpf_line_info *prev_linfo; struct bpf_verifier_log log; -- cgit v1.2.3 From e768e3c5aab44ee63f58649d4c8cbbb3270e5c06 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Fri, 3 Mar 2023 15:15:42 +0100 Subject: bpf: Use separate RCU callbacks for freeing selem Martin suggested that instead of using a byte in the hole (which he has a use for in his future patch) in bpf_local_storage_elem, we can dispatch a different call_rcu callback based on whether we need to free special fields in bpf_local_storage_elem data. The free path, described in commit 9db44fdd8105 ("bpf: Support kptrs in local storage maps"), only waits for call_rcu callbacks when there are special (kptrs, etc.) fields in the map value, hence it is necessary that we only access smap in this case. Therefore, dispatch different RCU callbacks based on the BPF map has a valid btf_record, which dereference and use smap's btf_record only when it is valid. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230303141542.300068-1-memxor@gmail.com Signed-off-by: Martin KaFai Lau --- include/linux/bpf_local_storage.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 0fe92986412b..6d37a40cd90e 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -74,12 +74,6 @@ struct bpf_local_storage_elem { struct hlist_node snode; /* Linked to bpf_local_storage */ struct bpf_local_storage __rcu *local_storage; struct rcu_head rcu; - bool can_use_smap; /* Is it safe to access smap in bpf_selem_free_* RCU - * callbacks? bpf_local_storage_map_free only - * executes rcu_barrier when there are special - * fields, this field remembers that to ensure we - * don't access already freed smap in sdata. - */ /* 8 bytes hole */ /* The data is stored in another cacheline to minimize * the number of cachelines access during a cache hit. -- cgit v1.2.3 From 2d5bcdcda8799cf21f9ab84598c946dd320207a2 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 7 Mar 2023 08:14:06 -0700 Subject: bpf: Increase size of BTF_ID_LIST without CONFIG_DEBUG_INFO_BTF again After commit 66e3a13e7c2c ("bpf: Add bpf_dynptr_slice and bpf_dynptr_slice_rdwr"), clang builds without CONFIG_DEBUG_INFO_BTF warn: kernel/bpf/verifier.c:10298:24: warning: array index 16 is past the end of the array (that has type 'u32[16]' (aka 'unsigned int[16]')) [-Warray-bounds] meta.func_id == special_kfunc_list[KF_bpf_dynptr_slice_rdwr]) { ^ ~~~~~~~~~~~~~~~~~~~~~~~~ kernel/bpf/verifier.c:9150:1: note: array 'special_kfunc_list' declared here BTF_ID_LIST(special_kfunc_list) ^ include/linux/btf_ids.h:207:27: note: expanded from macro 'BTF_ID_LIST' #define BTF_ID_LIST(name) static u32 __maybe_unused name[16]; ^ 1 warning generated. A warning of this nature was previously addressed by commit beb3d47d1d3d ("bpf: Fix a BTF_ID_LIST bug with CONFIG_DEBUG_INFO_BTF not set") but there have been new kfuncs added since then. Quadruple the size of the CONFIG_DEBUG_INFO_BTF=n definition so that this problem is unlikely to show up for some time. Link: https://github.com/ClangBuiltLinux/linux/issues/1810 Signed-off-by: Nathan Chancellor Reviewed-by: Tom Rix Link: https://lore.kernel.org/r/20230307-bpf-kfuncs-warray-bounds-v1-1-00ad3191f3a6@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/btf_ids.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/btf_ids.h b/include/linux/btf_ids.h index 3a4f7cd882ca..00950cc03bff 100644 --- a/include/linux/btf_ids.h +++ b/include/linux/btf_ids.h @@ -204,7 +204,7 @@ extern struct btf_id_set8 name; #else -#define BTF_ID_LIST(name) static u32 __maybe_unused name[16]; +#define BTF_ID_LIST(name) static u32 __maybe_unused name[64]; #define BTF_ID(prefix, name) #define BTF_ID_FLAGS(prefix, name, ...) #define BTF_ID_UNUSED -- cgit v1.2.3 From 90a5527d7686d3ebe0dd2a831356a6c7d7dc31bc Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Mar 2023 12:45:58 +0000 Subject: bpf: add new map ops ->map_mem_usage Add a new map ops ->map_mem_usage to print the memory usage of a bpf map. This is a preparation for the followup change. Signed-off-by: Yafang Shao Link: https://lore.kernel.org/r/20230305124615.12358-2-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d3456804f7aa..9059520bbb5e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -161,6 +161,8 @@ struct bpf_map_ops { bpf_callback_t callback_fn, void *callback_ctx, u64 flags); + u64 (*map_mem_usage)(const struct bpf_map *map); + /* BTF id of struct allocated by map_alloc */ int *map_btf_id; -- cgit v1.2.3 From 7490b7f1c02ef825ef98f7230662049d4a464a21 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Mar 2023 12:46:11 +0000 Subject: bpf, net: bpf_local_storage memory usage A new helper is introduced into bpf_local_storage map to calculate the memory usage. This helper is also used by other maps like bpf_cgrp_storage, bpf_inode_storage, bpf_task_storage and etc. Note that currently the dynamically allocated storage elements are not counted in the usage, since it will take extra runtime overhead in the elements update or delete path. So let's put it aside now, and implement it in the future when someone really need it. Signed-off-by: Yafang Shao Link: https://lore.kernel.org/r/20230305124615.12358-15-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 6d37a40cd90e..d934248b8e81 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -164,5 +164,6 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, void *value, u64 map_flags, gfp_t gfp_flags); void bpf_local_storage_free_rcu(struct rcu_head *rcu); +u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map); #endif /* _BPF_LOCAL_STORAGE_H */ -- cgit v1.2.3 From 9629363cd05642fe43aded44938adec067ad1da3 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Sun, 5 Mar 2023 12:46:14 +0000 Subject: bpf: offload map memory usage A new helper is introduced to calculate offload map memory usage. But currently the memory dynamically allocated in netdev dev_ops, like nsim_map_update_elem, is not counted. Let's just put it aside now. Signed-off-by: Yafang Shao Link: https://lore.kernel.org/r/20230305124615.12358-18-laoar.shao@gmail.com Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9059520bbb5e..6792a7940e1e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -2624,6 +2624,7 @@ static inline bool bpf_map_is_offloaded(struct bpf_map *map) struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr); void bpf_map_offload_map_free(struct bpf_map *map); +u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map); int bpf_prog_test_run_syscall(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); @@ -2695,6 +2696,11 @@ static inline void bpf_map_offload_map_free(struct bpf_map *map) { } +static inline u64 bpf_map_offload_map_mem_usage(const struct bpf_map *map) +{ + return 0; +} + static inline int bpf_prog_test_run_syscall(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) -- cgit v1.2.3 From 0194b64578e905dc8f112e641a71c306bd58ddde Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Mon, 6 Mar 2023 22:51:35 +0100 Subject: net: phy: improve phy_read_poll_timeout cond sometimes is (val & MASK) what may result in a false positive if val is a negative errno. We shouldn't evaluate cond if val < 0. This has no functional impact here, but it's not nice. Therefore switch order of the checks. Signed-off-by: Heiner Kallweit Reviewed-by: Simon Horman Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/6d8274ac-4344-23b4-d9a3-cad4c39517d4@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/phy.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/phy.h b/include/linux/phy.h index 36bf0bbc8efa..fefd5091bc24 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1130,16 +1130,15 @@ static inline int phy_read(struct phy_device *phydev, u32 regnum) #define phy_read_poll_timeout(phydev, regnum, val, cond, sleep_us, \ timeout_us, sleep_before_read) \ ({ \ - int __ret = read_poll_timeout(phy_read, val, (cond) || val < 0, \ + int __ret = read_poll_timeout(phy_read, val, val < 0 || (cond), \ sleep_us, timeout_us, sleep_before_read, phydev, regnum); \ - if (val < 0) \ + if (val < 0) \ __ret = val; \ if (__ret) \ phydev_err(phydev, "%s failed: %d\n", __func__, __ret); \ __ret; \ }) - /** * __phy_read - convenience function for reading a given PHY register * @phydev: the phy_device struct -- cgit v1.2.3 From 40bbae583ec38ea31e728bf42a4ea72bded22ab6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Mar 2023 20:43:13 +0000 Subject: net: remove enum skb_free_reason enum skb_drop_reason is more generic, we can adopt it instead. Provide dev_kfree_skb_irq_reason() and dev_kfree_skb_any_reason(). This means drivers can use more precise drop reasons if they want to. Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Reviewed-by: Yunsheng Lin Link: https://lore.kernel.org/r/20230306204313.10492-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 6a14b7b11766..ee483071cf59 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -52,6 +52,7 @@ #include #include #include +#include struct netpoll_info; struct device; @@ -3804,13 +3805,8 @@ static inline unsigned int get_netdev_rx_queue_index( int netif_get_num_default_rss_queues(void); -enum skb_free_reason { - SKB_REASON_CONSUMED, - SKB_REASON_DROPPED, -}; - -void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason); -void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason); +void dev_kfree_skb_irq_reason(struct sk_buff *skb, enum skb_drop_reason reason); +void dev_kfree_skb_any_reason(struct sk_buff *skb, enum skb_drop_reason reason); /* * It is not allowed to call kfree_skb() or consume_skb() from hardware @@ -3833,22 +3829,22 @@ void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason); */ static inline void dev_kfree_skb_irq(struct sk_buff *skb) { - __dev_kfree_skb_irq(skb, SKB_REASON_DROPPED); + dev_kfree_skb_irq_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED); } static inline void dev_consume_skb_irq(struct sk_buff *skb) { - __dev_kfree_skb_irq(skb, SKB_REASON_CONSUMED); + dev_kfree_skb_irq_reason(skb, SKB_CONSUMED); } static inline void dev_kfree_skb_any(struct sk_buff *skb) { - __dev_kfree_skb_any(skb, SKB_REASON_DROPPED); + dev_kfree_skb_any_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED); } static inline void dev_consume_skb_any(struct sk_buff *skb) { - __dev_kfree_skb_any(skb, SKB_REASON_CONSUMED); + dev_kfree_skb_any_reason(skb, SKB_CONSUMED); } u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, -- cgit v1.2.3 From 10369080454d87ee5b2db211ce947cb3118f0e13 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 7 Mar 2023 14:59:59 +0000 Subject: net: reclaim skb->scm_io_uring bit Commit 0091bfc81741 ("io_uring/af_unix: defer registered files gc to io_uring release") added one bit to struct sk_buff. This structure is critical for networking, and we try very hard to not add bloat on it, unless absolutely required. For instance, we can use a specific destructor as a wrapper around unix_destruct_scm(), to identify skbs that unix_gc() has to special case. Signed-off-by: Eric Dumazet Cc: Pavel Begunkov Cc: Thadeu Lima de Souza Cascardo Cc: Jens Axboe Reviewed-by: Jens Axboe Reviewed-by: Pavel Begunkov Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ff7ad331fb82..fe661011644b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -810,7 +810,6 @@ typedef unsigned char *sk_buff_data_t; * @csum_level: indicates the number of consecutive checksums found in * the packet minus one that have been verified as * CHECKSUM_UNNECESSARY (max 3) - * @scm_io_uring: SKB holds io_uring registered files * @dst_pending_confirm: need to confirm neighbour * @decrypted: Decrypted SKB * @slow_gro: state present at GRO time, slower prepare step required @@ -989,7 +988,6 @@ struct sk_buff { #endif __u8 slow_gro:1; __u8 csum_not_inet:1; - __u8 scm_io_uring:1; #ifdef CONFIG_NET_SCHED __u16 tc_index; /* traffic control index */ -- cgit v1.2.3 From 28e144cf5f72ce1c304571bc448e37c27495903a Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 7 Mar 2023 16:31:30 -0500 Subject: netfilter: move br_nf_check_hbh_len to utils Rename br_nf_check_hbh_len() to nf_ip6_check_hbh_len() and move it to netfilter utils, so that it can be used by other modules, like ovs and tc. Signed-off-by: Xin Long Reviewed-by: Simon Horman Reviewed-by: Nikolay Aleksandrov Reviewed-by: Aaron Conole Signed-off-by: Florian Westphal --- include/linux/netfilter_ipv6.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h index 48314ade1506..7834c0be2831 100644 --- a/include/linux/netfilter_ipv6.h +++ b/include/linux/netfilter_ipv6.h @@ -197,6 +197,8 @@ static inline int nf_cookie_v6_check(const struct ipv6hdr *iph, __sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook, unsigned int dataoff, u_int8_t protocol); +int nf_ip6_check_hbh_len(struct sk_buff *skb, u32 *plen); + int ipv6_netfilter_init(void); void ipv6_netfilter_fini(void); -- cgit v1.2.3 From 215bf4962f6c9605710012fad222a5fec001b3ad Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 8 Mar 2023 10:41:15 -0800 Subject: bpf: add iterator kfuncs registration and validation logic Add ability to register kfuncs that implement BPF open-coded iterator contract and enforce naming and function proto convention. Enforcement happens at the time of kfunc registration and significantly simplifies the rest of iterators logic in the verifier. More details follow in subsequent patches, but we enforce the following conditions. All kfuncs (constructor, next, destructor) have to be named consistenly as bpf_iter__{new,next,destroy}(), respectively. represents iterator type, and iterator state should be represented as a matching `struct bpf_iter_` state type. Also, all iter kfuncs should have a pointer to this `struct bpf_iter_` as the very first argument. Additionally: - Constructor, i.e., bpf_iter__new(), can have arbitrary extra number of arguments. Return type is not enforced either. - Next method, i.e., bpf_iter__next(), has to return a pointer type and should have exactly one argument: `struct bpf_iter_ *` (const/volatile/restrict and typedefs are ignored). - Destructor, i.e., bpf_iter__destroy(), should return void and should have exactly one argument, similar to the next method. - struct bpf_iter_ size is enforced to be positive and a multiple of 8 bytes (to fit stack slots correctly). Such strictness and consistency allows to build generic helpers abstracting important, but boilerplate, details to be able to use open-coded iterators effectively and ergonomically (see bpf_for_each() in subsequent patches). It also simplifies the verifier logic in some places. At the same time, this doesn't hurt generality of possible iterator implementations. Win-win. Constructor kfunc is marked with a new KF_ITER_NEW flags, next method is marked with KF_ITER_NEXT (and should also have KF_RET_NULL, of course), while destructor kfunc is marked as KF_ITER_DESTROY. Additionally, we add a trivial kfunc name validation: it should be a valid non-NULL and non-empty string. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230308184121.1165081-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 2 ++ include/linux/btf.h | 4 ++++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 18538bad2b8c..e2dc7f064449 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -59,6 +59,8 @@ struct bpf_active_lock { u32 id; }; +#define ITER_PREFIX "bpf_iter_" + struct bpf_reg_state { /* Ordering of fields matters. See states_equal() */ enum bpf_reg_type type; diff --git a/include/linux/btf.h b/include/linux/btf.h index 556b3e2e7471..1bba0827e8c4 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -71,6 +71,10 @@ #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ #define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ #define KF_RCU (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */ +/* only one of KF_ITER_{NEW,NEXT,DESTROY} could be specified per kfunc */ +#define KF_ITER_NEW (1 << 8) /* kfunc implements BPF iter constructor */ +#define KF_ITER_NEXT (1 << 9) /* kfunc implements BPF iter next method */ +#define KF_ITER_DESTROY (1 << 10) /* kfunc implements BPF iter destructor */ /* * Tag marking a kernel function as a kfunc. This is meant to minimize the -- cgit v1.2.3 From 06accc8779c1d558a5b5a21f2ac82b0c95827ddd Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 8 Mar 2023 10:41:16 -0800 Subject: bpf: add support for open-coded iterator loops Teach verifier about the concept of the open-coded (or inline) iterators. This patch adds generic iterator loop verification logic, new STACK_ITER stack slot type to contain iterator state, and necessary kfunc plumbing for iterator's constructor, destructor and next methods. Next patch implements first specific iterator (numbers iterator for implementing for() loop logic). Such split allows to have more focused commits for verifier logic and separate commit that we could point later to demonstrating what does it take to add a new kind of iterator. Each kind of iterator has its own associated struct bpf_iter_, where denotes a specific type of iterator. struct bpf_iter_ state is supposed to live on BPF program stack, so there will be no way to change its size later on without breaking backwards compatibility, so choose wisely! But given this struct is specific to a given of iterator, this allows a lot of flexibility: simple iterators could be fine with just one stack slot (8 bytes), like numbers iterator in the next patch, while some other more complicated iterators might need way more to keep their iterator state. Either way, such design allows to avoid runtime memory allocations, which otherwise would be necessary if we fixed on-the-stack size and it turned out to be too small for a given iterator implementation. The way BPF verifier logic is implemented, there are no artificial restrictions on a number of active iterators, it should work correctly using multiple active iterators at the same time. This also means you can have multiple nested iteration loops. struct bpf_iter_ reference can be safely passed to subprograms as well. General flow is easiest to demonstrate with a simple example using number iterator implemented in next patch. Here's the simplest possible loop: struct bpf_iter_num it; int *v; bpf_iter_num_new(&it, 2, 5); while ((v = bpf_iter_num_next(&it))) { bpf_printk("X = %d", *v); } bpf_iter_num_destroy(&it); Above snippet should output "X = 2", "X = 3", "X = 4". Note that 5 is exclusive and is not returned. This matches similar APIs (e.g., slices in Go or Rust) that implement a range of elements, where end index is non-inclusive. In the above example, we see a trio of function: - constructor, bpf_iter_num_new(), which initializes iterator state (struct bpf_iter_num it) on the stack. If any of the input arguments are invalid, constructor should make sure to still initialize it such that subsequent bpf_iter_num_next() calls will return NULL. I.e., on error, return error and construct empty iterator. - next method, bpf_iter_num_next(), which accepts pointer to iterator state and produces an element. Next method should always return a pointer. The contract between BPF verifier is that next method will always eventually return NULL when elements are exhausted. Once NULL is returned, subsequent next calls should keep returning NULL. In the case of numbers iterator, bpf_iter_num_next() returns a pointer to an int (storage for this integer is inside the iterator state itself), which can be dereferenced after corresponding NULL check. - once done with the iterator, it's mandated that user cleans up its state with the call to destructor, bpf_iter_num_destroy() in this case. Destructor frees up any resources and marks stack space used by struct bpf_iter_num as usable for something else. Any other iterator implementation will have to implement at least these three methods. It is enforced that for any given type of iterator only applicable constructor/destructor/next are callable. I.e., verifier ensures you can't pass number iterator state into, say, cgroup iterator's next method. It is important to keep the naming pattern consistent to be able to create generic macros to help with BPF iter usability. E.g., one of the follow up patches adds generic bpf_for_each() macro to bpf_misc.h in selftests, which allows to utilize iterator "trio" nicely without having to code the above somewhat tedious loop explicitly every time. This is enforced at kfunc registration point by one of the previous patches in this series. At the implementation level, iterator state tracking for verification purposes is very similar to dynptr. We add STACK_ITER stack slot type, reserve necessary number of slots, depending on sizeof(struct bpf_iter_), and keep track of necessary extra state in the "main" slot, which is marked with non-zero ref_obj_id. Other slots are also marked as STACK_ITER, but have zero ref_obj_id. This is simpler than having a separate "is_first_slot" flag. Another big distinction is that STACK_ITER is *always refcounted*, which simplifies implementation without sacrificing usability. So no need for extra "iter_id", no need to anticipate reuse of STACK_ITER slots for new constructors, etc. Keeping it simple here. As far as the verification logic goes, there are two extensive comments: in process_iter_next_call() and iter_active_depths_differ() explaining some important and sometimes subtle aspects. Please refer to them for details. But from 10,000-foot point of view, next methods are the points of forking a verification state, which are conceptually similar to what verifier is doing when validating conditional jump. We branch out at a `call bpf_iter__next` instruction and simulate two outcomes: NULL (iteration is done) and non-NULL (new element is returned). NULL is simulated first and is supposed to reach exit without looping. After that non-NULL case is validated and it either reaches exit (for trivial examples with no real loop), or reaches another `call bpf_iter__next` instruction with the state equivalent to already (partially) validated one. State equivalency at that point means we technically are going to be looping forever without "breaking out" out of established "state envelope" (i.e., subsequent iterations don't add any new knowledge or constraints to the verifier state, so running 1, 2, 10, or a million of them doesn't matter). But taking into account the contract stating that iterator next method *has to* return NULL eventually, we can conclude that loop body is safe and will eventually terminate. Given we validated logic outside of the loop (NULL case), and concluded that loop body is safe (though potentially looping many times), verifier can claim safety of the overall program logic. The rest of the patch is necessary plumbing for state tracking, marking, validation, and necessary further kfunc plumbing to allow implementing iterator constructor, destructor, and next methods. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230308184121.1165081-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e2dc7f064449..0c052bc79940 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -61,6 +61,12 @@ struct bpf_active_lock { #define ITER_PREFIX "bpf_iter_" +enum bpf_iter_state { + BPF_ITER_STATE_INVALID, /* for non-first slot */ + BPF_ITER_STATE_ACTIVE, + BPF_ITER_STATE_DRAINED, +}; + struct bpf_reg_state { /* Ordering of fields matters. See states_equal() */ enum bpf_reg_type type; @@ -105,6 +111,18 @@ struct bpf_reg_state { bool first_slot; } dynptr; + /* For bpf_iter stack slots */ + struct { + /* BTF container and BTF type ID describing + * struct bpf_iter_ of an iterator state + */ + struct btf *btf; + u32 btf_id; + /* packing following two fields to fit iter state into 16 bytes */ + enum bpf_iter_state state:2; + int depth:30; + } iter; + /* Max size from any of the above. */ struct { unsigned long raw1; @@ -143,6 +161,8 @@ struct bpf_reg_state { * same reference to the socket, to determine proper reference freeing. * For stack slots that are dynptrs, this is used to track references to * the dynptr to determine proper reference freeing. + * Similarly to dynptrs, we use ID to track "belonging" of a reference + * to a specific instance of bpf_iter. */ u32 id; /* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned @@ -213,9 +233,11 @@ enum bpf_stack_slot_type { * is stored in bpf_stack_state->spilled_ptr.dynptr.type */ STACK_DYNPTR, + STACK_ITER, }; #define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ + #define BPF_DYNPTR_SIZE sizeof(struct bpf_dynptr_kern) #define BPF_DYNPTR_NR_SLOTS (BPF_DYNPTR_SIZE / BPF_REG_SIZE) @@ -450,6 +472,7 @@ struct bpf_insn_aux_data { bool sanitize_stack_spill; /* subject to Spectre v4 sanitation */ bool zext_dst; /* this insn zero extends dst reg */ bool storage_get_func_atomic; /* bpf_*_storage_get() with atomic memory alloc */ + bool is_iter_next; /* bpf_iter__next() kfunc call */ u8 alu_state; /* used in combination with alu_limit */ /* below fields are initialized once */ -- cgit v1.2.3 From 6018e1f407cccf39b804d1f75ad4de7be4e6cc45 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 8 Mar 2023 10:41:17 -0800 Subject: bpf: implement numbers iterator Implement the first open-coded iterator type over a range of integers. It's public API consists of: - bpf_iter_num_new() constructor, which accepts [start, end) range (that is, start is inclusive, end is exclusive). - bpf_iter_num_next() which will keep returning read-only pointer to int until the range is exhausted, at which point NULL will be returned. If bpf_iter_num_next() is kept calling after this, NULL will be persistently returned. - bpf_iter_num_destroy() destructor, which needs to be called at some point to clean up iterator state. BPF verifier enforces that iterator destructor is called at some point before BPF program exits. Note that `start = end = X` is a valid combination to setup an empty iterator. bpf_iter_num_new() will return 0 (success) for any such combination. If bpf_iter_num_new() detects invalid combination of input arguments, it returns error, resets iterator state to, effectively, empty iterator, so any subsequent call to bpf_iter_num_next() will keep returning NULL. BPF verifier has no knowledge that returned integers are in the [start, end) value range, as both `start` and `end` are not statically known and enforced: they are runtime values. While the implementation is pretty trivial, some care needs to be taken to avoid overflows and underflows. Subsequent selftests will validate correctness of [start, end) semantics, especially around extremes (INT_MIN and INT_MAX). Similarly to bpf_loop(), we enforce that no more than BPF_MAX_LOOPS can be specified. bpf_iter_num_{new,next,destroy}() is a logical evolution from bounded BPF loops and bpf_loop() helper and is the basis for implementing ergonomic BPF loops with no statically known or verified bounds. Subsequent patches implement bpf_for() macro, demonstrating how this can be wrapped into something that works and feels like a normal for() loop in C language. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230308184121.1165081-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6792a7940e1e..e64ff1e89fb2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1617,8 +1617,12 @@ struct bpf_array { #define BPF_COMPLEXITY_LIMIT_INSNS 1000000 /* yes. 1M insns */ #define MAX_TAIL_CALL_CNT 33 -/* Maximum number of loops for bpf_loop */ -#define BPF_MAX_LOOPS BIT(23) +/* Maximum number of loops for bpf_loop and bpf_iter_num. + * It's enum to expose it (and thus make it discoverable) through BTF. + */ +enum { + BPF_MAX_LOOPS = 8 * 1024 * 1024, +}; #define BPF_F_ACCESS_MASK (BPF_F_RDONLY | \ BPF_F_RDONLY_PROG | \ -- cgit v1.2.3 From 6978052448f9eb19f7b03243ac0416104e5ee50d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 8 Mar 2023 15:20:06 +0100 Subject: netlink: remove unused 'compare' function No users in the tree. Tested with allmodconfig build. Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20230308142006.20879-1-fw@strlen.de Signed-off-by: Jakub Kicinski --- include/linux/netlink.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index c43ac7690eca..3e8743252167 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -50,7 +50,6 @@ struct netlink_kernel_cfg { struct mutex *cb_mutex; int (*bind)(struct net *net, int group); void (*unbind)(struct net *net, int group); - bool (*compare)(struct net *net, struct sock *sk); }; struct sock *__netlink_kernel_create(struct net *net, int unit, -- cgit v1.2.3 From 4b5ce570dbef57a20acdd71b0c65376009012354 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 9 Mar 2023 22:01:49 -0800 Subject: bpf: ensure state checkpointing at iter_next() call sites State equivalence check and checkpointing performed in is_state_visited() employs certain heuristics to try to save memory by avoiding state checkpoints if not enough jumps and instructions happened since last checkpoint. This leads to unpredictability of whether a particular instruction will be checkpointed and how regularly. While normally this is not causing much problems (except inconveniences for predictable verifier tests, which we overcome with BPF_F_TEST_STATE_FREQ flag), turns out it's not the case for open-coded iterators. Checking and saving state checkpoints at iter_next() call is crucial for fast convergence of open-coded iterator loop logic, so we need to force it. If we don't do that, is_state_visited() might skip saving a checkpoint, causing unnecessarily long sequence of not checkpointed instructions and jumps, leading to exhaustion of jump history buffer, and potentially other undesired outcomes. It is expected that with correct open-coded iterators convergence will happen quickly, so we don't run a risk of exhausting memory. This patch adds, in addition to prune and jump instruction marks, also a "forced checkpoint" mark, and makes sure that any iter_next() call instruction is marked as such. Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20230310060149.625887-1-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf_verifier.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 0c052bc79940..81d525d057c7 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -477,8 +477,12 @@ struct bpf_insn_aux_data { /* below fields are initialized once */ unsigned int orig_idx; /* original instruction index */ - bool prune_point; bool jmp_point; + bool prune_point; + /* ensure we check state equivalence and save state checkpoint and + * this instruction, regardless of any heuristics + */ + bool force_checkpoint; }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ -- cgit v1.2.3 From 4cbd23cc92c49173e402753cab62b8a7754ed18f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 7 Mar 2023 22:59:20 -0800 Subject: bpf: Move a few bpf_local_storage functions to static scope This patch moves the bpf_local_storage_free_rcu() and bpf_selem_unlink_map() to static because they are not used outside of bpf_local_storage.c. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230308065936.1550103-2-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index d934248b8e81..502ad7093f13 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -147,8 +147,6 @@ void bpf_selem_unlink(struct bpf_local_storage_elem *selem, bool use_trace_rcu); void bpf_selem_link_map(struct bpf_local_storage_map *smap, struct bpf_local_storage_elem *selem); -void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem); - struct bpf_local_storage_elem * bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value, bool charge_mem, gfp_t gfp_flags); @@ -163,7 +161,6 @@ struct bpf_local_storage_data * bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap, void *value, u64 map_flags, gfp_t gfp_flags); -void bpf_local_storage_free_rcu(struct rcu_head *rcu); u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map); #endif /* _BPF_LOCAL_STORAGE_H */ -- cgit v1.2.3 From 2ffcb6fc50174d1efc8f98633eb2647d84483c68 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 7 Mar 2023 22:59:21 -0800 Subject: bpf: Refactor codes into bpf_local_storage_destroy This patch first renames bpf_local_storage_unlink_nolock to bpf_local_storage_destroy(). It better reflects that it is only used when the storage's owner (sk/task/cgrp/inode) is being kfree(). All bpf_local_storage_destroy's caller is taking the spin lock and then free the storage. This patch also moves these two steps into the bpf_local_storage_destroy. This is a preparation work for a later patch that uses bpf_mem_cache_alloc/free in the bpf_local_storage. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230308065936.1550103-3-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 502ad7093f13..5908a954ddc2 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -128,7 +128,7 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage, struct bpf_local_storage_map *smap, bool cacheit_lockit); -bool bpf_local_storage_unlink_nolock(struct bpf_local_storage *local_storage); +void bpf_local_storage_destroy(struct bpf_local_storage *local_storage); void bpf_local_storage_map_free(struct bpf_map *map, struct bpf_local_storage_cache *cache, -- cgit v1.2.3 From fc6652aab6ad545de70b772550da9043d0b47f1c Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Tue, 7 Mar 2023 22:59:24 -0800 Subject: bpf: Remember smap in bpf_local_storage This patch remembers which smap triggers the allocation of a 'struct bpf_local_storage' object. The local_storage is allocated during the very first selem added to the owner. The smap pointer is needed when using the bpf_mem_cache_free in a later patch because it needs to free to the correct smap's bpf_mem_alloc object. When a selem is being removed, it needs to check if it is the selem that triggers the creation of the local_storage. If it is, the local_storage->smap pointer will be reset to NULL. This NULL reset is done under the local_storage->lock in bpf_selem_unlink_storage_nolock() when a selem is being removed. Also note that the local_storage may not go away even local_storage->smap is NULL because there may be other selem still stored in the local_storage. Signed-off-by: Martin KaFai Lau Link: https://lore.kernel.org/r/20230308065936.1550103-6-martin.lau@linux.dev Signed-off-by: Alexei Starovoitov --- include/linux/bpf_local_storage.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h index 5908a954ddc2..613b1805ed9f 100644 --- a/include/linux/bpf_local_storage.h +++ b/include/linux/bpf_local_storage.h @@ -83,6 +83,7 @@ struct bpf_local_storage_elem { struct bpf_local_storage { struct bpf_local_storage_data __rcu *cache[BPF_LOCAL_STORAGE_CACHE