From 146b6f1112eb30a19776d6c323c994e9d67790db Mon Sep 17 00:00:00 2001 From: Jinghao Jia Date: Sat, 23 Nov 2024 03:42:56 -0600 Subject: ipvs: fix UB due to uninitialized stack access in ip_vs_protocol_init() Under certain kernel configurations when building with Clang/LLVM, the compiler does not generate a return or jump as the terminator instruction for ip_vs_protocol_init(), triggering the following objtool warning during build time: vmlinux.o: warning: objtool: ip_vs_protocol_init() falls through to next function __initstub__kmod_ip_vs_rr__935_123_ip_vs_rr_init6() At runtime, this either causes an oops when trying to load the ipvs module or a boot-time panic if ipvs is built-in. This same issue has been reported by the Intel kernel test robot previously. Digging deeper into both LLVM and the kernel code reveals this to be an undefined behavior problem. ip_vs_protocol_init() uses an on-stack buffer of 64 chars to store the registered protocol names and leaves it uninitialized after definition. The function calls strnlen() when concatenating protocol names into the buffer. With CONFIG_FORTIFY_SOURCE strnlen() performs an extra step to check whether the last byte of the input char buffer is a null character (commit 3009f891bb9f ("fortify: Allow strlen() and strnlen() to pass compile-time known lengths")). This, together with possibly other configurations, causes the following IR to be generated: define hidden i32 @ip_vs_protocol_init() local_unnamed_addr #5 section ".init.text" align 16 !kcfi_type !29 { %1 = alloca [64 x i8], align 16 ... 14: ; preds = %11 %15 = getelementptr inbounds i8, ptr %1, i64 63 %16 = load i8, ptr %15, align 1 %17 = tail call i1 @llvm.is.constant.i8(i8 %16) %18 = icmp eq i8 %16, 0 %19 = select i1 %17, i1 %18, i1 false br i1 %19, label %20, label %23 20: ; preds = %14 %21 = call i64 @strlen(ptr noundef nonnull dereferenceable(1) %1) #23 ... 
23: ; preds = %14, %11, %20 %24 = call i64 @strnlen(ptr noundef nonnull dereferenceable(1) %1, i64 noundef 64) #24 ... } The above code calculates the address of the last char in the buffer (value %15) and then loads from it (value %16). Because the buffer is never initialized, the LLVM GVN pass marks value %16 as undefined: %13 = getelementptr inbounds i8, ptr %1, i64 63 br i1 undef, label %14, label %17 This gives later passes (SCCP, in particular) more DCE opportunities by propagating the undef value further, and eventually removes everything after the load on the uninitialized stack location: define hidden i32 @ip_vs_protocol_init() local_unnamed_addr #0 section ".init.text" align 16 !kcfi_type !11 { %1 = alloca [64 x i8], align 16 ... 12: ; preds = %11 %13 = getelementptr inbounds i8, ptr %1, i64 63 unreachable } In this way, the generated native code will just fall through to the next function, as LLVM does not generate any code for the unreachable IR instruction and leaves the function without a terminator. Zero the on-stack buffer to avoid this possible UB. 
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202402100205.PWXIz1ZK-lkp@intel.com/ Co-developed-by: Ruowen Qin Signed-off-by: Ruowen Qin Signed-off-by: Jinghao Jia Acked-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_proto.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index f100da4ba3bc..a9fd1d3fc2cb 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -340,7 +340,7 @@ void __net_exit ip_vs_protocol_net_cleanup(struct netns_ipvs *ipvs) int __init ip_vs_protocol_init(void) { - char protocols[64]; + char protocols[64] = { 0 }; #define REGISTER_PROTOCOL(p) \ do { \ register_ip_vs_protocol(p); \ @@ -348,8 +348,6 @@ int __init ip_vs_protocol_init(void) strcat(protocols, (p)->name); \ } while (0) - protocols[0] = '\0'; - protocols[2] = '\0'; #ifdef CONFIG_IP_VS_PROTO_TCP REGISTER_PROTOCOL(&ip_vs_protocol_tcp); #endif -- cgit v1.2.3 From 04317f4eb2aad312ad85c1a17ad81fe75f1f9bc7 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Thu, 21 Nov 2024 09:55:42 +0300 Subject: netfilter: x_tables: fix LED ID check in led_tg_check() Syzbot has reported the following BUG detected by KASAN: BUG: KASAN: slab-out-of-bounds in strlen+0x58/0x70 Read of size 1 at addr ffff8881022da0c8 by task repro/5879 ... Call Trace: dump_stack_lvl+0x241/0x360 ? __pfx_dump_stack_lvl+0x10/0x10 ? __pfx__printk+0x10/0x10 ? _printk+0xd5/0x120 ? __virt_addr_valid+0x183/0x530 ? __virt_addr_valid+0x183/0x530 print_report+0x169/0x550 ? __virt_addr_valid+0x183/0x530 ? __virt_addr_valid+0x183/0x530 ? __virt_addr_valid+0x45f/0x530 ? __phys_addr+0xba/0x170 ? strlen+0x58/0x70 kasan_report+0x143/0x180 ? strlen+0x58/0x70 strlen+0x58/0x70 kstrdup+0x20/0x80 led_tg_check+0x18b/0x3c0 xt_check_target+0x3bb/0xa40 ? __pfx_xt_check_target+0x10/0x10 ? 
stack_depot_save_flags+0x6e4/0x830 ? nft_target_init+0x174/0xc30 nft_target_init+0x82d/0xc30 ? __pfx_nft_target_init+0x10/0x10 ? nf_tables_newrule+0x1609/0x2980 ? nf_tables_newrule+0x1609/0x2980 ? rcu_is_watching+0x15/0xb0 ? nf_tables_newrule+0x1609/0x2980 ? nf_tables_newrule+0x1609/0x2980 ? __kmalloc_noprof+0x21a/0x400 nf_tables_newrule+0x1860/0x2980 ? __pfx_nf_tables_newrule+0x10/0x10 ? __nla_parse+0x40/0x60 nfnetlink_rcv+0x14e5/0x2ab0 ? __pfx_validate_chain+0x10/0x10 ? __pfx_nfnetlink_rcv+0x10/0x10 ? __lock_acquire+0x1384/0x2050 ? netlink_deliver_tap+0x2e/0x1b0 ? __pfx_lock_release+0x10/0x10 ? netlink_deliver_tap+0x2e/0x1b0 netlink_unicast+0x7f8/0x990 ? __pfx_netlink_unicast+0x10/0x10 ? __virt_addr_valid+0x183/0x530 ? __check_object_size+0x48e/0x900 netlink_sendmsg+0x8e4/0xcb0 ? __pfx_netlink_sendmsg+0x10/0x10 ? aa_sock_msg_perm+0x91/0x160 ? __pfx_netlink_sendmsg+0x10/0x10 __sock_sendmsg+0x223/0x270 ____sys_sendmsg+0x52a/0x7e0 ? __pfx_____sys_sendmsg+0x10/0x10 __sys_sendmsg+0x292/0x380 ? __pfx___sys_sendmsg+0x10/0x10 ? lockdep_hardirqs_on_prepare+0x43d/0x780 ? __pfx_lockdep_hardirqs_on_prepare+0x10/0x10 ? exc_page_fault+0x590/0x8c0 ? do_syscall_64+0xb6/0x230 do_syscall_64+0xf3/0x230 entry_SYSCALL_64_after_hwframe+0x77/0x7f ... Since an invalid (without '\0' byte at all) byte sequence may be passed from userspace, add an extra check to ensure that such a sequence is rejected as possible ID and so never passed to 'kstrdup()' and further. 
Reported-by: syzbot+6c8215822f35fdb35667@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=6c8215822f35fdb35667 Fixes: 268cb38e1802 ("netfilter: x_tables: add LED trigger target") Signed-off-by: Dmitry Antipov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_LED.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/xt_LED.c b/net/netfilter/xt_LED.c index f7b0286d106a..8a80fd76fe45 100644 --- a/net/netfilter/xt_LED.c +++ b/net/netfilter/xt_LED.c @@ -96,7 +96,9 @@ static int led_tg_check(const struct xt_tgchk_param *par) struct xt_led_info_internal *ledinternal; int err; - if (ledinfo->id[0] == '\0') + /* Bail out if empty string or not a string at all. */ + if (ledinfo->id[0] == '\0' || + !memchr(ledinfo->id, '\0', sizeof(ledinfo->id))) return -EINVAL; mutex_lock(&xt_led_mutex); -- cgit v1.2.3 From b7529880cb961d515642ce63f9d7570869bbbdc3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 26 Nov 2024 11:59:06 +0100 Subject: netfilter: nft_socket: remove WARN_ON_ONCE on maximum cgroup level cgroup maximum depth is INT_MAX by default, there is a cgroup toggle to restrict this maximum depth to a more reasonable value not to harm performance. Remove unnecessary WARN_ON_ONCE which is reachable from userspace. 
Fixes: 7f3287db6543 ("netfilter: nft_socket: make cgroupsv2 matching work with namespaces") Reported-by: syzbot+57bac0866ddd99fe47c0@syzkaller.appspotmail.com Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index f5da0c1775f2..35d0409b0095 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -68,7 +68,7 @@ static noinline int nft_socket_cgroup_subtree_level(void) cgroup_put(cgrp); - if (WARN_ON_ONCE(level > 255)) + if (level > 255) return -ERANGE; if (WARN_ON_ONCE(level < 0)) -- cgit v1.2.3 From 0a4cc4accf00b49b4728bb7639cb90a6a5b674e2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 25 Nov 2024 09:30:39 +0000 Subject: tcp: populate XPS related fields of timewait sockets syzbot reported that netdev_core_pick_tx() was reading an uninitialized field [1]. This is indeed happening for timewait sockets after recent commits. We can copy the original established socket sk_tx_queue_mapping and sk_rx_queue_mapping fields, instead of adding more checks in fast paths. As a bonus, packets will use the same transmit queue as prior ones, which can potentially avoid reordering. 
[1] BUG: KMSAN: uninit-value in netdev_pick_tx+0x5c7/0x1550 netdev_pick_tx+0x5c7/0x1550 netdev_core_pick_tx+0x1d2/0x4a0 net/core/dev.c:4312 __dev_queue_xmit+0x128a/0x57d0 net/core/dev.c:4394 dev_queue_xmit include/linux/netdevice.h:3168 [inline] neigh_hh_output include/net/neighbour.h:523 [inline] neigh_output include/net/neighbour.h:537 [inline] ip_finish_output2+0x187c/0x1b70 net/ipv4/ip_output.c:236 __ip_finish_output+0x287/0x810 ip_finish_output+0x4b/0x600 net/ipv4/ip_output.c:324 NF_HOOK_COND include/linux/netfilter.h:303 [inline] ip_output+0x15f/0x3f0 net/ipv4/ip_output.c:434 dst_output include/net/dst.h:450 [inline] ip_local_out net/ipv4/ip_output.c:130 [inline] ip_send_skb net/ipv4/ip_output.c:1505 [inline] ip_push_pending_frames+0x444/0x570 net/ipv4/ip_output.c:1525 ip_send_unicast_reply+0x18c1/0x1b30 net/ipv4/ip_output.c:1672 tcp_v4_send_reset+0x238d/0x2a40 net/ipv4/tcp_ipv4.c:910 tcp_v4_rcv+0x48f8/0x5750 net/ipv4/tcp_ipv4.c:2431 ip_protocol_deliver_rcu+0x2a3/0x13d0 net/ipv4/ip_input.c:205 ip_local_deliver_finish+0x336/0x500 net/ipv4/ip_input.c:233 NF_HOOK include/linux/netfilter.h:314 [inline] ip_local_deliver+0x21f/0x490 net/ipv4/ip_input.c:254 dst_input include/net/dst.h:460 [inline] ip_sublist_rcv_finish net/ipv4/ip_input.c:578 [inline] ip_list_rcv_finish net/ipv4/ip_input.c:628 [inline] ip_sublist_rcv+0x15f3/0x17f0 net/ipv4/ip_input.c:636 ip_list_rcv+0x9ef/0xa40 net/ipv4/ip_input.c:670 __netif_receive_skb_list_ptype net/core/dev.c:5715 [inline] __netif_receive_skb_list_core+0x15c5/0x1670 net/core/dev.c:5762 __netif_receive_skb_list net/core/dev.c:5814 [inline] netif_receive_skb_list_internal+0x1085/0x1700 net/core/dev.c:5905 gro_normal_list include/net/gro.h:515 [inline] napi_complete_done+0x3d4/0x810 net/core/dev.c:6256 virtqueue_napi_complete drivers/net/virtio_net.c:758 [inline] virtnet_poll+0x5d80/0x6bf0 drivers/net/virtio_net.c:3013 __napi_poll+0xe7/0x980 net/core/dev.c:6877 napi_poll net/core/dev.c:6946 [inline] net_rx_action+0xa5a/0x19b0 
net/core/dev.c:7068 handle_softirqs+0x1a0/0x7c0 kernel/softirq.c:554 __do_softirq kernel/softirq.c:588 [inline] invoke_softirq kernel/softirq.c:428 [inline] __irq_exit_rcu+0x68/0x180 kernel/softirq.c:655 irq_exit_rcu+0x12/0x20 kernel/softirq.c:671 common_interrupt+0x97/0xb0 arch/x86/kernel/irq.c:278 asm_common_interrupt+0x2b/0x40 arch/x86/include/asm/idtentry.h:693 __preempt_count_sub arch/x86/include/asm/preempt.h:84 [inline] kmsan_virt_addr_valid arch/x86/include/asm/kmsan.h:95 [inline] virt_to_page_or_null+0xfb/0x150 mm/kmsan/shadow.c:75 kmsan_get_metadata+0x13e/0x1c0 mm/kmsan/shadow.c:141 kmsan_get_shadow_origin_ptr+0x4d/0xb0 mm/kmsan/shadow.c:102 get_shadow_origin_ptr mm/kmsan/instrumentation.c:38 [inline] __msan_metadata_ptr_for_store_4+0x27/0x40 mm/kmsan/instrumentation.c:93 rcu_preempt_read_enter kernel/rcu/tree_plugin.h:390 [inline] __rcu_read_lock+0x46/0x70 kernel/rcu/tree_plugin.h:413 rcu_read_lock include/linux/rcupdate.h:847 [inline] batadv_nc_purge_orig_hash net/batman-adv/network-coding.c:408 [inline] batadv_nc_worker+0x114/0x19e0 net/batman-adv/network-coding.c:719 process_one_work kernel/workqueue.c:3229 [inline] process_scheduled_works+0xae0/0x1c40 kernel/workqueue.c:3310 worker_thread+0xea7/0x14f0 kernel/workqueue.c:3391 kthread+0x3e2/0x540 kernel/kthread.c:389 ret_from_fork+0x6d/0x90 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Uninit was created at: __alloc_pages_noprof+0x9a7/0xe00 mm/page_alloc.c:4774 alloc_pages_mpol_noprof+0x299/0x990 mm/mempolicy.c:2265 alloc_pages_noprof+0x1bf/0x1e0 mm/mempolicy.c:2344 alloc_slab_page mm/slub.c:2412 [inline] allocate_slab+0x320/0x12e0 mm/slub.c:2578 new_slab mm/slub.c:2631 [inline] ___slab_alloc+0x12ef/0x35e0 mm/slub.c:3818 __slab_alloc mm/slub.c:3908 [inline] __slab_alloc_node mm/slub.c:3961 [inline] slab_alloc_node mm/slub.c:4122 [inline] kmem_cache_alloc_noprof+0x57a/0xb20 mm/slub.c:4141 inet_twsk_alloc+0x11f/0x9d0 net/ipv4/inet_timewait_sock.c:188 
tcp_time_wait+0x83/0xf50 net/ipv4/tcp_minisocks.c:309 tcp_rcv_state_process+0x145a/0x49d0 tcp_v4_do_rcv+0xbf9/0x11a0 net/ipv4/tcp_ipv4.c:1939 tcp_v4_rcv+0x51df/0x5750 net/ipv4/tcp_ipv4.c:2351 ip_protocol_deliver_rcu+0x2a3/0x13d0 net/ipv4/ip_input.c:205 ip_local_deliver_finish+0x336/0x500 net/ipv4/ip_input.c:233 NF_HOOK include/linux/netfilter.h:314 [inline] ip_local_deliver+0x21f/0x490 net/ipv4/ip_input.c:254 dst_input include/net/dst.h:460 [inline] ip_sublist_rcv_finish net/ipv4/ip_input.c:578 [inline] ip_list_rcv_finish net/ipv4/ip_input.c:628 [inline] ip_sublist_rcv+0x15f3/0x17f0 net/ipv4/ip_input.c:636 ip_list_rcv+0x9ef/0xa40 net/ipv4/ip_input.c:670 __netif_receive_skb_list_ptype net/core/dev.c:5715 [inline] __netif_receive_skb_list_core+0x15c5/0x1670 net/core/dev.c:5762 __netif_receive_skb_list net/core/dev.c:5814 [inline] netif_receive_skb_list_internal+0x1085/0x1700 net/core/dev.c:5905 gro_normal_list include/net/gro.h:515 [inline] napi_complete_done+0x3d4/0x810 net/core/dev.c:6256 virtqueue_napi_complete drivers/net/virtio_net.c:758 [inline] virtnet_poll+0x5d80/0x6bf0 drivers/net/virtio_net.c:3013 __napi_poll+0xe7/0x980 net/core/dev.c:6877 napi_poll net/core/dev.c:6946 [inline] net_rx_action+0xa5a/0x19b0 net/core/dev.c:7068 handle_softirqs+0x1a0/0x7c0 kernel/softirq.c:554 __do_softirq kernel/softirq.c:588 [inline] invoke_softirq kernel/softirq.c:428 [inline] __irq_exit_rcu+0x68/0x180 kernel/softirq.c:655 irq_exit_rcu+0x12/0x20 kernel/softirq.c:671 common_interrupt+0x97/0xb0 arch/x86/kernel/irq.c:278 asm_common_interrupt+0x2b/0x40 arch/x86/include/asm/idtentry.h:693 CPU: 0 UID: 0 PID: 3962 Comm: kworker/u8:18 Not tainted 6.12.0-syzkaller-09073-g9f16d5e6f220 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Workqueue: bat_events batadv_nc_worker Fixes: 79636038d37e ("ipv4: tcp: give socket pointer to control skbs") Fixes: 507a96737d99 ("ipv6: tcp: give socket pointer to control skbs") Reported-by: 
syzbot+8b0959fc16551d55896b@syzkaller.appspotmail.com Link: https://lore.kernel.org/netdev/674442bd.050a0220.1cc393.0072.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Reviewed-by: Brian Vazquez Link: https://patch.msgid.link/20241125093039.3095790-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_minisocks.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index bb1fe1ba867a..7121d8573928 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -326,6 +326,10 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) tcptw->tw_last_oow_ack_time = 0; tcptw->tw_tx_delay = tp->tcp_tx_delay; tw->tw_txhash = sk->sk_txhash; + tw->tw_tx_queue_mapping = sk->sk_tx_queue_mapping; +#ifdef CONFIG_SOCK_RX_QUEUE_MAPPING + tw->tw_rx_queue_mapping = sk->sk_rx_queue_mapping; +#endif #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); -- cgit v1.2.3 From 1596a135e3180c92e42dd1fbcad321f4fb3e3b17 Mon Sep 17 00:00:00 2001 From: Martin Ottens Date: Mon, 25 Nov 2024 18:46:07 +0100 Subject: net/sched: tbf: correct backlog statistic for GSO packets When the length of a GSO packet in the tbf qdisc is larger than the burst size configured the packet will be segmented by the tbf_segment function. Whenever this function is used to enqueue SKBs, the backlog statistic of the tbf is not increased correctly. This can lead to underflows of the 'backlog' byte-statistic value when these packets are dequeued from tbf. Reproduce the bug: Ensure that the sender machine has GSO enabled. Configured the tbf on the outgoing interface of the machine as follows (burstsize = 1 MTU): $ tc qdisc add dev root handle 1: tbf rate 50Mbit burst 1514 latency 50ms Send bulk TCP traffic out via this interface, e.g., by running an iPerf3 client on this machine. 
Check the qdisc statistics: $ tc -s qdisc show dev The 'backlog' byte-statistic has incorrect values while traffic is transferred, e.g., high values due to u32 underflows. When the transfer is stopped, the value is != 0, which should never happen. This patch fixes this bug by updating the statistics correctly, even if single SKBs of a GSO SKB cannot be enqueued. Fixes: e43ac79a4bc6 ("sch_tbf: segment too big GSO packets") Signed-off-by: Martin Ottens Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241125174608.1484356-1-martin.ottens@fau.de Signed-off-by: Jakub Kicinski --- net/sched/sch_tbf.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index f1d09183ae63..dc26b22d53c7 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -208,7 +208,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch, struct tbf_sched_data *q = qdisc_priv(sch); struct sk_buff *segs, *nskb; netdev_features_t features = netif_skb_features(skb); - unsigned int len = 0, prev_len = qdisc_pkt_len(skb); + unsigned int len = 0, prev_len = qdisc_pkt_len(skb), seg_len; int ret, nb; segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); @@ -219,21 +219,27 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch, nb = 0; skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); - qdisc_skb_cb(segs)->pkt_len = segs->len; - len += segs->len; + seg_len = segs->len; + qdisc_skb_cb(segs)->pkt_len = seg_len; ret = qdisc_enqueue(segs, q->qdisc, to_free); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) qdisc_qstats_drop(sch); } else { nb++; + len += seg_len; } } sch->q.qlen += nb; - if (nb > 1) + sch->qstats.backlog += len; + if (nb > 0) { qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len); - consume_skb(skb); - return nb > 0 ? 
NET_XMIT_SUCCESS : NET_XMIT_DROP; + consume_skb(skb); + return NET_XMIT_SUCCESS; + } + + kfree_skb(skb); + return NET_XMIT_DROP; } static int tbf_enqueue(struct sk_buff *skb, struct Qdisc *sch, -- cgit v1.2.3 From b9653d19e556c6afd035602927a93d100a0d7644 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Nov 2024 14:43:44 +0000 Subject: net: hsr: avoid potential out-of-bound access in fill_frame_info() syzbot is able to feed a packet with 14 bytes, pretending it is a vlan one. Since fill_frame_info() is relying on skb->mac_len already, extend the check to cover this case. BUG: KMSAN: uninit-value in fill_frame_info net/hsr/hsr_forward.c:709 [inline] BUG: KMSAN: uninit-value in hsr_forward_skb+0x9ee/0x3b10 net/hsr/hsr_forward.c:724 fill_frame_info net/hsr/hsr_forward.c:709 [inline] hsr_forward_skb+0x9ee/0x3b10 net/hsr/hsr_forward.c:724 hsr_dev_xmit+0x2f0/0x350 net/hsr/hsr_device.c:235 __netdev_start_xmit include/linux/netdevice.h:5002 [inline] netdev_start_xmit include/linux/netdevice.h:5011 [inline] xmit_one net/core/dev.c:3590 [inline] dev_hard_start_xmit+0x247/0xa20 net/core/dev.c:3606 __dev_queue_xmit+0x366a/0x57d0 net/core/dev.c:4434 dev_queue_xmit include/linux/netdevice.h:3168 [inline] packet_xmit+0x9c/0x6c0 net/packet/af_packet.c:276 packet_snd net/packet/af_packet.c:3146 [inline] packet_sendmsg+0x91ae/0xa6f0 net/packet/af_packet.c:3178 sock_sendmsg_nosec net/socket.c:711 [inline] __sock_sendmsg+0x30f/0x380 net/socket.c:726 __sys_sendto+0x594/0x750 net/socket.c:2197 __do_sys_sendto net/socket.c:2204 [inline] __se_sys_sendto net/socket.c:2200 [inline] __x64_sys_sendto+0x125/0x1d0 net/socket.c:2200 x64_sys_call+0x346a/0x3c30 arch/x86/include/generated/asm/syscalls_64.h:45 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x1e0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Uninit was created at: slab_post_alloc_hook mm/slub.c:4091 [inline] slab_alloc_node mm/slub.c:4134 [inline] 
kmem_cache_alloc_node_noprof+0x6bf/0xb80 mm/slub.c:4186 kmalloc_reserve+0x13d/0x4a0 net/core/skbuff.c:587 __alloc_skb+0x363/0x7b0 net/core/skbuff.c:678 alloc_skb include/linux/skbuff.h:1323 [inline] alloc_skb_with_frags+0xc8/0xd00 net/core/skbuff.c:6612 sock_alloc_send_pskb+0xa81/0xbf0 net/core/sock.c:2881 packet_alloc_skb net/packet/af_packet.c:2995 [inline] packet_snd net/packet/af_packet.c:3089 [inline] packet_sendmsg+0x74c6/0xa6f0 net/packet/af_packet.c:3178 sock_sendmsg_nosec net/socket.c:711 [inline] __sock_sendmsg+0x30f/0x380 net/socket.c:726 __sys_sendto+0x594/0x750 net/socket.c:2197 __do_sys_sendto net/socket.c:2204 [inline] __se_sys_sendto net/socket.c:2200 [inline] __x64_sys_sendto+0x125/0x1d0 net/socket.c:2200 x64_sys_call+0x346a/0x3c30 arch/x86/include/generated/asm/syscalls_64.h:45 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x1e0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: 48b491a5cc74 ("net: hsr: fix mac_len checks") Reported-by: syzbot+671e2853f9851d039551@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6745dc7f.050a0220.21d33d.0018.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: WingMan Kwok Cc: Murali Karicheri Cc: MD Danish Anwar Cc: Jiri Pirko Cc: George McCollister Link: https://patch.msgid.link/20241126144344.4177332-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/hsr/hsr_forward.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index aa6acebc7c1e..87bb3a91598e 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -700,6 +700,8 @@ static int fill_frame_info(struct hsr_frame_info *frame, frame->is_vlan = true; if (frame->is_vlan) { + if (skb->mac_len < offsetofend(struct hsr_vlan_ethhdr, vlanhdr)) + return -EINVAL; vlan_hdr = (struct hsr_vlan_ethhdr *)ethhdr; proto = vlan_hdr->vlanhdr.h_vlan_encapsulated_proto; } -- cgit v1.2.3 From c44daa7e3c73229f7ac74985acb8c7fb909c4e0a 
Mon Sep 17 00:00:00 2001 From: Dong Chenchen Date: Wed, 27 Nov 2024 12:08:50 +0800 Subject: net: Fix icmp host relookup triggering ip_rt_bug arp link failure may trigger ip_rt_bug while xfrm is enabled; the call trace is: WARNING: CPU: 0 PID: 0 at net/ipv4/route.c:1241 ip_rt_bug+0x14/0x20 Modules linked in: CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.12.0-rc6-00077-g2e1b3cc9d7f7 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 RIP: 0010:ip_rt_bug+0x14/0x20 Call Trace: ip_send_skb+0x14/0x40 __icmp_send+0x42d/0x6a0 ipv4_link_failure+0xe2/0x1d0 arp_error_report+0x3c/0x50 neigh_invalidate+0x8d/0x100 neigh_timer_handler+0x2e1/0x330 call_timer_fn+0x21/0x120 __run_timer_base.part.0+0x1c9/0x270 run_timer_softirq+0x4c/0x80 handle_softirqs+0xac/0x280 irq_exit_rcu+0x62/0x80 sysvec_apic_timer_interrupt+0x77/0x90 The script below reproduces this scenario: ip xfrm policy add src 0.0.0.0/0 dst 0.0.0.0/0 \ dir out priority 0 ptype main flag localok icmp ip l a veth1 type veth ip a a 192.168.141.111/24 dev veth0 ip l s veth0 up ping 192.168.141.155 -c 1 icmp_route_lookup() creates input routes for locally generated packets during the xfrm relookup of ICMP traffic. Then it will set the input route (dst->out = ip_rt_bug) on the skb for DESTUNREACH. For ICMP errors triggered by locally generated packets, dst->dev of output route is loopback. Generally, xfrm relookup verification is not required on loopback interfaces (net.ipv4.conf.lo.disable_xfrm = 1). Skip icmp relookup for locally generated packets to fix it. 
Fixes: 8b7817f3a959 ("[IPSEC]: Add ICMP host relookup support") Signed-off-by: Dong Chenchen Reviewed-by: David Ahern Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241127040850.1513135-1-dongchenchen2@huawei.com Signed-off-by: Jakub Kicinski --- net/ipv4/icmp.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 4f088fa1c2f2..963a89ae9c26 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -517,6 +517,9 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, if (!IS_ERR(dst)) { if (rt != rt2) return rt; + if (inet_addr_type_dev_table(net, route_lookup_dev, + fl4->daddr) == RTN_LOCAL) + return rt; } else if (PTR_ERR(dst) == -EPERM) { rt = NULL; } else { -- cgit v1.2.3 From a747e02430dfb3657141f99aa6b09331283fa493 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 26 Nov 2024 19:28:27 +0000 Subject: ipv6: avoid possible NULL deref in modify_prefix_route() syzbot found a NULL deref [1] in modify_prefix_route(), caused by one fib6_info without a fib6_table pointer set. 
This can happen for net->ipv6.fib6_null_entry [1] Oops: general protection fault, probably for non-canonical address 0xdffffc0000000006: 0000 [#1] PREEMPT SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000030-0x0000000000000037] CPU: 1 UID: 0 PID: 5837 Comm: syz-executor888 Not tainted 6.12.0-syzkaller-09567-g7eef7e306d3c #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 RIP: 0010:__lock_acquire+0xe4/0x3c40 kernel/locking/lockdep.c:5089 Code: 08 84 d2 0f 85 15 14 00 00 44 8b 0d ca 98 f5 0e 45 85 c9 0f 84 b4 0e 00 00 48 b8 00 00 00 00 00 fc ff df 4c 89 e2 48 c1 ea 03 <80> 3c 02 00 0f 85 96 2c 00 00 49 8b 04 24 48 3d a0 07 7f 93 0f 84 RSP: 0018:ffffc900035d7268 EFLAGS: 00010006 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000006 RSI: 1ffff920006bae5f RDI: 0000000000000030 RBP: 0000000000000000 R08: 0000000000000001 R09: 0000000000000001 R10: ffffffff90608e17 R11: 0000000000000001 R12: 0000000000000030 R13: ffff888036334880 R14: 0000000000000000 R15: 0000000000000000 FS: 0000555579e90380(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ffc59cc4278 CR3: 0000000072b54000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: lock_acquire.part.0+0x11b/0x380 kernel/locking/lockdep.c:5849 __raw_spin_lock_bh include/linux/spinlock_api_smp.h:126 [inline] _raw_spin_lock_bh+0x33/0x40 kernel/locking/spinlock.c:178 spin_lock_bh include/linux/spinlock.h:356 [inline] modify_prefix_route+0x30b/0x8b0 net/ipv6/addrconf.c:4831 inet6_addr_modify net/ipv6/addrconf.c:4923 [inline] inet6_rtm_newaddr+0x12c7/0x1ab0 net/ipv6/addrconf.c:5055 rtnetlink_rcv_msg+0x3c7/0xea0 net/core/rtnetlink.c:6920 netlink_rcv_skb+0x16b/0x440 net/netlink/af_netlink.c:2541 netlink_unicast_kernel net/netlink/af_netlink.c:1321 [inline] 
netlink_unicast+0x53c/0x7f0 net/netlink/af_netlink.c:1347 netlink_sendmsg+0x8b8/0xd70 net/netlink/af_netlink.c:1891 sock_sendmsg_nosec net/socket.c:711 [inline] __sock_sendmsg net/socket.c:726 [inline] ____sys_sendmsg+0xaaf/0xc90 net/socket.c:2583 ___sys_sendmsg+0x135/0x1e0 net/socket.c:2637 __sys_sendmsg+0x16e/0x220 net/socket.c:2669 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x250 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fd1dcef8b79 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 c1 17 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007ffc59cc4378 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fd1dcef8b79 RDX: 0000000000040040 RSI: 0000000020000140 RDI: 0000000000000004 RBP: 00000000000113fd R08: 0000000000000006 R09: 0000000000000006 R10: 0000000000000006 R11: 0000000000000246 R12: 00007ffc59cc438c R13: 431bde82d7b634db R14: 0000000000000001 R15: 0000000000000001 Fixes: 5eb902b8e719 ("net/ipv6: Remove expired routes with a separated list of routes.") Reported-by: syzbot+1de74b0794c40c8eb300@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/67461f7f.050a0220.1286eb.0021.GAE@google.com/T/#u Signed-off-by: Eric Dumazet CC: Kui-Feng Lee Cc: David Ahern Reviewed-by: David Ahern Signed-off-by: David S. 
Miller --- net/ipv6/addrconf.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c489a1e6aec9..0e765466d7f7 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4821,7 +4821,7 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, ifm->ifa_prefixlen, extack); } -static int modify_prefix_route(struct inet6_ifaddr *ifp, +static int modify_prefix_route(struct net *net, struct inet6_ifaddr *ifp, unsigned long expires, u32 flags, bool modify_peer) { @@ -4845,7 +4845,9 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp, ifp->prefix_len, ifp->rt_priority, ifp->idev->dev, expires, flags, GFP_KERNEL); - } else { + return 0; + } + if (f6i != net->ipv6.fib6_null_entry) { table = f6i->fib6_table; spin_lock_bh(&table->tb6_lock); @@ -4858,9 +4860,8 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp, } spin_unlock_bh(&table->tb6_lock); - - fib6_info_release(f6i); } + fib6_info_release(f6i); return 0; } @@ -4939,7 +4940,7 @@ static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp, int rc = -ENOENT; if (had_prefixroute) - rc = modify_prefix_route(ifp, expires, flags, false); + rc = modify_prefix_route(net, ifp, expires, flags, false); /* prefix route could have been deleted; if so restore it */ if (rc == -ENOENT) { @@ -4949,7 +4950,7 @@ static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp, } if (had_prefixroute && !ipv6_addr_any(&ifp->peer_addr)) - rc = modify_prefix_route(ifp, expires, flags, true); + rc = modify_prefix_route(net, ifp, expires, flags, true); if (rc == -ENOENT && !ipv6_addr_any(&ifp->peer_addr)) { addrconf_prefix_route(&ifp->peer_addr, ifp->prefix_len, -- cgit v1.2.3 From a8c695005bfe6569acd73d777ca298ddddd66105 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Tue, 5 Nov 2024 12:48:23 +0300 Subject: can: j1939: j1939_session_new(): fix skb reference counting Since j1939_session_skb_queue() does an 
extra skb_get() for each new skb, do the same for the initial one in j1939_session_new() to avoid refcount underflow. Reported-by: syzbot+d4e8dc385d9258220c31@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=d4e8dc385d9258220c31 Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Signed-off-by: Dmitry Antipov Tested-by: Oleksij Rempel Acked-by: Oleksij Rempel Link: https://patch.msgid.link/20241105094823.2403806-1-dmantipov@yandex.ru [mkl: clean up commit message] Signed-off-by: Marc Kleine-Budde --- net/can/j1939/transport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 319f47df3330..95f7a7e65a73 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -1505,7 +1505,7 @@ static struct j1939_session *j1939_session_new(struct j1939_priv *priv, session->state = J1939_SESSION_NEW; skb_queue_head_init(&session->skb_queue); - skb_queue_tail(&session->skb_queue, skb); + skb_queue_tail(&session->skb_queue, skb_get(skb)); skcb = j1939_skb_to_cb(skb); memcpy(&session->skcb, skcb, sizeof(session->skcb)); -- cgit v1.2.3 From 3301ab7d5aeb0fe270f73a3d4810c9d1b6a9f045 Mon Sep 17 00:00:00 2001 From: Jiri Wiesner Date: Thu, 28 Nov 2024 09:59:50 +0100 Subject: net/ipv6: release expired exception dst cached in socket Dst objects get leaked in ip6_negative_advice() when this function is executed for an expired IPv6 route located in the exception table. 
There are several conditions that must be fulfilled for the leak to occur: * an ICMPv6 packet indicating a change of the MTU for the path is received, resulting in an exception dst being created * a TCP connection that uses the exception dst for routing packets must start timing out so that TCP begins retransmissions * after the exception dst expires, the FIB6 garbage collector must not run before TCP executes ip6_negative_advice() for the expired exception dst When TCP executes ip6_negative_advice() for an exception dst that has expired and if no other socket holds a reference to the exception dst, the refcount of the exception dst is 2, which corresponds to the increment made by dst_init() and the increment made by the TCP socket for which the connection is timing out. The refcount made by the socket is never released. The refcount of the dst is decremented in sk_dst_reset() but that decrement is counteracted by a dst_hold() intentionally placed just before the sk_dst_reset() in ip6_negative_advice(). After ip6_negative_advice() has finished, there is no other object tied to the dst. The socket lost its reference stored in sk_dst_cache and the dst is no longer in the exception table. The exception dst becomes a leaked object. As a result of this dst leak, an unbalanced refcount is reported for the loopback device of a net namespace being destroyed under kernels that do not contain e5f80fcf869a ("ipv6: give an IPv6 dev to blackhole_netdev"): unregister_netdevice: waiting for lo to become free. Usage count = 2 Fix the dst leak by removing the dst_hold() in ip6_negative_advice(). The patch that introduced the dst_hold() in ip6_negative_advice() was 92f1655aa2b22 ("net: fix __dst_negative_advice() race"). But 92f1655aa2b22 merely refactored the code with regards to the dst refcount so the issue was present even before 92f1655aa2b22. 
The bug was introduced in 54c1a859efd9f ("ipv6: Don't drop cache route entry unless timer actually expired.") where the expired cached route is deleted and the sk_dst_cache member of the socket is set to NULL by calling dst_negative_advice() but the refcount belonging to the socket is left unbalanced. The IPv4 version - ipv4_negative_advice() - is not affected by this bug. When the TCP connection times out ipv4_negative_advice() merely resets the sk_dst_cache of the socket while decrementing the refcount of the exception dst. Fixes: 92f1655aa2b22 ("net: fix __dst_negative_advice() race") Fixes: 54c1a859efd9f ("ipv6: Don't drop cache route entry unless timer actually expired.") Link: https://lore.kernel.org/netdev/20241113105611.GA6723@incl/T/#u Signed-off-by: Jiri Wiesner Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241128085950.GA4505@incl Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 63d7681c929f..67ff16c04718 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2780,10 +2780,10 @@ static void ip6_negative_advice(struct sock *sk, if (rt->rt6i_flags & RTF_CACHE) { rcu_read_lock(); if (rt6_check_expired(rt)) { - /* counteract the dst_release() in sk_dst_reset() */ - dst_hold(dst); + /* rt/dst can not be destroyed yet, + * because of rcu_read_lock() + */ sk_dst_reset(sk); - rt6_remove_exception_rt(rt); } rcu_read_unlock(); -- cgit v1.2.3 From 22be4727a8f898442066bcac34f8a1ad0bc72e14 Mon Sep 17 00:00:00 2001 From: Ivan Solodovnikov Date: Tue, 26 Nov 2024 17:39:02 +0300 Subject: dccp: Fix memory leak in dccp_feat_change_recv If dccp_feat_push_confirm() fails after new value for SP feature was accepted without reconciliation ('entry == NULL' branch), memory allocated for that value with dccp_feat_clone_sp_val() is never freed. 
Here is the kmemleak stack for this: unreferenced object 0xffff88801d4ab488 (size 8): comm "syz-executor310", pid 1127, jiffies 4295085598 (age 41.666s) hex dump (first 8 bytes): 01 b4 4a 1d 80 88 ff ff ..J..... backtrace: [<00000000db7cabfe>] kmemdup+0x23/0x50 mm/util.c:128 [<0000000019b38405>] kmemdup include/linux/string.h:465 [inline] [<0000000019b38405>] dccp_feat_clone_sp_val net/dccp/feat.c:371 [inline] [<0000000019b38405>] dccp_feat_clone_sp_val net/dccp/feat.c:367 [inline] [<0000000019b38405>] dccp_feat_change_recv net/dccp/feat.c:1145 [inline] [<0000000019b38405>] dccp_feat_parse_options+0x1196/0x2180 net/dccp/feat.c:1416 [<00000000b1f6d94a>] dccp_parse_options+0xa2a/0x1260 net/dccp/options.c:125 [<0000000030d7b621>] dccp_rcv_state_process+0x197/0x13d0 net/dccp/input.c:650 [<000000001f74c72e>] dccp_v4_do_rcv+0xf9/0x1a0 net/dccp/ipv4.c:688 [<00000000a6c24128>] sk_backlog_rcv include/net/sock.h:1041 [inline] [<00000000a6c24128>] __release_sock+0x139/0x3b0 net/core/sock.c:2570 [<00000000cf1f3a53>] release_sock+0x54/0x1b0 net/core/sock.c:3111 [<000000008422fa23>] inet_wait_for_connect net/ipv4/af_inet.c:603 [inline] [<000000008422fa23>] __inet_stream_connect+0x5d0/0xf70 net/ipv4/af_inet.c:696 [<0000000015b6f64d>] inet_stream_connect+0x53/0xa0 net/ipv4/af_inet.c:735 [<0000000010122488>] __sys_connect_file+0x15c/0x1a0 net/socket.c:1865 [<00000000b4b70023>] __sys_connect+0x165/0x1a0 net/socket.c:1882 [<00000000f4cb3815>] __do_sys_connect net/socket.c:1892 [inline] [<00000000f4cb3815>] __se_sys_connect net/socket.c:1889 [inline] [<00000000f4cb3815>] __x64_sys_connect+0x6e/0xb0 net/socket.c:1889 [<00000000e7b1e839>] do_syscall_64+0x33/0x40 arch/x86/entry/common.c:46 [<0000000055e91434>] entry_SYSCALL_64_after_hwframe+0x67/0xd1 Clean up the allocated memory in case of dccp_feat_push_confirm() failure and bail out with an error reset code. Found by Linux Verification Center (linuxtesting.org) with Syzkaller. 
Fixes: e77b8363b2ea ("dccp: Process incoming Change feature-negotiation options") Signed-off-by: Ivan Solodovnikov Link: https://patch.msgid.link/20241126143902.190853-1-solodovnikov.ia@phystech.edu Signed-off-by: Paolo Abeni --- net/dccp/feat.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/dccp/feat.c b/net/dccp/feat.c index 54086bb05c42..f7554dcdaaba 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -1166,8 +1166,12 @@ static u8 dccp_feat_change_recv(struct list_head *fn, u8 is_mandatory, u8 opt, goto not_valid_or_not_known; } - return dccp_feat_push_confirm(fn, feat, local, &fval); + if (dccp_feat_push_confirm(fn, feat, local, &fval)) { + kfree(fval.sp.vec); + return DCCP_RESET_CODE_TOO_BUSY; + } + return 0; } else if (entry->state == FEAT_UNSTABLE) { /* 6.6.2 */ return 0; } -- cgit v1.2.3 From 6a2fa13312e51a621f652d522d7e2df7066330b6 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 27 Nov 2024 14:05:12 +0900 Subject: tipc: Fix use-after-free of kernel socket in cleanup_bearer(). syzkaller reported a use-after-free of UDP kernel socket in cleanup_bearer() without repro. [0][1] When bearer_disable() calls tipc_udp_disable(), cleanup of the UDP kernel socket is deferred by work calling cleanup_bearer(). tipc_net_stop() waits for such works to finish by checking tipc_net(net)->wq_count. However, the work decrements the count too early before releasing the kernel socket, unblocking cleanup_net() and resulting in use-after-free. Let's move the decrement after releasing the socket in cleanup_bearer(). 
[0]: ref_tracker: net notrefcnt@000000009b3d1faf has 1/1 users at sk_alloc+0x438/0x608 inet_create+0x4c8/0xcb0 __sock_create+0x350/0x6b8 sock_create_kern+0x58/0x78 udp_sock_create4+0x68/0x398 udp_sock_create+0x88/0xc8 tipc_udp_enable+0x5e8/0x848 __tipc_nl_bearer_enable+0x84c/0xed8 tipc_nl_bearer_enable+0x38/0x60 genl_family_rcv_msg_doit+0x170/0x248 genl_rcv_msg+0x400/0x5b0 netlink_rcv_skb+0x1dc/0x398 genl_rcv+0x44/0x68 netlink_unicast+0x678/0x8b0 netlink_sendmsg+0x5e4/0x898 ____sys_sendmsg+0x500/0x830 [1]: BUG: KMSAN: use-after-free in udp_hashslot include/net/udp.h:85 [inline] BUG: KMSAN: use-after-free in udp_lib_unhash+0x3b8/0x930 net/ipv4/udp.c:1979 udp_hashslot include/net/udp.h:85 [inline] udp_lib_unhash+0x3b8/0x930 net/ipv4/udp.c:1979 sk_common_release+0xaf/0x3f0 net/core/sock.c:3820 inet_release+0x1e0/0x260 net/ipv4/af_inet.c:437 inet6_release+0x6f/0xd0 net/ipv6/af_inet6.c:489 __sock_release net/socket.c:658 [inline] sock_release+0xa0/0x210 net/socket.c:686 cleanup_bearer+0x42d/0x4c0 net/tipc/udp_media.c:819 process_one_work kernel/workqueue.c:3229 [inline] process_scheduled_works+0xcaf/0x1c90 kernel/workqueue.c:3310 worker_thread+0xf6c/0x1510 kernel/workqueue.c:3391 kthread+0x531/0x6b0 kernel/kthread.c:389 ret_from_fork+0x60/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:244 Uninit was created at: slab_free_hook mm/slub.c:2269 [inline] slab_free mm/slub.c:4580 [inline] kmem_cache_free+0x207/0xc40 mm/slub.c:4682 net_free net/core/net_namespace.c:454 [inline] cleanup_net+0x16f2/0x19d0 net/core/net_namespace.c:647 process_one_work kernel/workqueue.c:3229 [inline] process_scheduled_works+0xcaf/0x1c90 kernel/workqueue.c:3310 worker_thread+0xf6c/0x1510 kernel/workqueue.c:3391 kthread+0x531/0x6b0 kernel/kthread.c:389 ret_from_fork+0x60/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x11/0x20 arch/x86/entry/entry_64.S:244 CPU: 0 UID: 0 PID: 54 Comm: kworker/0:2 Not tainted 6.12.0-rc1-00131-gf66ebf37d69c #7 
91723d6f74857f70725e1583cba3cf4adc716cfa Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014 Workqueue: events cleanup_bearer Fixes: 26abe14379f8 ("net: Modify sk_alloc to not reference count the netns of kernel sockets.") Reported-by: syzkaller Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241127050512.28438-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/tipc/udp_media.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 439f75539977..b7e25e7e9933 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -814,10 +814,10 @@ static void cleanup_bearer(struct work_struct *work) kfree_rcu(rcast, rcu); } - atomic_dec(&tipc_net(sock_net(ub->ubsock->sk))->wq_count); dst_cache_destroy(&ub->rcast.dst_cache); udp_tunnel_sock_release(ub->ubsock); synchronize_net(); + atomic_dec(&tipc_net(sock_net(ub->ubsock->sk))->wq_count); kfree(ub); } -- cgit v1.2.3 From 0541db8ee32c09463a72d0987382b3a3336b0043 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Wed, 27 Nov 2024 21:30:13 +0800 Subject: net/smc: initialize close_work early to avoid warning We encountered a warning that close_work was canceled before initialization. WARNING: CPU: 7 PID: 111103 at kernel/workqueue.c:3047 __flush_work+0x19e/0x1b0 Workqueue: events smc_lgr_terminate_work [smc] RIP: 0010:__flush_work+0x19e/0x1b0 Call Trace: ? __wake_up_common+0x7a/0x190 ? work_busy+0x80/0x80 __cancel_work_timer+0xe3/0x160 smc_close_cancel_work+0x1a/0x70 [smc] smc_close_active_abort+0x207/0x360 [smc] __smc_lgr_terminate.part.38+0xc8/0x180 [smc] process_one_work+0x19e/0x340 worker_thread+0x30/0x370 ? process_one_work+0x340/0x340 kthread+0x117/0x130 ? __kthread_cancel_work+0x50/0x50 ret_from_fork+0x22/0x30 This is because when smc_close_cancel_work is triggered, e.g. 
the RDMA driver is rmmod and the LGR is terminated, the conn->close_work is flushed before initialization, resulting in WARN_ON(!work->func). __smc_lgr_terminate | smc_connect_{rdma|ism} ------------------------------------------------------------- | smc_conn_create | \- smc_lgr_register_conn for conn in lgr->conns_all | \- smc_conn_kill | \- smc_close_active_abort | \- smc_close_cancel_work | \- cancel_work_sync | \- __flush_work | (close_work) | | smc_close_init | \- INIT_WORK(&close_work) So fix this by initializing close_work before establishing the connection. Fixes: 46c28dbd4c23 ("net/smc: no socket state changes in tasklet context") Fixes: 413498440e30 ("net/smc: add SMC-D support in af_smc") Signed-off-by: Wen Gu Reviewed-by: Wenjia Zhang Reviewed-by: Alexandra Winter Signed-off-by: Paolo Abeni --- net/smc/af_smc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9d76e902fd77..ed6d4d520bc7 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -383,6 +383,7 @@ void smc_sk_init(struct net *net, struct sock *sk, int protocol) smc->limit_smc_hs = net->smc.limit_smc_hs; smc->use_fallback = false; /* assume rdma capability first */ smc->fallback_rsn = 0; + smc_close_init(smc); } static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, @@ -1299,7 +1300,6 @@ static int smc_connect_rdma(struct smc_sock *smc, goto connect_abort; } - smc_close_init(smc); smc_rx_init(smc); if (ini->first_contact_local) { @@ -1435,7 +1435,6 @@ static int smc_connect_ism(struct smc_sock *smc, goto connect_abort; } } - smc_close_init(smc); smc_rx_init(smc); smc_tx_init(smc); @@ -2479,7 +2478,6 @@ static void smc_listen_work(struct work_struct *work) goto out_decl; mutex_lock(&smc_server_lgr_pending); - smc_close_init(new_smc); smc_rx_init(new_smc); smc_tx_init(new_smc); -- cgit v1.2.3 From 2c7f14ed9c19ec0f149479d1c2842ec1f9bf76d7 Mon Sep 17 00:00:00 2001 From: Wen Gu Date: Wed, 27 Nov 2024 
21:30:14 +0800 Subject: net/smc: fix LGR and link use-after-free issue We encountered a LGR/link use-after-free issue, which manifested as the LGR/link refcnt reaching 0 early and entering the clear process, making resource access unsafe. refcount_t: addition on 0; use-after-free. WARNING: CPU: 14 PID: 107447 at lib/refcount.c:25 refcount_warn_saturate+0x9c/0x140 Workqueue: events smc_lgr_terminate_work [smc] Call trace: refcount_warn_saturate+0x9c/0x140 __smc_lgr_terminate.part.45+0x2a8/0x370 [smc] smc_lgr_terminate_work+0x28/0x30 [smc] process_one_work+0x1b8/0x420 worker_thread+0x158/0x510 kthread+0x114/0x118 or refcount_t: underflow; use-after-free. WARNING: CPU: 6 PID: 93140 at lib/refcount.c:28 refcount_warn_saturate+0xf0/0x140 Workqueue: smc_hs_wq smc_listen_work [smc] Call trace: refcount_warn_saturate+0xf0/0x140 smcr_link_put+0x1cc/0x1d8 [smc] smc_conn_free+0x110/0x1b0 [smc] smc_conn_abort+0x50/0x60 [smc] smc_listen_find_device+0x75c/0x790 [smc] smc_listen_work+0x368/0x8a0 [smc] process_one_work+0x1b8/0x420 worker_thread+0x158/0x510 kthread+0x114/0x118 It is caused by repeated release of LGR/link refcnt. One suspect is that smc_conn_free() is called repeatedly because some smc_conn_free() from server listening path are not protected by sock lock. e.g. Calls under socklock | smc_listen_work ------------------------------------------------------- lock_sock(sk) | smc_conn_abort smc_conn_free | \- smc_conn_free \- smcr_link_put | \- smcr_link_put (duplicated) release_sock(sk) So here add sock lock protection in smc_listen_work() path, making it exclusive with other connection operations. 
Fixes: 3b2dec2603d5 ("net/smc: restructure client and server code in af_smc") Co-developed-by: Guangguan Wang Signed-off-by: Guangguan Wang Co-developed-by: Kai Signed-off-by: Kai Signed-off-by: Wen Gu Reviewed-by: Wenjia Zhang Signed-off-by: Paolo Abeni --- net/smc/af_smc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index ed6d4d520bc7..9e6c69d18581 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1900,6 +1900,7 @@ static void smc_listen_out(struct smc_sock *new_smc) if (tcp_sk(new_smc->clcsock->sk)->syn_smc) atomic_dec(&lsmc->queued_smc_hs); + release_sock(newsmcsk); /* lock in smc_listen_work() */ if (lsmc->sk.sk_state == SMC_LISTEN) { lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING); smc_accept_enqueue(&lsmc->sk, newsmcsk); @@ -2421,6 +2422,7 @@ static void smc_listen_work(struct work_struct *work) u8 accept_version; int rc = 0; + lock_sock(&new_smc->sk); /* release in smc_listen_out() */ if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN) return smc_listen_out_err(new_smc); -- cgit v1.2.3 From 48327566769a6ff2e873b6bf075392bd756625ca Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 29 Nov 2024 13:25:19 -0800 Subject: rtnetlink: fix double call of rtnl_link_get_net_ifla() Currently rtnl_link_get_net_ifla() gets called twice when we create peer devices, once in rtnl_add_peer_net() and once in each ->newlink() implementation. This looks safer, however, it leads to a classic Time-of-Check to Time-of-Use (TOCTOU) bug since IFLA_NET_NS_PID is very dynamic. And because of the lack of checking error pointer of the second call, it also leads to a kernel crash as reported by syzbot. Fix this by getting rid of the second call, which already becomes redundant after Kuniyuki's work. We have to propagate the result of the first rtnl_link_get_net_ifla() down to each ->newlink(). 
Reported-by: syzbot+21ba4d5adff0b6a7cfc6@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=21ba4d5adff0b6a7cfc6 Fixes: 0eb87b02a705 ("veth: Set VETH_INFO_PEER to veth_link_ops.peer_type.") Fixes: 6b84e558e95d ("vxcan: Set VXCAN_INFO_PEER to vxcan_link_ops.peer_type.") Fixes: fefd5d082172 ("netkit: Set IFLA_NETKIT_PEER_INFO to netkit_link_ops.peer_type.") Cc: Kuniyuki Iwashima Signed-off-by: Cong Wang Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241129212519.825567-1-xiyou.wangcong@gmail.com Signed-off-by: Paolo Abeni --- net/core/rtnetlink.c | 44 +++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 58df76fe408a..ab5f201bf0ab 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3746,6 +3746,7 @@ static int rtnl_group_changelink(const struct sk_buff *skb, static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, const struct rtnl_link_ops *ops, struct net *tgt_net, struct net *link_net, + struct net *peer_net, const struct nlmsghdr *nlh, struct nlattr **tb, struct nlattr **data, struct netlink_ext_ack *extack) @@ -3776,8 +3777,13 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, dev->ifindex = ifm->ifi_index; + if (link_net) + net = link_net; + if (peer_net) + net = peer_net; + if (ops->newlink) - err = ops->newlink(link_net ? 
: net, dev, tb, data, extack); + err = ops->newlink(net, dev, tb, data, extack); else err = register_netdevice(dev); if (err < 0) { @@ -3812,40 +3818,33 @@ out_unregister: goto out; } -static int rtnl_add_peer_net(struct rtnl_nets *rtnl_nets, - const struct rtnl_link_ops *ops, - struct nlattr *data[], - struct netlink_ext_ack *extack) +static struct net *rtnl_get_peer_net(const struct rtnl_link_ops *ops, + struct nlattr *data[], + struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_MAX + 1]; - struct net *net; int err; if (!data || !data[ops->peer_type]) - return 0; + return NULL; err = rtnl_nla_parse_ifinfomsg(tb, data[ops->peer_type], extack); if (err < 0) - return err; + return ERR_PTR(err); if (ops->validate) { err = ops->validate(tb, NULL, extack); if (err < 0) - return err; + return ERR_PTR(err); } - net = rtnl_link_get_net_ifla(tb); - if (IS_ERR(net)) - return PTR_ERR(net); - if (net) - rtnl_nets_add(rtnl_nets, net); - - return 0; + return rtnl_link_get_net_ifla(tb); } static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, const struct rtnl_link_ops *ops, struct net *tgt_net, struct net *link_net, + struct net *peer_net, struct rtnl_newlink_tbs *tbs, struct nlattr **data, struct netlink_ext_ack *extack) @@ -3894,14 +3893,15 @@ static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, return -EOPNOTSUPP; } - return rtnl_newlink_create(skb, ifm, ops, tgt_net, link_net, nlh, tb, data, extack); + return rtnl_newlink_create(skb, ifm, ops, tgt_net, link_net, peer_net, nlh, + tb, data, extack); } static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct net *tgt_net, *link_net = NULL, *peer_net = NULL; struct nlattr **tb, **linkinfo, **data = NULL; - struct net *tgt_net, *link_net = NULL; struct rtnl_link_ops *ops = NULL; struct rtnl_newlink_tbs *tbs; struct rtnl_nets rtnl_nets; @@ -3971,9 +3971,11 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, } if 
(ops->peer_type) { - ret = rtnl_add_peer_net(&rtnl_nets, ops, data, extack); - if (ret < 0) + peer_net = rtnl_get_peer_net(ops, data, extack); + if (IS_ERR(peer_net)) goto put_ops; + if (peer_net) + rtnl_nets_add(&rtnl_nets, peer_net); } } @@ -4004,7 +4006,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, } rtnl_nets_lock(&rtnl_nets); - ret = __rtnl_newlink(skb, nlh, ops, tgt_net, link_net, tbs, data, extack); + ret = __rtnl_newlink(skb, nlh, ops, tgt_net, link_net, peer_net, tbs, data, extack); rtnl_nets_unlock(&rtnl_nets); put_net: -- cgit v1.2.3 From af8edaeddbc52e53207d859c912b017fd9a77629 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 2 Dec 2024 10:05:58 +0000 Subject: net: hsr: must allocate more bytes for RedBox support Blamed commit forgot to change hsr_init_skb() to allocate larger skb for RedBox case. Indeed, send_hsr_supervision_frame() will add two additional components (struct hsr_sup_tlv and struct hsr_sup_payload) syzbot reported the following crash: skbuff: skb_over_panic: text:ffffffff8afd4b0a len:34 put:6 head:ffff88802ad29e00 data:ffff88802ad29f22 tail:0x144 end:0x140 dev:gretap0 ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:206 ! 
Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN NOPTI CPU: 2 UID: 0 PID: 7611 Comm: syz-executor Not tainted 6.12.0-syzkaller #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 RIP: 0010:skb_panic+0x157/0x1d0 net/core/skbuff.c:206 Code: b6 04 01 84 c0 74 04 3c 03 7e 21 8b 4b 70 41 56 45 89 e8 48 c7 c7 a0 7d 9b 8c 41 57 56 48 89 ee 52 4c 89 e2 e8 9a 76 79 f8 90 <0f> 0b 4c 89 4c 24 10 48 89 54 24 08 48 89 34 24 e8 94 76 fb f8 4c RSP: 0018:ffffc90000858ab8 EFLAGS: 00010282 RAX: 0000000000000087 RBX: ffff8880598c08c0 RCX: ffffffff816d3e69 RDX: 0000000000000000 RSI: ffffffff816de786 RDI: 0000000000000005 RBP: ffffffff8c9b91c0 R08: 0000000000000005 R09: 0000000000000000 R10: 0000000000000302 R11: ffffffff961cc1d0 R12: ffffffff8afd4b0a R13: 0000000000000006 R14: ffff88804b938130 R15: 0000000000000140 FS: 000055558a3d6500(0000) GS:ffff88806a800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f1295974ff8 CR3: 000000002ab6e000 CR4: 0000000000352ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: skb_over_panic net/core/skbuff.c:211 [inline] skb_put+0x174/0x1b0 net/core/skbuff.c:2617 send_hsr_supervision_frame+0x6fa/0x9e0 net/hsr/hsr_device.c:342 hsr_proxy_announce+0x1a3/0x4a0 net/hsr/hsr_device.c:436 call_timer_fn+0x1a0/0x610 kernel/time/timer.c:1794 expire_timers kernel/time/timer.c:1845 [inline] __run_timers+0x6e8/0x930 kernel/time/timer.c:2419 __run_timer_base kernel/time/timer.c:2430 [inline] __run_timer_base kernel/time/timer.c:2423 [inline] run_timer_base+0x111/0x190 kernel/time/timer.c:2439 run_timer_softirq+0x1a/0x40 kernel/time/timer.c:2449 handle_softirqs+0x213/0x8f0 kernel/softirq.c:554 __do_softirq kernel/softirq.c:588 [inline] invoke_softirq kernel/softirq.c:428 [inline] __irq_exit_rcu kernel/softirq.c:637 [inline] irq_exit_rcu+0xbb/0x120 kernel/softirq.c:649 
instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1049 [inline] sysvec_apic_timer_interrupt+0xa4/0xc0 arch/x86/kernel/apic/apic.c:1049 Fixes: 5055cccfc2d1 ("net: hsr: Provide RedBox support (HSR-SAN)") Reported-by: syzbot+7f4643b267cc680bfa1c@syzkaller.appspotmail.com Signed-off-by: Eric Dumazet Cc: Lukasz Majewski Link: https://patch.msgid.link/20241202100558.507765-1-edumazet@google.com Signed-off-by: Paolo Abeni --- net/hsr/hsr_device.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 31a416ee21ad..03eadd6c51fd 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -246,20 +246,22 @@ static const struct header_ops hsr_header_ops = { .parse = eth_header_parse, }; -static struct sk_buff *hsr_init_skb(struct hsr_port *master) +static struct sk_buff *hsr_init_skb(struct hsr_port *master, int extra) { struct hsr_priv *hsr = master->hsr; struct sk_buff *skb; int hlen, tlen; + int len; hlen = LL_RESERVED_SPACE(master->dev); tlen = master->dev->needed_tailroom; + len = sizeof(struct hsr_sup_tag) + sizeof(struct hsr_sup_payload); /* skb size is same for PRP/HSR frames, only difference * being, for PRP it is a trailer and for HSR it is a - * header + * header. + * RedBox might use @extra more bytes. 
*/ - skb = dev_alloc_skb(sizeof(struct hsr_sup_tag) + - sizeof(struct hsr_sup_payload) + hlen + tlen); + skb = dev_alloc_skb(len + extra + hlen + tlen); if (!skb) return skb; @@ -295,6 +297,7 @@ static void send_hsr_supervision_frame(struct hsr_port *port, struct hsr_sup_tlv *hsr_stlv; struct hsr_sup_tag *hsr_stag; struct sk_buff *skb; + int extra = 0; *interval = msecs_to_jiffies(HSR_LIFE_CHECK_INTERVAL); if (hsr->announce_count < 3 && hsr->prot_version == 0) { @@ -303,7 +306,11 @@ static void send_hsr_supervision_frame(struct hsr_port *port, hsr->announce_count++; } - skb = hsr_init_skb(port); + if (hsr->redbox) + extra = sizeof(struct hsr_sup_tlv) + + sizeof(struct hsr_sup_payload); + + skb = hsr_init_skb(port, extra); if (!skb) { netdev_warn_once(port->dev, "HSR: Could not send supervision frame\n"); return; @@ -362,7 +369,7 @@ static void send_prp_supervision_frame(struct hsr_port *master, struct hsr_sup_tag *hsr_stag; struct sk_buff *skb; - skb = hsr_init_skb(master); + skb = hsr_init_skb(master, 0); if (!skb) { netdev_warn_once(master->dev, "PRP: Could not send supervision frame\n"); return; -- cgit v1.2.3 From 7b1d83da254be3bf054965c8f3b1ad976f460ae5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 27 Nov 2024 12:46:54 +0100 Subject: netfilter: nft_inner: incorrect percpu area handling under softirq Softirq can interrupt ongoing packet from process context that is walking over the percpu area that contains inner header offsets. Disable bh and perform three checks before restoring the percpu inner header offsets to validate that the percpu area is valid for this skbuff: 1) If the NFT_PKTINFO_INNER_FULL flag is set on, then this skbuff has already been parsed before for inner header fetching to register. 2) Validate that the percpu area refers to this skbuff using the skbuff pointer as a cookie. If there is a cookie mismatch, then this skbuff needs to be parsed again. 3) Finally, validate if the percpu area refers to this tunnel type. 
Only after these three checks the percpu area is restored to an on-stack copy and bh is enabled again. After inner header fetching, the on-stack copy is stored back to the percpu area. Fixes: 3a07327d10a0 ("netfilter: nft_inner: support for inner tunnel header matching") Reported-by: syzbot+84d0441b9860f0d63285@syzkaller.appspotmail.com Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_inner.c | 57 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c index 928312d01eb1..817ab978d24a 100644 --- a/net/netfilter/nft_inner.c +++ b/net/netfilter/nft_inner.c @@ -210,35 +210,66 @@ static int nft_inner_parse(const struct nft_inner *priv, struct nft_pktinfo *pkt, struct nft_inner_tun_ctx *tun_ctx) { - struct nft_inner_tun_ctx ctx = {}; u32 off = pkt->inneroff; if (priv->flags & NFT_INNER_HDRSIZE && - nft_inner_parse_tunhdr(priv, pkt, &ctx, &off) < 0) + nft_inner_parse_tunhdr(priv, pkt, tun_ctx, &off) < 0) return -1; if (priv->flags & (NFT_INNER_LL | NFT_INNER_NH)) { - if (nft_inner_parse_l2l3(priv, pkt, &ctx, off) < 0) + if (nft_inner_parse_l2l3(priv, pkt, tun_ctx, off) < 0) return -1; } else if (priv->flags & NFT_INNER_TH) { - ctx.inner_thoff = off; - ctx.flags |= NFT_PAYLOAD_CTX_INNER_TH; + tun_ctx->inner_thoff = off; + tun_ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH; } - *tun_ctx = ctx; tun_ctx->type = priv->type; + tun_ctx->cookie = (unsigned long)pkt->skb; pkt->flags |= NFT_PKTINFO_INNER_FULL; return 0; } +static bool nft_inner_restore_tun_ctx(const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *tun_ctx) +{ + struct nft_inner_tun_ctx *this_cpu_tun_ctx; + + local_bh_disable(); + this_cpu_tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx); + if (this_cpu_tun_ctx->cookie != (unsigned long)pkt->skb) { + local_bh_enable(); + return false; + } + *tun_ctx = *this_cpu_tun_ctx; + local_bh_enable(); + +