From eb09fbeb48709fe66c0d708aed81e910a577a30a Mon Sep 17 00:00:00 2001 From: Lizhi Xu Date: Wed, 13 Nov 2024 17:51:29 +0800 Subject: mac802154: check local interfaces before deleting sdata list syzkaller reported a corrupted list in ieee802154_if_remove. [1] Remove an IEEE 802.15.4 network interface after unregister an IEEE 802.15.4 hardware device from the system. CPU0 CPU1 ==== ==== genl_family_rcv_msg_doit ieee802154_unregister_hw ieee802154_del_iface ieee802154_remove_interfaces rdev_del_virtual_intf_deprecated list_del(&sdata->list) ieee802154_if_remove list_del_rcu The net device has been unregistered, since the rcu grace period, unregistration must be run before ieee802154_if_remove. To avoid this issue, add a check for local->interfaces before deleting sdata list. [1] kernel BUG at lib/list_debug.c:58! Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 0 UID: 0 PID: 6277 Comm: syz-executor157 Not tainted 6.12.0-rc6-syzkaller-00005-g557329bcecc2 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 RIP: 0010:__list_del_entry_valid_or_report+0xf4/0x140 lib/list_debug.c:56 Code: e8 a1 7e 00 07 90 0f 0b 48 c7 c7 e0 37 60 8c 4c 89 fe e8 8f 7e 00 07 90 0f 0b 48 c7 c7 40 38 60 8c 4c 89 fe e8 7d 7e 00 07 90 <0f> 0b 48 c7 c7 a0 38 60 8c 4c 89 fe e8 6b 7e 00 07 90 0f 0b 48 c7 RSP: 0018:ffffc9000490f3d0 EFLAGS: 00010246 RAX: 000000000000004e RBX: dead000000000122 RCX: d211eee56bb28d00 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: ffff88805b278dd8 R08: ffffffff8174a12c R09: 1ffffffff2852f0d R10: dffffc0000000000 R11: fffffbfff2852f0e R12: dffffc0000000000 R13: dffffc0000000000 R14: dead000000000100 R15: ffff88805b278cc0 FS: 0000555572f94380(0000) GS:ffff8880b8600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000056262e4a3000 CR3: 0000000078496000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __list_del_entry_valid include/linux/list.h:124 [inline] __list_del_entry include/linux/list.h:215 [inline] list_del_rcu include/linux/rculist.h:157 [inline] ieee802154_if_remove+0x86/0x1e0 net/mac802154/iface.c:687 rdev_del_virtual_intf_deprecated net/ieee802154/rdev-ops.h:24 [inline] ieee802154_del_iface+0x2c0/0x5c0 net/ieee802154/nl-phy.c:323 genl_family_rcv_msg_doit net/netlink/genetlink.c:1115 [inline] genl_family_rcv_msg net/netlink/genetlink.c:1195 [inline] genl_rcv_msg+0xb14/0xec0 net/netlink/genetlink.c:1210 netlink_rcv_skb+0x1e3/0x430 net/netlink/af_netlink.c:2551 genl_rcv+0x28/0x40 net/netlink/genetlink.c:1219 netlink_unicast_kernel net/netlink/af_netlink.c:1331 [inline] netlink_unicast+0x7f6/0x990 net/netlink/af_netlink.c:1357 netlink_sendmsg+0x8e4/0xcb0 net/netlink/af_netlink.c:1901 sock_sendmsg_nosec net/socket.c:729 [inline] __sock_sendmsg+0x221/0x270 net/socket.c:744 ____sys_sendmsg+0x52a/0x7e0 net/socket.c:2607 ___sys_sendmsg net/socket.c:2661 [inline] __sys_sendmsg+0x292/0x380 net/socket.c:2690 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Reported-and-tested-by: syzbot+985f827280dc3a6e7e92@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=985f827280dc3a6e7e92 Signed-off-by: Lizhi Xu Reviewed-by: Miquel Raynal Link: https://lore.kernel.org/20241113095129.1457225-1-lizhi.xu@windriver.com Signed-off-by: Stefan Schmidt --- net/mac802154/iface.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index c0e2da5072be..9e4631fade90 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -684,6 +684,10 @@ void ieee802154_if_remove(struct ieee802154_sub_if_data *sdata) ASSERT_RTNL(); mutex_lock(&sdata->local->iflist_mtx); + if (list_empty(&sdata->local->interfaces)) { + mutex_unlock(&sdata->local->iflist_mtx); + return; + } list_del_rcu(&sdata->list); mutex_unlock(&sdata->local->iflist_mtx); -- cgit v1.2.3 From 2b33eb8f1b3e8c2f87cfdbc8cc117f6bdfabc6ec Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 11 Dec 2024 17:21:16 +0800 Subject: net/smc: protect link down work from execute after lgr freed link down work may be scheduled before lgr freed but execute after lgr freed, which may result in crash. So it is need to hold a reference before shedule link down work, and put the reference after work executed or canceled. The relevant crash call stack as follows: list_del corruption. prev->next should be ffffb638c9c0fe20, but was 0000000000000000 ------------[ cut here ]------------ kernel BUG at lib/list_debug.c:51! invalid opcode: 0000 [#1] SMP NOPTI CPU: 6 PID: 978112 Comm: kworker/6:119 Kdump: loaded Tainted: G #1 Hardware name: Alibaba Cloud Alibaba Cloud ECS, BIOS 2221b89 04/01/2014 Workqueue: events smc_link_down_work [smc] RIP: 0010:__list_del_entry_valid.cold+0x31/0x47 RSP: 0018:ffffb638c9c0fdd8 EFLAGS: 00010086 RAX: 0000000000000054 RBX: ffff942fb75e5128 RCX: 0000000000000000 RDX: ffff943520930aa0 RSI: ffff94352091fc80 RDI: ffff94352091fc80 RBP: 0000000000000000 R08: 0000000000000000 R09: ffffb638c9c0fc38 R10: ffffb638c9c0fc30 R11: ffffffffa015eb28 R12: 0000000000000002 R13: ffffb638c9c0fe20 R14: 0000000000000001 R15: ffff942f9cd051c0 FS: 0000000000000000(0000) GS:ffff943520900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f4f25214000 CR3: 000000025fbae004 CR4: 00000000007706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: rwsem_down_write_slowpath+0x17e/0x470 smc_link_down_work+0x3c/0x60 [smc] process_one_work+0x1ac/0x350 worker_thread+0x49/0x2f0 ? rescuer_thread+0x360/0x360 kthread+0x118/0x140 ? __kthread_bind_mask+0x60/0x60 ret_from_fork+0x1f/0x30 Fixes: 541afa10c126 ("net/smc: add smcr_port_err() and smcr_link_down() processing") Signed-off-by: Guangguan Wang Reviewed-by: Tony Lu Signed-off-by: David S. Miller --- net/smc/smc_core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 500952c2e67b..3b125d348b4a 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1818,7 +1818,9 @@ void smcr_link_down_cond_sched(struct smc_link *lnk) { if (smc_link_downing(&lnk->state)) { trace_smcr_link_down(lnk, __builtin_return_address(0)); - schedule_work(&lnk->link_down_wrk); + smcr_link_hold(lnk); /* smcr_link_put in link_down_wrk */ + if (!schedule_work(&lnk->link_down_wrk)) + smcr_link_put(lnk); } } @@ -1850,11 +1852,14 @@ static void smc_link_down_work(struct work_struct *work) struct smc_link_group *lgr = link->lgr; if (list_empty(&lgr->list)) - return; + goto out; wake_up_all(&lgr->llc_msg_waiter); down_write(&lgr->llc_conf_mutex); smcr_link_down(link); up_write(&lgr->llc_conf_mutex); + +out: + smcr_link_put(link); /* smcr_link_hold by schedulers of link_down_work */ } static int smc_vlan_by_tcpsk_walk(struct net_device *lower_dev, -- cgit v1.2.3 From 679e9ddcf90dbdf98aaaa71a492454654b627bcb Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 11 Dec 2024 17:21:17 +0800 Subject: net/smc: check sndbuf_space again after NOSPACE flag is set in smc_poll When application sending data more than sndbuf_space, there have chances application will sleep in epoll_wait, and will never be wakeup again. This is caused by a race between smc_poll and smc_cdc_tx_handler. application tasklet smc_tx_sendmsg(len > sndbuf_space) | epoll_wait for EPOLL_OUT,timeout=0 | smc_poll | if (!smc->conn.sndbuf_space) | | smc_cdc_tx_handler | atomic_add sndbuf_space | smc_tx_sndbuf_nonfull | if (!test_bit SOCK_NOSPACE) | do not sk_write_space; set_bit SOCK_NOSPACE; | return mask=0; | Application will sleep in epoll_wait as smc_poll returns 0. And smc_cdc_tx_handler will not call sk_write_space because the SOCK_NOSPACE has not be set. If there is no inflight cdc msg, sk_write_space will not be called any more, and application will sleep in epoll_wait forever. So check sndbuf_space again after NOSPACE flag is set to break the race. Fixes: 8dce2786a290 ("net/smc: smc_poll improvements") Signed-off-by: Guangguan Wang Suggested-by: Paolo Abeni Signed-off-by: David S. Miller --- net/smc/af_smc.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9e6c69d18581..92448f2c362c 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2881,6 +2881,13 @@ __poll_t smc_poll(struct file *file, struct socket *sock, } else { sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + + if (sk->sk_state != SMC_INIT) { + /* Race breaker the same way as tcp_poll(). */ + smp_mb__after_atomic(); + if (atomic_read(&smc->conn.sndbuf_space)) + mask |= EPOLLOUT | EPOLLWRNORM; + } } if (atomic_read(&smc->conn.bytes_to_rcv)) mask |= EPOLLIN | EPOLLRDNORM; -- cgit v1.2.3 From a29e220d3c8edbf0e1beb0f028878a4a85966556 Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 11 Dec 2024 17:21:18 +0800 Subject: net/smc: check iparea_offset and ipv6_prefixes_cnt when receiving proposal msg When receiving proposal msg in server, the field iparea_offset and the field ipv6_prefixes_cnt in proposal msg are from the remote client and can not be fully trusted. Especially the field iparea_offset, once exceed the max value, there has the chance to access wrong address, and crash may happen. This patch checks iparea_offset and ipv6_prefixes_cnt before using them. Fixes: e7b7a64a8493 ("smc: support variable CLC proposal messages") Signed-off-by: Guangguan Wang Reviewed-by: Wen Gu Reviewed-by: D. Wythe Signed-off-by: David S. Miller --- net/smc/af_smc.c | 6 +++++- net/smc/smc_clc.c | 4 ++++ net/smc/smc_clc.h | 6 +++++- 3 files changed, 14 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 92448f2c362c..9a74c9693f09 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2032,6 +2032,8 @@ static int smc_listen_prfx_check(struct smc_sock *new_smc, if (pclc->hdr.typev1 == SMC_TYPE_N) return 0; pclc_prfx = smc_clc_proposal_get_prefix(pclc); + if (!pclc_prfx) + return -EPROTO; if (smc_clc_prfx_match(newclcsock, pclc_prfx)) return SMC_CLC_DECL_DIFFPREFIX; @@ -2221,7 +2223,9 @@ static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc, int rc = 0; /* check if ISM V1 is available */ - if (!(ini->smcd_version & SMC_V1) || !smcd_indicated(ini->smc_type_v1)) + if (!(ini->smcd_version & SMC_V1) || + !smcd_indicated(ini->smc_type_v1) || + !pclc_smcd) goto not_found; ini->is_smcd = true; /* prepare ISM check */ ini->ism_peer_gid[0].gid = ntohll(pclc_smcd->ism.gid); diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 33fa787c28eb..66a43b97eede 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -354,6 +354,10 @@ static bool smc_clc_msg_prop_valid(struct smc_clc_msg_proposal *pclc) v2_ext = smc_get_clc_v2_ext(pclc); pclc_prfx = smc_clc_proposal_get_prefix(pclc); + if (!pclc_prfx || + pclc_prfx->ipv6_prefixes_cnt > SMC_CLC_MAX_V6_PREFIX) + return false; + if (hdr->version == SMC_V1) { if (hdr->typev1 == SMC_TYPE_N) return false; diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 5fd6f5b8ef03..ac8de6a177fa 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -336,8 +336,12 @@ struct smc_clc_msg_decline_v2 { /* clc decline message */ static inline struct smc_clc_msg_proposal_prefix * smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc) { + u16 offset = ntohs(pclc->iparea_offset); + + if (offset > sizeof(struct smc_clc_msg_smcd)) + return NULL; return (struct smc_clc_msg_proposal_prefix *) - ((u8 *)pclc + sizeof(*pclc) + ntohs(pclc->iparea_offset)); + ((u8 *)pclc + sizeof(*pclc) + offset); } static inline bool smcr_indicated(int smc_type) -- cgit v1.2.3 From 7863c9f3d24ba49dbead7e03dfbe40deb5888fdf Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 11 Dec 2024 17:21:19 +0800 Subject: net/smc: check v2_ext_offset/eid_cnt/ism_gid_cnt when receiving proposal msg When receiving proposal msg in server, the fields v2_ext_offset/ eid_cnt/ism_gid_cnt in proposal msg are from the remote client and can not be fully trusted. Especially the field v2_ext_offset, once exceed the max value, there has the chance to access wrong address, and crash may happen. This patch checks the fields v2_ext_offset/eid_cnt/ism_gid_cnt before using them. Fixes: 8c3dca341aea ("net/smc: build and send V2 CLC proposal") Signed-off-by: Guangguan Wang Reviewed-by: Wen Gu Reviewed-by: D. Wythe Signed-off-by: David S. Miller --- net/smc/af_smc.c | 3 ++- net/smc/smc_clc.c | 8 +++++++- net/smc/smc_clc.h | 8 +++++++- 3 files changed, 16 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9a74c9693f09..5d96f9de5b5d 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2276,7 +2276,8 @@ static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc, goto not_found; smc_v2_ext = smc_get_clc_v2_ext(pclc); - if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, NULL, NULL)) + if (!smc_v2_ext || + !smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, NULL, NULL)) goto not_found; /* prepare RDMA check */ diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 66a43b97eede..f721d03efcbd 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -352,7 +352,6 @@ static bool smc_clc_msg_prop_valid(struct smc_clc_msg_proposal *pclc) struct smc_clc_msg_hdr *hdr = &pclc->hdr; struct smc_clc_v2_extension *v2_ext; - v2_ext = smc_get_clc_v2_ext(pclc); pclc_prfx = smc_clc_proposal_get_prefix(pclc); if (!pclc_prfx || pclc_prfx->ipv6_prefixes_cnt > SMC_CLC_MAX_V6_PREFIX) @@ -369,6 +368,13 @@ static bool smc_clc_msg_prop_valid(struct smc_clc_msg_proposal *pclc) sizeof(struct smc_clc_msg_trail)) return false; } else { + v2_ext = smc_get_clc_v2_ext(pclc); + if ((hdr->typev2 != SMC_TYPE_N && + (!v2_ext || v2_ext->hdr.eid_cnt > SMC_CLC_MAX_UEID)) || + (smcd_indicated(hdr->typev2) && + v2_ext->hdr.ism_gid_cnt > SMCD_CLC_MAX_V2_GID_ENTRIES)) + return false; + if (ntohs(hdr->length) != sizeof(*pclc) + sizeof(struct smc_clc_msg_smcd) + diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index ac8de6a177fa..23afa4df862e 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -380,8 +380,14 @@ static inline struct smc_clc_v2_extension * smc_get_clc_v2_ext(struct smc_clc_msg_proposal *prop) { struct smc_clc_msg_smcd *prop_smcd = smc_get_clc_msg_smcd(prop); + u16 max_offset; - if (!prop_smcd || !ntohs(prop_smcd->v2_ext_offset)) + max_offset = offsetof(struct smc_clc_msg_proposal_area, pclc_v2_ext) - + offsetof(struct smc_clc_msg_proposal_area, pclc_smcd) - + offsetofend(struct smc_clc_msg_smcd, v2_ext_offset); + + if (!prop_smcd || !ntohs(prop_smcd->v2_ext_offset) || + ntohs(prop_smcd->v2_ext_offset) > max_offset) return NULL; return (struct smc_clc_v2_extension *) -- cgit v1.2.3 From 9ab332deb671d8f7e66d82a2ff2b3f715bc3a4ad Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 11 Dec 2024 17:21:20 +0800 Subject: net/smc: check smcd_v2_ext_offset when receiving proposal msg When receiving proposal msg in server, the field smcd_v2_ext_offset in proposal msg is from the remote client and can not be fully trusted. Once the value of smcd_v2_ext_offset exceed the max value, there has the chance to access wrong address, and crash may happen. This patch checks the value of smcd_v2_ext_offset before using it. Fixes: 5c21c4ccafe8 ("net/smc: determine accepted ISM devices") Signed-off-by: Guangguan Wang Reviewed-by: Wen Gu Reviewed-by: D. Wythe Signed-off-by: David S. Miller --- net/smc/af_smc.c | 2 ++ net/smc/smc_clc.h | 8 +++++++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 5d96f9de5b5d..6cc7b846cff1 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -2147,6 +2147,8 @@ static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, pclc_smcd = smc_get_clc_msg_smcd(pclc); smc_v2_ext = smc_get_clc_v2_ext(pclc); smcd_v2_ext = smc_get_clc_smcd_v2_ext(smc_v2_ext); + if (!pclc_smcd || !smc_v2_ext || !smcd_v2_ext) + goto not_found; mutex_lock(&smcd_dev_list.mutex); if (pclc_smcd->ism.chid) { diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index 23afa4df862e..767289925410 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -400,9 +400,15 @@ smc_get_clc_v2_ext(struct smc_clc_msg_proposal *prop) static inline struct smc_clc_smcd_v2_extension * smc_get_clc_smcd_v2_ext(struct smc_clc_v2_extension *prop_v2ext) { + u16 max_offset = offsetof(struct smc_clc_msg_proposal_area, pclc_smcd_v2_ext) - + offsetof(struct smc_clc_msg_proposal_area, pclc_v2_ext) - + offsetof(struct smc_clc_v2_extension, hdr) - + offsetofend(struct smc_clnt_opts_area_hdr, smcd_v2_ext_offset); + if (!prop_v2ext) return NULL; - if (!ntohs(prop_v2ext->hdr.smcd_v2_ext_offset)) + if (!ntohs(prop_v2ext->hdr.smcd_v2_ext_offset) || + ntohs(prop_v2ext->hdr.smcd_v2_ext_offset) > max_offset) return NULL; return (struct smc_clc_smcd_v2_extension *) -- cgit v1.2.3 From c5b8ee5022a19464783058dc6042e8eefa34e8cd Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Wed, 11 Dec 2024 17:21:21 +0800 Subject: net/smc: check return value of sock_recvmsg when draining clc data When receiving clc msg, the field length in smc_clc_msg_hdr indicates the length of msg should be received from network and the value should not be fully trusted as it is from the network. Once the value of length exceeds the value of buflen in function smc_clc_wait_msg it may run into deadloop when trying to drain the remaining data exceeding buflen. This patch checks the return value of sock_recvmsg when draining data in case of deadloop in draining. Fixes: fb4f79264c0f ("net/smc: tolerate future SMCD versions") Signed-off-by: Guangguan Wang Reviewed-by: Wen Gu Reviewed-by: D. Wythe Signed-off-by: David S. Miller --- net/smc/smc_clc.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index f721d03efcbd..521f5df80e10 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -774,6 +774,11 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, SMC_CLC_RECV_BUF_LEN : datlen; iov_iter_kvec(&msg.msg_iter, ITER_DEST, &vec, 1, recvlen); len = sock_recvmsg(smc->clcsock, &msg, krflags); + if (len < recvlen) { + smc->sk.sk_err = EPROTO; + reason_code = -EPROTO; + goto out; + } datlen -= len; } if (clcm->type == SMC_CLC_DECLINE) { -- cgit v1.2.3 From 18d44c5d062b97b97bb0162d9742440518958dc1 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sat, 7 Dec 2024 17:33:25 +0100 Subject: ceph: allocate sparse_ext map only for sparse reads If mounted with sparseread option, ceph_direct_read_write() ends up making an unnecessarily allocation for O_DIRECT writes. Fixes: 03bc06c7b0bd ("ceph: add new mount option to enable sparse reads") Signed-off-by: Ilya Dryomov Reviewed-by: Alex Markuze --- fs/ceph/file.c | 2 +- net/ceph/osd_client.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 67468d88f139..851d70200c6b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1552,7 +1552,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, } op = &req->r_ops[0]; - if (sparse) { + if (!write && sparse) { extent_cnt = __ceph_sparse_read_ext_count(inode, size); ret = ceph_alloc_sparse_ext_map(op, extent_cnt); if (ret) { diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 9b1168eb77ab..b24afec24138 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1173,6 +1173,8 @@ EXPORT_SYMBOL(ceph_osdc_new_request); int __ceph_alloc_sparse_ext_map(struct ceph_osd_req_op *op, int cnt) { + WARN_ON(op->op != CEPH_OSD_OP_SPARSE_READ); + op->extent.sparse_ext_cnt = cnt; op->extent.sparse_ext = kmalloc_array(cnt, sizeof(*op->extent.sparse_ext), -- cgit v1.2.3 From b1f3a2f5a742c1e939a73031bd31b9e557a2d77d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 13 Dec 2024 07:22:40 -0800 Subject: netdev: fix repeated netlink messages in queue dump The context is supposed to record the next queue to dump, not last dumped. If the dump doesn't fit we will restart from the already-dumped queue, duplicating the message. Before this fix and with the selftest improvements later in this series we see: # ./run_kselftest.sh -t drivers/net:queues.py timeout set to 45 selftests: drivers/net: queues.py KTAP version 1 1..2 # Check| At /root/ksft-net-drv/drivers/net/./queues.py, line 32, in get_queues: # Check| ksft_eq(queues, expected) # Check failed 102 != 100 # Check| At /root/ksft-net-drv/drivers/net/./queues.py, line 32, in get_queues: # Check| ksft_eq(queues, expected) # Check failed 101 != 100 not ok 1 queues.get_queues ok 2 queues.addremove_queues # Totals: pass:1 fail:1 xfail:0 xpass:0 skip:0 error:0 not ok 1 selftests: drivers/net: queues.py # exit=1 With the fix: # ./ksft-net-drv/run_kselftest.sh -t drivers/net:queues.py timeout set to 45 selftests: drivers/net: queues.py KTAP version 1 1..2 ok 1 queues.get_queues ok 2 queues.addremove_queues # Totals: pass:2 fail:0 xfail:0 xpass:0 skip:0 error:0 Fixes: 6b6171db7fc8 ("netdev-genl: Add netlink framework functions for queue") Reviewed-by: Joe Damato Link: https://patch.msgid.link/20241213152244.3080955-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 9527dd46e4dc..9f086b190619 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -488,24 +488,21 @@ netdev_nl_queue_dump_one(struct net_device *netdev, struct sk_buff *rsp, struct netdev_nl_dump_ctx *ctx) { int err = 0; - int i; if (!(netdev->flags & IFF_UP)) return err; - for (i = ctx->rxq_idx; i < netdev->real_num_rx_queues;) { - err = netdev_nl_queue_fill_one(rsp, netdev, i, + for (; ctx->rxq_idx < netdev->real_num_rx_queues; ctx->rxq_idx++) { + err = netdev_nl_queue_fill_one(rsp, netdev, ctx->rxq_idx, NETDEV_QUEUE_TYPE_RX, info); if (err) return err; - ctx->rxq_idx = i++; } - for (i = ctx->txq_idx; i < netdev->real_num_tx_queues;) { - err = netdev_nl_queue_fill_one(rsp, netdev, i, + for (; ctx->txq_idx < netdev->real_num_tx_queues; ctx->txq_idx++) { + err = netdev_nl_queue_fill_one(rsp, netdev, ctx->txq_idx, NETDEV_QUEUE_TYPE_TX, info); if (err) return err; - ctx->txq_idx = i++; } return err; -- cgit v1.2.3 From ecc391a541573da46b7ccc188105efedd40aef1b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 13 Dec 2024 07:22:41 -0800 Subject: netdev: fix repeated netlink messages in queue stats The context is supposed to record the next queue to dump, not last dumped. If the dump doesn't fit we will restart from the already-dumped queue, duplicating the message. Before this fix and with the selftest improvements later in this series we see: # ./run_kselftest.sh -t drivers/net:stats.py timeout set to 45 selftests: drivers/net: stats.py KTAP version 1 1..5 ok 1 stats.check_pause ok 2 stats.check_fec ok 3 stats.pkt_byte_sum # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 125, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), len(set(queues[qtype])), # Check failed 45 != 44 repeated queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 127, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), max(queues[qtype]) + 1, # Check failed 45 != 44 missing queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 125, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), len(set(queues[qtype])), # Check failed 45 != 44 repeated queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 127, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), max(queues[qtype]) + 1, # Check failed 45 != 44 missing queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 125, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), len(set(queues[qtype])), # Check failed 103 != 100 repeated queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 127, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), max(queues[qtype]) + 1, # Check failed 103 != 100 missing queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 125, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), len(set(queues[qtype])), # Check failed 102 != 100 repeated queue keys # Check| At /root/ksft-net-drv/drivers/net/./stats.py, line 127, in qstat_by_ifindex: # Check| ksft_eq(len(queues[qtype]), max(queues[qtype]) + 1, # Check failed 102 != 100 missing queue keys not ok 4 stats.qstat_by_ifindex ok 5 stats.check_down # Totals: pass:4 fail:1 xfail:0 xpass:0 skip:0 error:0 With the fix: # ./ksft-net-drv/run_kselftest.sh -t drivers/net:stats.py timeout set to 45 selftests: drivers/net: stats.py KTAP version 1 1..5 ok 1 stats.check_pause ok 2 stats.check_fec ok 3 stats.pkt_byte_sum ok 4 stats.qstat_by_ifindex ok 5 stats.check_down # Totals: pass:5 fail:0 xfail:0 xpass:0 skip:0 error:0 Fixes: ab63a2387cb9 ("netdev: add per-queue statistics") Reviewed-by: Joe Damato Link: https://patch.msgid.link/20241213152244.3080955-3-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 9f086b190619..1be8c7c21d19 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -668,7 +668,7 @@ netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp, i, info); if (err) return err; - ctx->rxq_idx = i++; + ctx->rxq_idx = ++i; } i = ctx->txq_idx; while (ops->get_queue_stats_tx && i < netdev->real_num_tx_queues) { @@ -676,7 +676,7 @@ netdev_nl_stats_by_queue(struct net_device *netdev, struct sk_buff *rsp, i, info); if (err) return err; - ctx->txq_idx = i++; + ctx->txq_idx = ++i; } ctx->rxq_idx = 0; -- cgit v1.2.3 From 954a2b40719a21e763a1bba2f0da92347e058fce Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 16 Dec 2024 20:04:32 +0900 Subject: rtnetlink: Try the outer netns attribute in rtnl_get_peer_net(). Xiao Liang reported that the cited commit changed netns handling in newlink() of netkit, veth, and vxcan. Before the patch, if we don't find a netns attribute in the peer device attributes, we tried to find another netns attribute in the outer netlink attributes by passing it to rtnl_link_get_net(). Let's restore the original behaviour. Fixes: 48327566769a ("rtnetlink: fix double call of rtnl_link_get_net_ifla()") Reported-by: Xiao Liang Closes: https://lore.kernel.org/netdev/CABAhCORBVVU8P6AHcEkENMj+gD2d3ce9t=A_o48E0yOQp8_wUQ@mail.gmail.com/#t Signed-off-by: Kuniyuki Iwashima Tested-by: Xiao Liang Link: https://patch.msgid.link/20241216110432.51488-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/rtnetlink.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index ebcfc2debf1a..d9f959c619d9 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3819,6 +3819,7 @@ out_unregister: } static struct net *rtnl_get_peer_net(const struct rtnl_link_ops *ops, + struct nlattr *tbp[], struct nlattr *data[], struct netlink_ext_ack *extack) { @@ -3826,7 +3827,7 @@ static struct net *rtnl_get_peer_net(const struct rtnl_link_ops *ops, int err; if (!data || !data[ops->peer_type]) - return NULL; + return rtnl_link_get_net_ifla(tbp); err = rtnl_nla_parse_ifinfomsg(tb, data[ops->peer_type], extack); if (err < 0) @@ -3971,7 +3972,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, } if (ops->peer_type) { - peer_net = rtnl_get_peer_net(ops, data, extack); + peer_net = rtnl_get_peer_net(ops, tb, data, extack); if (IS_ERR(peer_net)) { ret = PTR_ERR(peer_net); goto put_ops; -- cgit v1.2.3 From cf2c97423a4f89c8b798294d3f34ecfe7e7035c3 Mon Sep 17 00:00:00 2001 From: David Laight Date: Sat, 14 Dec 2024 17:30:53 +0000 Subject: ipvs: Fix clamp() of ip_vs_conn_tab on small memory systems The 'max_avail' value is calculated from the system memory size using order_base_2(). order_base_2(x) is defined as '(x) ? fn(x) : 0'. The compiler generates two copies of the code that follows and then expands clamp(max, min, PAGE_SHIFT - 12) (11 on 32bit). This triggers a compile-time assert since min is 5. In reality a system would have to have less than 512MB memory for the bounds passed to clamp to be reversed. Swap the order of the arguments to clamp() to avoid the warning. Replace the clamp_val() on the line below with clamp(). clamp_val() is just 'an accident waiting to happen' and not needed here. Detected by compile time checks added to clamp(), specifically: minmax.h: use BUILD_BUG_ON_MSG() for the lo < hi test in clamp() Reported-by: Linux Kernel Functional Testing Closes: https://lore.kernel.org/all/CA+G9fYsT34UkGFKxus63H6UVpYi5GRZkezT9MRLfAbM3f6ke0g@mail.gmail.com/ Fixes: 4f325e26277b ("ipvs: dynamically limit the connection hash table") Tested-by: Bartosz Golaszewski Reviewed-by: Bartosz Golaszewski Signed-off-by: David Laight Acked-by: Julian Anastasov Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_conn.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 98d7dbe3d787..c0289f83f96d 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1495,8 +1495,8 @@ int __init ip_vs_conn_init(void) max_avail -= 2; /* ~4 in hash row */ max_avail -= 1; /* IPVS up to 1/2 of mem */ max_avail -= order_base_2(sizeof(struct ip_vs_conn)); - max = clamp(max, min, max_avail); - ip_vs_conn_tab_bits = clamp_val(ip_vs_conn_tab_bits, min, max); + max = clamp(max_avail, min, max); + ip_vs_conn_tab_bits = clamp(ip_vs_conn_tab_bits, min, max); ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; ip_vs_conn_tab_mask = ip_vs_conn_tab_size - 1; -- cgit v1.2.3 From 70b6f46a4ed8bd56c85ffff22df91e20e8c85e33 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Tue, 17 Dec 2024 20:56:55 +0100 Subject: netfilter: ipset: Fix for recursive locking warning With CONFIG_PROVE_LOCKING, when creating a set of type bitmap:ip, adding it to a set of type list:set and populating it from iptables SET target triggers a kernel warning: | WARNING: possible recursive locking detected | 6.12.0-rc7-01692-g5e9a28f41134-dirty #594 Not tainted | -------------------------------------------- | ping/4018 is trying to acquire lock: | ffff8881094a6848 (&set->lock){+.-.}-{2:2}, at: ip_set_add+0x28c/0x360 [ip_set] | | but task is already holding lock: | ffff88811034c048 (&set->lock){+.-.}-{2:2}, at: ip_set_add+0x28c/0x360 [ip_set] This is a false alarm: ipset does not allow nested list:set type, so the loop in list_set_kadd() can never encounter the outer set itself. No other set type supports embedded sets, so this is the only case to consider. To avoid the false report, create a distinct lock class for list:set type ipset locks. Fixes: f830837f0eed ("netfilter: ipset: list:set set type support") Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipset/ip_set_list_set.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c index bfae7066936b..db794fe1300e 100644 --- a/net/netfilter/ipset/ip_set_list_set.c +++ b/net/netfilter/ipset/ip_set_list_set.c @@ -611,6 +611,8 @@ init_list_set(struct net *net, struct ip_set *set, u32 size) return true; } +static struct lock_class_key list_set_lockdep_key; + static int list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[], u32 flags) @@ -627,6 +629,7 @@ list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[], if (size < IP_SET_LIST_MIN_SIZE) size = IP_SET_LIST_MIN_SIZE; + lockdep_set_class(&set->lock, &list_set_lockdep_key); set->variant = &set_variant; set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem), __alignof__(struct set_elem)); -- cgit v1.2.3 From 16f027cd40eeedd2325f7e720689462ca8d9d13e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 16 Dec 2024 15:50:59 +0200 Subject: net: dsa: restore dsa_software_vlan_untag() ability to operate on VLAN-untagged traffic Robert Hodaszi reports that locally terminated traffic towards VLAN-unaware bridge ports is broken with ocelot-8021q. He is describing the same symptoms as for commit 1f9fc48fd302 ("net: dsa: sja1105: fix reception from VLAN-unaware bridges"). For context, the set merged as "VLAN fixes for Ocelot driver": https://lore.kernel.org/netdev/20240815000707.2006121-1-vladimir.oltean@nxp.com/ was developed in a slightly different form earlier this year, in January. Initially, the switch was unconditionally configured to set OCELOT_ES0_TAG when using ocelot-8021q, regardless of port operating mode. This led to the situation where VLAN-unaware bridge ports would always push their PVID - see ocelot_vlan_unaware_pvid() - a negligible value anyway - into RX packets. To strip this in software, we would have needed DSA to know what private VID the switch chose for VLAN-unaware bridge ports, and pushed into the packets. This was implemented downstream, and a remnant of it remains in the form of a comment mentioning ds->ops->get_private_vid(), as something which would maybe need to be considered in the future. However, for upstream, it was deemed inappropriate, because it would mean introducing yet another behavior for stripping VLAN tags from VLAN-unaware bridge ports, when one already existed (ds->untag_bridge_pvid). The latter has been marked as obsolete along with an explanation why it is logically broken, but still, it would have been confusing. So, for upstream, felix_update_tag_8021q_rx_rule() was developed, which essentially changed the state of affairs from "Felix with ocelot-8021q delivers all packets as VLAN-tagged towards the CPU" into "Felix with ocelot-8021q delivers all packets from VLAN-aware bridge ports towards the CPU". This was done on the premise that in VLAN-unaware mode, there's nothing useful in the VLAN tags, and we can avoid introducing ds->ops->get_private_vid() in the DSA receive path if we configure the switch to not push those VLAN tags into packets in the first place. Unfortunately, and this is when the trainwreck started, the selftests developed initially and posted with the series were not re-ran. dsa_software_vlan_untag() was initially written given the assumption that users of this feature would send _all_ traffic as VLAN-tagged. It was only partially adapted to the new scheme, by removing ds->ops->get_private_vid(), which also used to be necessary in standalone ports mode. Where the trainwreck became even worse is that I had a second opportunity to think about this, when the dsa_software_vlan_untag() logic change initially broke sja1105, in commit 1f9fc48fd302 ("net: dsa: sja1105: fix reception from VLAN-unaware bridges"). I did not connect the dots that it also breaks ocelot-8021q, for pretty much the same reason that not all received packets will be VLAN-tagged. To be compatible with the optimized Felix control path which runs felix_update_tag_8021q_rx_rule() to only push VLAN tags when useful (in VLAN-aware mode), we need to restore the old dsa_software_vlan_untag() logic. The blamed commit introduced the assumption that dsa_software_vlan_untag() will see only VLAN-tagged packets, assumption which is false. What corrupts RX traffic is the fact that we call skb_vlan_untag() on packets which are not VLAN-tagged in the first place. Fixes: 93e4649efa96 ("net: dsa: provide a software untagging function on RX for VLAN-aware bridges") Reported-by: Robert Hodaszi Closes: https://lore.kernel.org/netdev/20241215163334.615427-1-robert.hodaszi@digi.com/ Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20241216135059.1258266-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- net/dsa/tag.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/dsa/tag.h b/net/dsa/tag.h index d5707870906b..5d80ddad4ff6 100644 --- a/net/dsa/tag.h +++ b/net/dsa/tag.h @@ -138,9 +138,10 @@ static inline void dsa_software_untag_vlan_unaware_bridge(struct sk_buff *skb, * dsa_software_vlan_untag: Software VLAN untagging in DSA receive path * @skb: Pointer to socket buffer (packet) * - * Receive path method for switches which cannot avoid tagging all packets - * towards the CPU port. Called when ds->untag_bridge_pvid (legacy) or - * ds->untag_vlan_aware_bridge_pvid is set to true. + * Receive path method for switches which send some packets as VLAN-tagged + * towards the CPU port (generally from VLAN-aware bridge ports) even when the + * packet was not tagged on the wire. Called when ds->untag_bridge_pvid + * (legacy) or ds->untag_vlan_aware_bridge_pvid is set to true. * * As a side effect of this method, any VLAN tag from the skb head is moved * to hwaccel. @@ -149,14 +150,19 @@ static inline struct sk_buff *dsa_software_vlan_untag(struct sk_buff *skb) { struct dsa_port *dp = dsa_user_to_port(skb->dev); struct net_device *br = dsa_port_bridge_dev_get(dp); - u16 vid; + u16 vid, proto; + int err; /* software untagging for standalone ports not yet necessary */ if (!br) return skb; + err = br_vlan_get_proto(br, &proto); + if (err) + return skb; + /* Move VLAN tag from data to hwaccel */ - if (!skb_vlan_tag_present(skb)) { + if (!skb_vlan_tag_present(skb) && skb->protocol == htons(proto)) { skb = skb_vlan_untag(skb); if (!skb) return NULL; -- cgit v1.2.3 From 5eb70dbebf32c2fd1f2814c654ae17fc47d6e859 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 17 Dec 2024 18:25:08 -0800 Subject: netdev-genl: avoid empty messages in queue dump Empty netlink responses from do() are not correct (as opposed to dump() where not dumping anything is perfectly fine). We should return an error if the target object does not exist, in this case if the netdev is down it has no queues. Fixes: 6b6171db7fc8 ("netdev-genl: Add netlink framework functions for queue") Reported-by: syzbot+0a884bc2d304ce4af70f@syzkaller.appspotmail.com Reviewed-by: Eric Dumazet Reviewed-by: Joe Damato Link: https://patch.msgid.link/20241218022508.815344-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 1be8c7c21d19..2d3ae0cd3ad2 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -430,10 +430,10 @@ static int netdev_nl_queue_fill(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx, u32 q_type, const struct genl_info *info) { - int err = 0; + int err; if (!(netdev->flags & IFF_UP)) - return err; + return -ENOENT; err = netdev_nl_queue_validate(netdev, q_idx, q_type); if (err) -- cgit v1.2.3 From 5eecd85c77a254a43bde3212da8047b001745c9f Mon Sep 17 00:00:00 2001 From: Adrian Moreno Date: Tue, 17 Dec 2024 12:37:39 +0100 Subject: psample: adjust size if rate_as_probability is set If PSAMPLE_ATTR_SAMPLE_PROBABILITY flag is to be sent, the available size for the packet data has to be adjusted accordingly. Also, check the error code returned by nla_put_flag. Fixes: 7b1b2b60c63f ("net: psample: allow using rate as probability") Signed-off-by: Adrian Moreno Reviewed-by: Aaron Conole Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20241217113739.3929300-1-amorenoz@redhat.com Signed-off-by: Jakub Kicinski --- net/psample/psample.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/psample/psample.c b/net/psample/psample.c index a0ddae8a65f9..25f92ba0840c 100644 --- a/net/psample/psample.c +++ b/net/psample/psample.c @@ -393,7 +393,9 @@ void psample_sample_packet(struct psample_group *group, nla_total_size_64bit(sizeof(u64)) + /* timestamp */ nla_total_size(sizeof(u16)) + /* protocol */ (md->user_cookie_len ? - nla_total_size(md->user_cookie_len) : 0); /* user cookie */ + nla_total_size(md->user_cookie_len) : 0) + /* user cookie */ + (md->rate_as_probability ? + nla_total_size(0) : 0); /* rate as probability */ #ifdef CONFIG_INET tun_info = skb_tunnel_info(skb); @@ -498,8 +500,9 @@ void psample_sample_packet(struct psample_group *group, md->user_cookie)) goto error; - if (md->rate_as_probability) - nla_put_flag(nl_skb, PSAMPLE_ATTR_SAMPLE_PROBABILITY); + if (md->rate_as_probability && + nla_put_flag(nl_skb, PSAMPLE_ATTR_SAMPLE_PROBABILITY)) + goto error; genlmsg_end(nl_skb, data); genlmsg_multicast_netns(&psample_nl_family, group->net, nl_skb, 0, -- cgit v1.2.3 From ce1219c3f76bb131d095e90521506d3c6ccfa086 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Wed, 18 Dec 2024 11:53:01 +0800 Subject: net: mctp: handle skb cleanup on sock_queue failures Currently, we don't use the return value from sock_queue_rcv_skb, which means we may leak skbs if a message is not successfully queued to a socket. Instead, ensure that we're freeing the skb where the sock hasn't otherwise taken ownership of the skb by adding checks on the sock_queue_rcv_skb() to invoke a kfree on failure. In doing so, rather than using the 'rc' value to trigger the kfree_skb(), use the skb pointer itself, which is more explicit. Also, add a kunit test for the sock delivery failure cases. Fixes: 4a992bbd3650 ("mctp: Implement message fragmentation & reassembly") Cc: stable@vger.kernel.org Signed-off-by: Jeremy Kerr Link: https://patch.msgid.link/20241218-mctp-next-v2-1-1c1729645eaa@codeconstruct.com.au Signed-off-by: Paolo Abeni --- net/mctp/route.c | 36 +++++++++++++------ net/mctp/test/route-test.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/mctp/route.c b/net/mctp/route.c index 597e9cf5aa64..3f2bd65ff5e3 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -374,8 +374,13 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) msk = NULL; rc = -EINVAL; - /* we may be receiving a locally-routed packet; drop source sk - * accounting + /* We may be receiving a locally-routed packet; drop source sk + * accounting. + * + * From here, we will either queue the skb - either to a frag_queue, or + * to a receiving socket. When that succeeds, we clear the skb pointer; + * a non-NULL skb on exit will be otherwise unowned, and hence + * kfree_skb()-ed. */ skb_orphan(skb); @@ -434,7 +439,9 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) * pending key. */ if (flags & MCTP_HDR_FLAG_EOM) { - sock_queue_rcv_skb(&msk->sk, skb); + rc = sock_queue_rcv_skb(&msk->sk, skb); + if (!rc) + skb = NULL; if (key) { /* we've hit a pending reassembly; not much we * can do but drop it @@ -443,7 +450,6 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) MCTP_TRACE_KEY_REPLIED); key = NULL; } - rc = 0; goto out_unlock; } @@ -470,8 +476,10 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) * this function. */ rc = mctp_key_add(key, msk); - if (!rc) + if (!rc) { trace_mctp_key_acquire(key); + skb = NULL; + } /* we don't need to release key->lock on exit, so * clean up here and suppress the unlock via @@ -489,6 +497,8 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) key = NULL; } else { rc = mctp_frag_queue(key, skb); + if (!rc) + skb = NULL; } } @@ -503,12 +513,19 @@ static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) else rc = mctp_frag_queue(key, skb); + if (rc) + goto out_unlock; + + /* we've queued; the queue owns the skb now */ + skb = NULL; + /* end of message? deliver to socket, and we're done with * the reassembly/response key */ - if (!rc && flags & MCTP_HDR_FLAG_EOM) { - sock_queue_rcv_skb(key->sk, key->reasm_head); - key->reasm_head = NULL; + if (flags & MCTP_HDR_FLAG_EOM) { + rc = sock_queue_rcv_skb(key->sk, key->reasm_head); + if (!rc) + key->reasm_head = NULL; __mctp_key_done_in(key, net, f, MCTP_TRACE_KEY_REPLIED); key = NULL; } @@ -527,8 +544,7 @@ out_unlock: if (any_key) mctp_key_unref(any_key); out: - if (rc) - kfree_skb(skb); + kfree_skb(skb); return rc; } diff --git a/net/mctp/test/route-test.c b/net/mctp/test/route-test.c index 8551dab1d1e6..17165b86ce22 100644 --- a/net/mctp/test/route-test.c +++ b/net/mctp/test/route-test.c @@ -837,6 +837,90 @@ static void mctp_test_route_input_multiple_nets_key(struct kunit *test) mctp_test_route_input_multiple_nets_key_fini(test, &t2); } +/* Input route to socket, using a single-packet message, where sock delivery + * fails. Ensure we're handling the failure appropriately. + */ +static void mctp_test_route_input_sk_fail_single(struct kunit *test) +{ + const struct mctp_hdr hdr = RX_HDR(1, 10, 8, FL_S | FL_E | FL_TO); + struct mctp_test_route *rt; + struct mctp_test_dev *dev; + struct socket *sock; + struct sk_buff *skb; + int rc; + + __mctp_route_test_init(test, &dev, &rt, &sock, MCTP_NET_ANY); + + /* No rcvbuf space, so delivery should fail. __sock_set_rcvbuf will + * clamp the minimum to SOCK_MIN_RCVBUF, so we open-code this. + */ + lock_sock(sock->sk); + WRITE_ONCE(sock->sk->sk_rcvbuf, 0); + release_sock(sock->sk); + + skb = mctp_test_create_skb(&hdr, 10); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb); + skb_get(skb); + + mctp_test_skb_set_dev(skb, dev); + + /* do route input, which should fail */ + rc = mctp_route_input(&rt->rt, skb); + KUNIT_EXPECT_NE(test, rc, 0); + + /* we should hold the only reference to skb */ + KUNIT_EXPECT_EQ(test, refcount_read(&skb->users), 1); + kfree_skb(skb); + + __mctp_route_test_fini(test, dev, rt, sock); +} + +/* Input route to socket, using a fragmented message, where sock delivery fails. + */ +static void mctp_test_route_input_sk_fail_frag(struct kunit *test) +{ + const struct mctp_hdr hdrs[2] = { RX_FRAG(FL_S, 0), RX_FRAG(FL_E, 1) }; + struct mctp_test_route *rt; + struct mctp_test_dev *dev; + struct sk_buff *skbs[2]; + struct socket *sock; + unsigned int i; + int rc; + + __mctp_route_test_init(test, &dev, &rt, &sock, MCTP_NET_ANY); + + lock_sock(sock->sk); + WRITE_ONCE(sock->sk->sk_rcvbuf, 0); + release_sock(sock->sk); + + for (i = 0; i < ARRAY_SIZE(skbs); i++) { + skbs[i] = mctp_test_create_skb(&hdrs[i], 10); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skbs[i]); + skb_get(skbs[i]); + + mctp_test_skb_set_dev(skbs[i], dev); + } + + /* first route input should succeed, we're only queueing to the + * frag list + */ + rc = mctp_route_input(&rt->rt, skbs[0]); + KUNIT_EXPECT_EQ(test, rc, 0); + + /* final route input should fail to deliver to the socket */ + rc = mctp_route_input(&rt->rt, skbs[1]); + KUNIT_EXPECT_NE(test, rc, 0); + + /* we should hold the only reference to both skbs */ + KUNIT_EXPECT_EQ(test, refcount_read(&skbs[0]->users), 1); + kfree_skb(skbs[0]); + + KUNIT_EXPECT_EQ(test, refcount_read(&skbs[1]->users), 1); + kfree_skb(skbs[1]); + + __mctp_route_test_fini(test, dev, rt, sock); +} + #if IS_ENABLED(CONFIG_MCTP_FLOWS) static void mctp_test_flow_init(struct kunit *test, @@ -1053,6 +1137,8 @@ static struct kunit_case mctp_test_cases[] = { mctp_route_input_sk_reasm_gen_params), KUNIT_CASE_PARAM(mctp_test_route_input_sk_keys, mctp_route_input_sk_keys_gen_params), + KUNIT_CASE(mctp_test_route_input_sk_fail_single), + KUNIT_CASE(mctp_test_route_input_sk_fail_frag), KUNIT_CASE(mctp_test_route_input_multiple_nets_bind), KUNIT_CASE(mctp_test_route_input_multiple_nets_key), KUNIT_CASE(mctp_test_packet_flow), -- cgit v1.2.3 From 54f89b3178d5448dd4457afbb98fc1ab99090a65 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 10 Dec 2024 01:20:38 +0000 Subject: tcp_bpf: Charge receive socket buffer in bpf_tcp_ingress() When bpf_tcp_ingress() is called, the skmsg is being redirected to the ingress of the destination socket. Therefore, we should charge its receive socket buffer, instead of sending socket buffer. Because sk_rmem_schedule() tests pfmemalloc of skb, we need to introduce a wrapper and call it for skmsg. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241210012039.1669389-2-zijianzhang@bytedance.com --- include/net/sock.h | 10 ++++++++-- net/ipv4/tcp_bpf.c | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index 7464e9f9f47c..c383126f691d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1527,7 +1527,7 @@ static inline bool sk_wmem_schedule(struct sock *sk, int size) } static inline bool -sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) +__sk_rmem_schedule(struct sock *sk, int size, bool pfmemalloc) { int delta; @@ -1535,7 +1535,13 @@ sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) return true; delta = size - sk->sk_forward_alloc; return delta <= 0 || __sk_mem_schedule(sk, delta, SK_MEM_RECV) || - skb_pfmemalloc(skb); + pfmemalloc; +} + +static inline bool +sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) +{ + return __sk_rmem_schedule(sk, size, skb_pfmemalloc(skb)); } static inline int sk_unused_reserved_mem(const struct sock *sk) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 99cef92e6290..b21ea634909c 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -49,7 +49,7 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, sge = sk_msg_elem(msg, i); size = (apply && apply_bytes < sge->length) ? apply_bytes : sge->length; - if (!sk_wmem_schedule(sk, size)) { + if (!__sk_rmem_schedule(sk, size, false)) { if (!copied) ret = -ENOMEM; break; -- cgit v1.2.3 From d888b7af7c149c115dd6ac772cc11c375da3e17c Mon Sep 17 00:00:00 2001 From: Zijian Zhang Date: Tue, 10 Dec 2024 01:20:39 +0000 Subject: tcp_bpf: Add sk_rmem_alloc related logic for tcp_bpf ingress redirection When we do sk_psock_verdict_apply->sk_psock_skb_ingress, an sk_msg will be created out of the skb, and the rmem accounting of the sk_msg will be handled by the skb. For skmsgs in __SK_REDIRECT case of tcp_bpf_send_verdict, when redirecting to the ingress of a socket, although we sk_rmem_schedule and add sk_msg to the ingress_msg of sk_redir, we do not update sk_rmem_alloc. As a result, except for the global memory limit, the rmem of sk_redir is nearly unlimited. Thus, add sk_rmem_alloc related logic to limit the recv buffer. Since the function sk_msg_recvmsg and __sk_psock_purge_ingress_msg are used in these two paths. We use "msg->skb" to test whether the sk_msg is skb backed up. If it's not, we shall do the memory accounting explicitly. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Zijian Zhang Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241210012039.1669389-3-zijianzhang@bytedance.com --- include/linux/skmsg.h | 11 ++++++++--- net/core/skmsg.c | 6 +++++- net/ipv4/tcp_bpf.c | 4 +++- 3 files changed, 16 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index d9b03e0746e7..2cbe0c22a32f 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -317,17 +317,22 @@ static inline void sock_drop(struct sock *sk, struct sk_buff *skb) kfree_skb(skb); } -static inline void sk_psock_queue_msg(struct sk_psock *psock, +static inline bool sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { + bool ret; + spin_lock_bh(&psock->ingress_lock); - if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { list_add_tail(&msg->list, &psock->ingress_msg); - else { + ret = true; + } else { sk_msg_free(psock->sk, msg); kfree(msg); + ret = false; } spin_unlock_bh(&psock->ingress_lock); + return ret; } static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index e90fbab703b2..8ad7e6755fd6 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -445,8 +445,10 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, if (likely(!peek)) { sge->offset += copy; sge->length -= copy; - if (!msg_rx->skb) + if (!msg_rx->skb) { sk_mem_uncharge(sk, copy); + atomic_sub(copy, &sk->sk_rmem_alloc); + } msg_rx->sg.size -= copy; if (!sge->length) { @@ -772,6 +774,8 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) { list_del(&msg->list); + if (!msg->skb) + atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc); sk_msg_free(psock->sk, msg); kfree(msg); } diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index b21ea634909c..392678ae80f4 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -56,6 +56,7 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, } sk_mem_charge(sk, size); + atomic_add(size, &sk->sk_rmem_alloc); sk_msg_xfer(tmp, msg, i, size); copied += size; if (sge->length) @@ -74,7 +75,8 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, if (!ret) { msg->sg.start = i; - sk_psock_queue_msg(psock, tmp); + if (!sk_psock_queue_msg(psock, tmp)) + atomic_sub(copied, &sk->sk_rmem_alloc); sk_psock_data_ready(sk, psock); } else { sk_msg_free(sk, tmp); -- cgit v1.2.3 From 4a25201aa46ce88e8e31f9ccdec0e4e3dd6bb736 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 18 Dec 2024 19:28:32 -0800 Subject: netdev-genl: avoid empty messages in napi get Empty netlink responses from do() are not correct (as opposed to dump() where not dumping anything is perfectly fine). We should return an error if the target object does not exist, in this case if the netdev is down we "hide" the NAPI instances. Fixes: 27f91aaf49b3 ("netdev-genl: Add netlink framework functions for napi") Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241219032833.1165433-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 2d3ae0cd3ad2..b0772d135efb 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -246,8 +246,12 @@ int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) rcu_read_unlock(); rtnl_unlock(); - if (err) + if (err) { + goto err_free_msg; + } else if (!rsp->len) { + err = -ENOENT; goto err_free_msg; + } return genlmsg_reply(rsp, info); -- cgit v1.2.3 From fdf478d236dcf0f1f68534df5d456ced625195bd Mon Sep 17 00:00:00 2001 From: Levi Zim Date: Sat, 30 Nov 2024 21:38:22 +0800 Subject: skmsg: Return copied bytes in sk_msg_memcopy_from_iter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously sk_msg_memcopy_from_iter returns the copied bytes from the last copy_from_iter{,_nocache} call upon success. This commit changes it to return the total number of copied bytes on success. Signed-off-by: Levi Zim Signed-off-by: Daniel Borkmann Tested-by: Björn Töpel Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241130-tcp-bpf-sendmsg-v1-1-bae583d014f3@outlook.com --- net/core/skmsg.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 8ad7e6755fd6..61f3f3d4e528 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -369,8 +369,8 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes) { int ret = -ENOSPC, i = msg->sg.curr; + u32 copy, buf_size, copied = 0; struct scatterlist *sge; - u32 copy, buf_size; void *to; do { @@ -397,6 +397,7 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, goto out; } bytes -= copy; + copied += copy; if (!bytes) break; msg->sg.copybreak = 0; @@ -404,7 +405,7 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, } while (i != msg->sg.end); out: msg->sg.curr = i; - return ret; + return (ret < 0) ? ret : copied; } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); -- cgit v1.2.3 From 5153a75ef34b3f7478ca918044d0f05eed8fb3f9 Mon Sep 17 00:00:00 2001 From: Levi Zim Date: Sat, 30 Nov 2024 21:38:23 +0800 Subject: tcp_bpf: Fix copied value in tcp_bpf_sendmsg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bpf kselftest sockhash::test_txmsg_cork_hangs in test_sockmap.c triggers a kernel NULL pointer dereference: BUG: kernel NULL pointer dereference, address: 0000000000000008 ? __die_body+0x6e/0xb0 ? __die+0x8b/0xa0 ? page_fault_oops+0x358/0x3c0 ? local_clock+0x19/0x30 ? lock_release+0x11b/0x440 ? kernelmode_fixup_or_oops+0x54/0x60 ? __bad_area_nosemaphore+0x4f/0x210 ? mmap_read_unlock+0x13/0x30 ? bad_area_nosemaphore+0x16/0x20 ? do_user_addr_fault+0x6fd/0x740 ? prb_read_valid+0x1d/0x30 ? exc_page_fault+0x55/0xd0 ? asm_exc_page_fault+0x2b/0x30 ? splice_to_socket+0x52e/0x630 ? shmem_file_splice_read+0x2b1/0x310 direct_splice_actor+0x47/0x70 splice_direct_to_actor+0x133/0x300 ? do_splice_direct+0x90/0x90 do_splice_direct+0x64/0x90 ? __ia32_sys_tee+0x30/0x30 do_sendfile+0x214/0x300 __se_sys_sendfile64+0x8e/0xb0 __x64_sys_sendfile64+0x25/0x30 x64_sys_call+0xb82/0x2840 do_syscall_64+0x75/0x110 entry_SYSCALL_64_after_hwframe+0x4b/0x53 This is caused by tcp_bpf_sendmsg() returning a larger value(12289) than size (8192), which causes the while loop in splice_to_socket() to release an uninitialized pipe buf. The underlying cause is that this code assumes sk_msg_memcopy_from_iter() will copy all bytes upon success but it actually might only copy part of it. This commit changes it to use the real copied bytes. Signed-off-by: Levi Zim Signed-off-by: Daniel Borkmann Tested-by: Björn Töpel Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241130-tcp-bpf-sendmsg-v1-2-bae583d014f3@outlook.com --- net/ipv4/tcp_bpf.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 392678ae80f4..47f65b1b70ca 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -495,7 +495,7 @@ more_data: static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct sk_msg tmp, *msg_tx = NULL; - int copied = 0, err = 0; + int copied = 0, err = 0, ret = 0; struct sk_psock *psock; long timeo; int flags; @@ -538,14 +538,14 @@ static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) copy = msg_tx->sg.size - osize; } - err = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx, + ret = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx, copy); - if (err < 0) { + if (ret < 0) { sk_msg_trim(sk, msg_tx, osize); goto out_err; } - copied += copy; + copied += ret; if (psock->cork_bytes) { if (size > psock->cork_bytes) psock->cork_bytes = 0; -- cgit v1.2.3 From 9ecc4d858b92c1bb0673ad9c327298e600c55659 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 12 Dec 2024 19:40:54 -0800 Subject: bpf: Check negative offsets in __bpf_skb_min_len() skb_network_offset() and skb_transport_offset() can be negative when they are called after we pull the transport header, for example, when we use eBPF sockmap at the point of ->sk_data_ready(). __bpf_skb_min_len() uses an unsigned int to get these offsets, this leads to a very large number which then causes bpf_skb_change_tail() failed unexpectedly. Fix this by using a signed int to get these offsets and ensure the minimum is at least zero. Fixes: 5293efe62df8 ("bpf: add bpf_skb_change_tail helper") Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20241213034057.246437-2-xiyou.wangcong@gmail.com --- net/core/filter.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 21131ec25f24..834614071727 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3734,13 +3734,22 @@ static const struct bpf_func_proto bpf_skb_adjust_room_proto = { static u32 __bpf_skb_min_len(const struct sk_buff *skb) { - u32 min_len = skb_network_offset(skb); + int offset = skb_network_offset(skb); + u32 min_len = 0; - if (skb_transport_header_was_set(skb)) - min_len = skb_transport_offset(skb); - if (skb->ip_summed == CHECKSUM_PARTIAL) - min_len = skb_checksum_start_offset(skb) + - skb->csum_offset + sizeof(__sum16); + if (offset > 0) + min_len = offset; + if (skb_transport_header_was_set(skb)) { + offset = skb_transport_offset(skb); + if (offset > 0) + min_len = offset; + } + if (skb->ip_summed == CHECKSUM_PARTIAL) { + offset = skb_checksum_start_offset(skb) + + skb->csum_offset + sizeof(__sum16); + if (offset > 0) + min_len = offset; + } return min_len; } -- cgit v1.2.3 From 4f4aa4aa28142d53f8b06585c478476cfe325cfc Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Thu, 19 Dec 2024 15:28:59 +0800 Subject: net: fix memory leak in tcp_conn_request() If inet_csk_reqsk_queue_hash_add() return false, tcp_conn_request() will return without free the dst memory, which allocated in af_ops->route_req. Here is the kmemleak stack: unreferenced object 0xffff8881198631c0 (size 240): comm "softirq", pid 0, jiffies 4299266571 (age 1802.392s) hex dump (first 32 bytes): 00 10 9b 03 81 88 ff ff 80 98 da bc ff ff ff ff ................ 81 55 18 bb ff ff ff ff 00 00 00 00 00 00 00 00 .U.............. backtrace: [] kmem_cache_alloc+0x60c/0xa80 [] dst_alloc+0x55/0x250 [] rt_dst_alloc+0x46/0x1d0 [] __mkroute_output+0x29a/0xa50 [] ip_route_output_key_hash+0x10b/0x240 [] ip_route_output_flow+0x1d/0x90 [] inet_csk_route_req+0x2c5/0x500 [] tcp_conn_request+0x691/0x12c0 [] tcp_rcv_state_process+0x3c8/0x11b0 [] tcp_v4_do_rcv+0x156/0x3b0 [] tcp_v4_rcv+0x1cf8/0x1d80 [] ip_protocol_deliver_rcu+0xf6/0x360 [] ip_local_deliver_finish+0xe6/0x1e0 [] ip_local_deliver+0xee/0x360 [] ip_rcv+0xad/0x2f0 [] __netif_receive_skb_one_core+0x123/0x140 Call dst_release() to free the dst memory when inet_csk_reqsk_queue_hash_add() return false in tcp_conn_request(). Fixes: ff46e3b44219 ("Fix race for duplicate reqsk on identical SYN") Signed-off-by: Wang Liang Link: https://patch.msgid.link/20241219072859.3783576-1-wangliang74@huawei.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_input.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5bdf13ac26ef..4811727b8a02 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -7328,6 +7328,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req, req->timeout))) { reqsk_free(req); + dst_release(dst); return 0; } -- cgit v1.2.3 From b5a7b661a073727219fedc35f5619f62418ffe72 Mon Sep 17 00:00:00 2001 From: Xiao Liang Date: Thu, 19 Dec 2024 21:03:36 +0800 Subject: net: Fix netns for ip_tunnel_init_flow() The device denoted by tunnel->parms.link resides in the underlay net namespace. Therefore pass tunnel->net to ip_tunnel_init_flow(). Fixes: db53cd3d88dc ("net: Handle l3mdev in ip_tunnel_init_flow") Signed-off-by: Xiao Liang Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20241219130336.103839-1-shaw.leon@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c | 3 +-- net/ipv4/ip_tunnel.c | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c index 4b5fd71c897d..32d2e61f2b82 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_span.c @@ -423,8 +423,7 @@ mlxsw_sp_span_gretap4_route(const struct net_device *to_dev, parms = mlxsw_sp_ipip_netdev_parms4(to_dev); ip_tunnel_init_flow(&fl4, parms.iph.protocol, *daddrp, *saddrp, - 0, 0, dev_net(to_dev), parms.link, tun->fwmark, 0, - 0); + 0, 0, tun->net, parms.link, tun->fwmark, 0, 0); rt = ip_route_output_key(tun->net, &fl4); if (IS_ERR(rt)) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 25505f9b724c..09b73acf037a 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -294,7 +294,7 @@ static int ip_tunnel_bind_dev(struct net_device *dev) ip_tunnel_init_flow(&fl4, iph->protocol, iph->daddr, iph->saddr, tunnel->parms.o_key, - iph->tos & INET_DSCP_MASK, dev_net(dev), + iph->tos & INET_DSCP_MASK, tunnel->net, tunnel->parms.link, tunnel->fwmark, 0, 0); rt = ip_route_output_key(tunnel->net, &fl4); @@ -611,7 +611,7 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } ip_tunnel_init_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, tunnel_id_to_key32(key->tun_id), - tos & INET_DSCP_MASK, dev_net(dev), 0, skb->mark, + tos & INET_DSCP_MASK, tunnel->net, 0, skb->mark, skb_get_hash(skb), key->flow_flags); if (!tunnel_hlen) @@ -774,7 +774,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr, tunnel->parms.o_key, tos & INET_DSCP_MASK, - dev_net(dev), READ_ONCE(tunnel->parms.link), + tunnel->net, READ_ONCE(tunnel->parms.link), tunnel->fwmark, skb_get_hash(skb), 0); if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0) -- cgit v1.2.3 From a4fd163aed2edd967a244499754dec991d8b4c7d Mon Sep 17 00:00:00 2001 From: Ilya Shchipletsov Date: Thu, 19 Dec 2024 08:23:07 +0000 Subject: netrom: check buffer length before accessing it Syzkaller reports an uninit value read from ax25cmp when sending raw message through ieee802154 implementation. ===================================================== BUG: KMSAN: uninit-value in ax25cmp+0x3a5/0x460 net/ax25/ax25_addr.c:119 ax25cmp+0x3a5/0x460 net/ax25/ax25_addr.c:119 nr_dev_get+0x20e/0x450 net/netrom/nr_route.c:601 nr_route_frame+0x1a2/0xfc0 net/netrom/nr_route.c:774 nr_xmit+0x5a/0x1c0 net/netrom/nr_dev.c:144 __netdev_start_xmit include/linux/netdevice.h:4940 [inline] netdev_start_xmit include/linux/netdevice.h:4954 [inline] xmit_one net/core/dev.c:3548 [inline] dev_hard_start_xmit+0x247/0xa10 net/core/dev.c:3564 __dev_queue_xmit+0x33b8/0x5130 net/core/dev.c:4349 dev_queue_xmit include/linux/netdevice.h:3134 [inline] raw_sendmsg+0x654/0xc10 net/ieee802154/socket.c:299 ieee802154_sock_sendmsg+0x91/0xc0 net/ieee802154/socket.c:96 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] ____sys_sendmsg+0x9c2/0xd60 net/socket.c:2584 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2638 __sys_sendmsg net/socket.c:2667 [inline] __do_sys_sendmsg net/socket.c:2676 [inline] __se_sys_sendmsg net/socket.c:2674 [inline] __x64_sys_sendmsg+0x307/0x490 net/socket.c:2674 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b Uninit was created at: slab_post_alloc_hook+0x129/0xa70 mm/slab.h:768 slab_alloc_node mm/slub.c:3478 [inline] kmem_cache_alloc_node+0x5e9/0xb10 mm/slub.c:3523 kmalloc_reserve+0x13d/0x4a0 net/core/skbuff.c:560 __alloc_skb+0x318/0x740 net/core/skbuff.c:651 alloc_skb include/linux/skbuff.h:1286 [inline] alloc_skb_with_frags+0xc8/0xbd0 net/core/skbuff.c:6334 sock_alloc_send_pskb+0xa80/0xbf0 net/core/sock.c:2780 sock_alloc_send_skb include/net/sock.h:1884 [inline] raw_sendmsg+0x36d/0xc10 net/ieee802154/socket.c:282 ieee802154_sock_sendmsg+0x91/0xc0 net/ieee802154/socket.c:96 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg net/socket.c:745 [inline] ____sys_sendmsg+0x9c2/0xd60 net/socket.c:2584 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2638 __sys_sendmsg net/socket.c:2667 [inline] __do_sys_sendmsg net/socket.c:2676 [inline] __se_sys_sendmsg net/socket.c:2674 [inline] __x64_sys_sendmsg+0x307/0x490 net/socket.c:2674 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x63/0x6b CPU: 0 PID: 5037 Comm: syz-executor166 Not tainted 6.7.0-rc7-syzkaller-00003-gfbafc3e621c3 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/17/2023 ===================================================== This issue occurs because the skb buffer is too small, and it's actual allocation is aligned. This hides an actual issue, which is that nr_route_frame does not validate the buffer size before using it. Fix this issue by checking skb->len before accessing any fields in skb->data. Found by Linux Verification Center (linuxtesting.org) with Syzkaller. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Co-developed-by: Nikita Marushkin Signed-off-by: Nikita Marushkin Signed-off-by: Ilya Shchipletsov Link: https://patch.msgid.link/20241219082308.3942-1-rabbelkin@mail.ru Signed-off-by: Jakub Kicinski --- net/netrom/nr_route.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c index 2b5e246b8d9a..b94cb2ffbaf8 100644 --- a/net/netrom/nr_route.c +++ b/net/netrom/nr_route.c @@ -754,6 +754,12 @@ int nr_route_frame(struct sk_buff *skb, ax25_cb *ax25) int ret; struct sk_buff *skbn; + /* + * Reject malformed packets early. Check that it contains at least 2 + * addresses and 1 byte more for Time-To-Live + */ + if (skb->len < 2 * sizeof(ax25_address) + 1) + return 0; nr_src = (ax25_address *)(skb->data + 0); nr_dest = (ax25_address *)(skb->data + 7); -- cgit v1.2.3 From 4e86729d1ff329815a6e8a920cb554a1d4cb5b8d Mon Sep 17 00:00:00 2001 From: Nikolay Kuratov Date: Thu, 19 Dec 2024 19:21:14 +0300 Subject: net/sctp: Prevent autoclose integer overflow in sctp_association_init() While by default max_autoclose equals to INT_MAX / HZ, one may set net.sctp.max_autoclose to UINT_MAX. There is code in sctp_association_init() that can consequently trigger overflow. Cc: stable@vger.kernel.org Fixes: 9f70f46bd4c7 ("sctp: properly latch and use autoclose value from sock to association") Signed-off-by: Nikolay Kuratov Acked-by: Xin Long Link: https://patch.msgid.link/20241219162114.2863827-1-kniv@yandex-team.ru Signed-off-by: Jakub Kicinski --- net/sctp/associola.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/associola.c b/net/sctp/associola.c index c45c192b7878..0b0794f164cf 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -137,7 +137,8 @@ static struct sctp_association *sctp_association_init( = 5 * asoc->rto_max; asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay; - asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ; + asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = + (unsigned long)sp->autoclose * HZ; /* Initializes the timers */ for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) -- cgit v1.2.3 From a024e377efed31ecfb39210bed562932321345b3 Mon Sep 17 00:00:00 2001 From: Antonio Pastor Date: Tue, 24 Dec 2024 20:07:20 -0500 Subject: net: llc: reset skb->transport_header 802.2+LLC+SNAP frames received by napi_complete_done with GRO and DSA have skb->transport_header set two bytes short, or pointing 2 bytes before network_header & skb->data. As snap_rcv expects transport_header to point to SNAP header (OID:PID) after LLC processing advances offset over LLC header (llc_rcv & llc_fixup_skb), code doesn't find a match and packet is dropped. Between napi_complete_done and snap_rcv, transport_header is not used until __netif_receive_skb_core, where originally it was being reset. Commit fda55eca5a33 ("net: introduce skb_transport_header_was_set()") only does so if not set, on the assumption the value was set correctly by GRO (and also on assumption that "network stacks usually reset the transport header anyway"). Afterwards it is moved forward by llc_fixup_skb. Locally generated traffic shows up at __netif_receive_skb_core with no transport_header set and is processed without issue. On a setup with GRO but no DSA, transport_header and network_header are both set to point to skb->data which is also correct. As issue is LLC specific, to avoid impacting non-LLC traffic, and to follow up on original assumption made on previous code change, llc_fixup_skb to reset the offset after skb pull. llc_fixup_skb assumes the LLC header is at skb->data, and by definition SNAP header immediately follows. Fixes: fda55eca5a33 ("net: introduce skb_transport_header_was_set()") Signed-off-by: Antonio Pastor Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241225010723.2830290-1-antonio.pastor@gmail.com Signed-off-by: Jakub Kicinski --- net/llc/llc_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c index 51bccfb00a9c..61b0159b2fbe 100644 --- a/net/llc/llc_input.c +++ b/net/llc/llc_input.c @@ -124,8 +124,8 @@ static inline int llc_fixup_skb(struct sk_buff *skb) if (unlikely(!pskb_may_pull(skb, llc_len))) return 0; - skb->transport_header += llc_len; skb_pull(skb, llc_len); + skb_reset_transport_header(skb); if (skb->protocol == htons(ETH_P_802_2)) { __be16 pdulen; s32 data_size; -- cgit v1.2.3 From cbb26f7d8451fe56ccac802c6db48d16240feebd Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sat, 21 Dec 2024 09:51:46 +0100 Subject: mptcp: fix TCP options overflow. Syzbot reported the following splat: Oops: general protection fault, probably for non-canonical address 0xdffffc0000000001: 0000 [#1] PREEMPT SMP KASAN PTI KASAN: null-ptr-deref in range [0x0000000000000008-0x000000000000000f] CPU: 1 UID: 0 PID: 5836 Comm: sshd Not tainted 6.13.0-rc3-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/25/2024 RIP: 0010:_compound_head include/linux/page-flags.h:242 [inline] RIP: 0010:put_page+0x23/0x260 include/linux/mm.h:1552 Code: 90 90 90 90 90 90 90 55 41 57 41 56 53 49 89 fe 48 bd 00 00 00 00 00 fc ff df e8 f8 5e 12 f8 49 8d 5e 08 48 89 d8 48 c1 e8 03 <80> 3c 28 00 74 08 48 89 df e8 8f c7 78 f8 48 8b 1b 48 89 de 48 83 RSP: 0000:ffffc90003916c90 EFLAGS: 00010202 RAX: 0000000000000001 RBX: 0000000000000008 RCX: ffff888030458000 RDX: 0000000000000100 RSI: 0000000000000000 RDI: 0000000000000000 RBP: dffffc0000000000 R08: ffffffff898ca81d R09: 1ffff110054414ac R10: dffffc0000000000 R11: ffffed10054414ad R12: 0000000000000007 R13: ffff88802a20a542 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f34f496e800(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f9d6ec9ec28 CR3: 000000004d260000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: skb_page_unref include/linux/skbuff_ref.h:43 [inline] __skb_frag_unref include/linux/skbuff_ref.h:56 [inline] skb_release_data+0x483/0x8a0 net/core/skbuff.c:1119 skb_release_all net/core/skbuff.c:1190 [inline] __kfree_skb+0x55/0x70 net/core/skbuff.c:1204 tcp_clean_rtx_queue net/ipv4/tcp_input.c:3436 [inline] tcp_ack+0x2442/0x6bc0 net/ipv4/tcp_input.c:4032 tcp_rcv_state_process+0x8eb/0x44e0 net/ipv4/tcp_input.c:6805 tcp_v4_do_rcv+0x77d/0xc70 net/ipv4/tcp_ipv4.c:1939 tcp_v4_rcv+0x2dc0/0x37f0 net/ipv4/tcp_ipv4.c:2351 ip_protocol_deliver_rcu+0x22e/0x440 net/ipv4/ip_input.c:205 ip_local_deliver_finish+0x341/0x5f0 net/ipv4/ip_input.c:233 NF_HOOK+0x3a4/0x450 include/linux/netfilter.h:314 NF_HOOK+0x3a4/0x450 include/linux/netfilter.h:314 __netif_receive_skb_one_core net/core/dev.c:5672 [inline] __netif_receive_skb+0x2bf/0x650 net/core/dev.c:5785 process_backlog+0x662/0x15b0 net/core/dev.c:6117 __napi_poll+0xcb/0x490 net/core/dev.c:6883 napi_poll net/core/dev.c:6952 [inline] net_rx_action+0x89b/0x1240 net/core/dev.c:7074 handle_softirqs+0x2d4/0x9b0 kernel/softirq.c:561 __do_softirq kernel/softirq.c:595 [inline] invoke_softirq kernel/softirq.c:435 [inline] __irq_exit_rcu+0xf7/0x220 kernel/softirq.c:662 irq_exit_rcu+0x9/0x30 kernel/softirq.c:678 instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1049 [inline] sysvec_apic_timer_interrupt+0x57/0xc0 arch/x86/kernel/apic/apic.c:1049 asm_sysvec_apic_timer_interrupt+0x1a/0x20 arch/x86/include/asm/idtentry.h:702 RIP: 0033:0x7f34f4519ad5 Code: 85 d2 74 0d 0f 10 02 48 8d 54 24 20 0f 11 44 24 20 64 8b 04 25 18 00 00 00 85 c0 75 27 41 b8 08 00 00 00 b8 0f 01 00 00 0f 05 <48> 3d 00 f0 ff ff 76 75 48 8b 15 24 73 0d 00 f7 d8 64 89 02 48 83 RSP: 002b:00007ffec5b32ce0 EFLAGS: 00000246 RAX: 0000000000000001 RBX: 00000000000668a0 RCX: 00007f34f4519ad5 RDX: 00007ffec5b32d00 RSI: 0000000000000004 RDI: 0000564f4bc6cae0 RBP: 0000564f4bc6b5a0 R08: 0000000000000008 R09: 0000000000000000 R10: 00007ffec5b32de8 R11: 0000000000000246 R12: 0000564f48ea8aa4 R13: 0000000000000001 R14: 0000564f48ea93e8 R15: 00007ffec5b32d68 Eric noted a probable shinfo->nr_frags corruption, which indeed occurs. The root cause is a buggy MPTCP option len computation in some circumstances: the ADD_ADDR option should be mutually exclusive with DSS since the blamed commit. Still, mptcp_established_options_add_addr() tries to set the relevant info in mptcp_out_options, if the remaining space is large enough even when DSS is present. Since the ADD_ADDR infos and the DSS share the same union fields, adding first corrupts the latter. In the worst-case scenario, such corruption increases the DSS binary layout, exceeding the computed length and possibly overwriting the skb shared info. Address the issue by enforcing mutual exclusion in mptcp_established_options_add_addr(), too. Cc: stable@vger.kernel.org Reported-by: syzbot+38a095a81f30d82884c1@syzkaller.appspotmail.com Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/538 Fixes: 1bff1e43a30e ("mptcp: optimize out option generation") Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/025d9df8cde3c9a557befc47e9bc08fbbe3476e5.1734771049.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 1603b3702e22..a62bc874bf1e 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -667,8 +667,15 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * &echo, &drop_other_suboptions)) return false; + /* + * Later on, mptcp_write_options() will enforce mutually exclusion with + * DSS, bail out if such option is set and we can't drop it. + */ if (drop_other_suboptions) remaining += opt_size; + else if (opts->suboptions & OPTION_MPTCP_DSS) + return false; + len = mptcp_add_addr_len(opts->addr.family, echo, !!opts->addr.port); if (remaining < len) return false; -- cgit v1.2.3 From 68e068cabd2c6c533ef934c2e5151609cf6ecc6d Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 1 Jan 2025 11:47:40 -0500 Subject: net: reenable NETIF_F_IPV6_CSUM offload for BIG TCP packets The blamed commit disabled hardware offoad of IPv6 packets with extension headers on devices that advertise NETIF_F_IPV6_CSUM, based on the definition of that feature in skbuff.h: * * - %NETIF_F_IPV6_CSUM * - Driver (device) is only able to checksum plain * TCP or UDP packets over IPv6. These are specifically * unencapsulated packets of the form IPv6|TCP or * IPv6|UDP where the Next Header field in the IPv6 * header is either TCP or UDP. IPv6 extension headers * are not supported with this feature. This feature * cannot be set in features for a device with * NETIF_F_HW_CSUM also set. This feature is being * DEPRECATED (see below). The change causes skb_warn_bad_offload to fire for BIG TCP packets. [ 496.310233] WARNING: CPU: 13 PID: 23472 at net/core/dev.c:3129 skb_warn_bad_offload+0xc4/0xe0 [ 496.310297] ? skb_warn_bad_offload+0xc4/0xe0 [ 496.310300] skb_checksum_help+0x129/0x1f0 [ 496.310303] skb_csum_hwoffload_help+0x150/0x1b0 [ 496.310306] validate_xmit_skb+0x159/0x270 [ 496.310309] validate_xmit_skb_list+0x41/0x70 [ 496.310312] sch_direct_xmit+0x5c/0x250 [ 496.310317] __qdisc_run+0x388/0x620 BIG TCP introduced an IPV6_TLV_JUMBO IPv6 extension header to communicate packet length, as this is an IPv6 jumbogram. But, the feature is only enabled on devices that support BIG TCP TSO. The header is only present for PF_PACKET taps like tcpdump, and not transmitted by physical devices. For this specific case of extension headers that are not transmitted, return to the situation before the blamed commit and support hardware offload. ipv6_has_hopopt_jumbo() tests not only whether this header is present, but also that it is the only extension header before a terminal (L4) header. Fixes: 04c20a9356f2 ("net: skip offload for NETIF_F_IPV6_CSUM if ipv6 header contains extension") Reported-by: syzbot Reported-by: Eric Dumazet Closes: https://lore.kernel.org/netdev/CANn89iK1hdC3Nt8KPhOtTF8vCPc1AHDCtse_BTNki1pWxAByTQ@mail.gmail.com/ Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250101164909.1331680-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- net/core/dev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 45a8c3dd4a64..faa23042df38 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3642,8 +3642,10 @@ int skb_csum_hwoffload_help(struct sk_buff *skb, if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) { if (vlan_get_protocol(skb) == htons(ETH_P_IPV6) && - skb_network_header_len(skb) != sizeof(struct ipv6hdr)) + skb_network_header_len(skb) != sizeof(struct ipv6hdr) && + !ipv6_has_hopopt_jumbo(skb)) goto sw_checksum; + switch (skb->csum_offset) { case offsetof(struct tcphdr, check): case offsetof(struct udphdr, check): -- cgit v1.2.3 From 5b0af621c3f6ef9261cf6067812f2fd9943acb4b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 31 Dec 2024 16:05:27 +0000 Subject: net: restrict SO_REUSEPORT to inet sockets After blamed commit, crypto sockets could accidentally be destroyed from RCU call back, as spotted by zyzbot [1]. Trying to acquire a mutex in RCU callback is not allowed. Restrict SO_REUSEPORT socket option to inet sockets. v1 of this patch supported TCP, UDP and SCTP sockets, but fcnal-test.sh test needed RAW and ICMP support. [1] BUG: sleeping function called from invalid context at kernel/locking/mutex.c:562 in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 24, name: ksoftirqd/1 preempt_count: 100, expected: 0 RCU nest depth: 0, expected: 0 1 lock held by ksoftirqd/1/24: #0: ffffffff8e937ba0 (rcu_callback){....}-{0:0}, at: rcu_lock_acquire include/linux/rcupdate.h:337 [inline] #0: ffffffff8e937ba0 (rcu_callback){....}-{0:0}, at: rcu_do_batch kernel/rcu/tree.c:2561 [inline] #0: ffffffff8e937ba0 (rcu_callback){....}-{0:0}, at: rcu_core+0xa37/0x17a0 kernel/rcu/tree.c:2823 Preemption disabled at: [] softirq_handle_begin kernel/softirq.c:402 [inline] [] handle_softirqs+0x128/0x9b0 kernel/softirq.c:537 CPU: 1 UID: 0 PID: 24 Comm: ksoftirqd/1 Not tainted 6.13.0-rc3-syzkaller-00174-ga024e377efed #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 __might_resched+0x5d4/0x780 kernel/sched/core.c:8758 __mutex_lock_common kernel/locking/mutex.c:562 [inline] __mutex_lock+0x131/0xee0 kernel/locking/mutex.c:735 crypto_put_default_null_skcipher+0x18/0x70 crypto/crypto_null.c:179 aead_release+0x3d/0x50 crypto/algif_aead.c:489 alg_do_release crypto/af_alg.c:118 [inline] alg_sock_destruct+0x86/0xc0 crypto/af_alg.c:502 __sk_destruct+0x58/0x5f0 net/core/sock.c:2260 rcu_do_batch kernel/rcu/tree.c:2567 [inline] rcu_core+0xaaa/0x17a0 kernel/rcu/tree.c:2823 handle_softirqs+0x2d4/0x9b0 kernel/softirq.c:561 run_ksoftirqd+0xca/0x130 kernel/softirq.c:950 smpboot_thread_fn+0x544/0xa30 kernel/smpboot.c:164 kthread+0x2f0/0x390 kernel/kthread.c:389 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Fixes: 8c7138b33e5c ("net: Unpublish sk from sk_reuseport_cb before call_rcu") Reported-by: syzbot+b3e02953598f447d4d2a@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6772f2f4.050a0220.2f3838.04cb.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Martin KaFai Lau Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241231160527.3994168-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/core/sock.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index 74729d20cd00..be84885f9290 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1295,7 +1295,10 @@ int sk_setsockopt(struct sock *sk, int level, int optname, sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); break; case SO_REUSEPORT: - sk->sk_reuseport = valbool; + if (valbool && !sk_is_inet(sk)) + ret = -EOPNOTSUPP; + else + sk->sk_reuseport = valbool; break; case SO_DONTROUTE: sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); -- cgit v1.2.3 From 77ee7a6d16b6ec07b5c3ae2b6b60a24c1afbed09 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Dec 2024 16:10:03 +0000 Subject: af_packet: fix vlan_get_tci() vs MSG_PEEK Blamed commit forgot MSG_PEEK case, allowing a crash [1] as found by syzbot. Rework vlan_get_tci() to not touch skb at all, so that it can be used from many cpus on the same skb. Add a const qualifier to skb argument. [1] skbuff: skb_under_panic: text:ffffffff8a8da482 len:32 put:14 head:ffff88807a1d5800 data:ffff88807a1d5810 tail:0x14 end:0x140 dev: ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:206 ! Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 0 UID: 0 PID: 5880 Comm: syz-executor172 Not tainted 6.13.0-rc3-syzkaller-00762-g9268abe611b0 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 RIP: 0010:skb_panic net/core/skbuff.c:206 [inline] RIP: 0010:skb_under_panic+0x14b/0x150 net/core/skbuff.c:216 Code: 0b 8d 48 c7 c6 9e 6c 26 8e 48 8b 54 24 08 8b 0c 24 44 8b 44 24 04 4d 89 e9 50 41 54 41 57 41 56 e8 3a 5a 79 f7 48 83 c4 20 90 <0f> 0b 0f 1f 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 RSP: 0018:ffffc90003baf5b8 EFLAGS: 00010286 RAX: 0000000000000087 RBX: dffffc0000000000 RCX: 8565c1eec37aa000 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: ffff88802616fb50 R08: ffffffff817f0a4c R09: 1ffff92000775e50 R10: dffffc0000000000 R11: fffff52000775e51 R12: 0000000000000140 R13: ffff88807a1d5800 R14: ffff88807a1d5810 R15: 0000000000000014 FS: 00007fa03261f6c0(0000) GS:ffff8880b8600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ffd65753000 CR3: 0000000031720000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: skb_push+0xe5/0x100 net/core/skbuff.c:2636 vlan_get_tci+0x272/0x550 net/packet/af_packet.c:565 packet_recvmsg+0x13c9/0x1ef0 net/packet/af_packet.c:3616 sock_recvmsg_nosec net/socket.c:1044 [inline] sock_recvmsg+0x22f/0x280 net/socket.c:1066 ____sys_recvmsg+0x1c6/0x480 net/socket.c:2814 ___sys_recvmsg net/socket.c:2856 [inline] do_recvmmsg+0x426/0xab0 net/socket.c:2951 __sys_recvmmsg net/socket.c:3025 [inline] __do_sys_recvmmsg net/socket.c:3048 [inline] __se_sys_recvmmsg net/socket.c:3041 [inline] __x64_sys_recvmmsg+0x199/0x250 net/socket.c:3041 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 Fixes: 79eecf631c14 ("af_packet: Handle outgoing VLAN packets without hardware offloading") Reported-by: syzbot+8400677f3fd43f37d3bc@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6772c485.050a0220.2f3838.04c6.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Chengen Du Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20241230161004.2681892-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/packet/af_packet.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 886c0dd47b66..e2e34a49e98d 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -538,10 +538,8 @@ static void *packet_current_frame(struct packet_sock *po, return packet_lookup_frame(po, rb, rb->head, status); } -static u16 vlan_get_tci(struct sk_buff *skb, struct net_device *dev) +static u16 vlan_get_tci(const struct sk_buff *skb, struct net_device *dev) { - u8 *skb_orig_data = skb->data; - int skb_orig_len = skb->len; struct vlan_hdr vhdr, *vh; unsigned int header_len; @@ -562,12 +560,8 @@ static u16 vlan_get_tci(struct sk_buff *skb, struct net_device *dev) else return 0; - skb_push(skb, skb->data - skb_mac_header(skb)); - vh = skb_header_pointer(skb, header_len, sizeof(vhdr), &vhdr); - if (skb_orig_data != skb->data) { - skb->data = skb_orig_data; - skb->len = skb_orig_len; - } + vh = skb_header_pointer(skb, skb_mac_offset(skb) + header_len, + sizeof(vhdr), &vhdr); if (unlikely(!vh)) return 0; -- cgit v1.2.3 From f91a5b8089389eb408501af2762f168c3aaa7b79 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Dec 2024 16:10:04 +0000 Subject: af_packet: fix vlan_get_protocol_dgram() vs MSG_PEEK Blamed commit forgot MSG_PEEK case, allowing a crash [1] as found by syzbot. Rework vlan_get_protocol_dgram() to not touch skb at all, so that it can be used from many cpus on the same skb. Add a const qualifier to skb argument. [1] skbuff: skb_under_panic: text:ffffffff8a8ccd05 len:29 put:14 head:ffff88807fc8e400 data:ffff88807fc8e3f4 tail:0x11 end:0x140 dev: ------------[ cut here ]------------ kernel BUG at net/core/skbuff.c:206 ! Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 1 UID: 0 PID: 5892 Comm: syz-executor883 Not tainted 6.13.0-rc4-syzkaller-00054-gd6ef8b40d075 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 RIP: 0010:skb_panic net/core/skbuff.c:206 [inline] RIP: 0010:skb_under_panic+0x14b/0x150 net/core/skbuff.c:216 Code: 0b 8d 48 c7 c6 86 d5 25 8e 48 8b 54 24 08 8b 0c 24 44 8b 44 24 04 4d 89 e9 50 41 54 41 57 41 56 e8 5a 69 79 f7 48 83 c4 20 90 <0f> 0b 0f 1f 00 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 RSP: 0018:ffffc900038d7638 EFLAGS: 00010282 RAX: 0000000000000087 RBX: dffffc0000000000 RCX: 609ffd18ea660600 RDX: 0000000000000000 RSI: 0000000080000000 RDI: 0000000000000000 RBP: ffff88802483c8d0 R08: ffffffff817f0a8c R09: 1ffff9200071ae60 R10: dffffc0000000000 R11: fffff5200071ae61 R12: 0000000000000140 R13: ffff88807fc8e400 R14: ffff88807fc8e3f4 R15: 0000000000000011 FS: 00007fbac5e006c0(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fbac5e00d58 CR3: 000000001238e000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: skb_push+0xe5/0x100 net/core/skbuff.c:2636 vlan_get_protocol_dgram+0x165/0x290 net/packet/af_packet.c:585 packet_recvmsg+0x948/0x1ef0 net/packet/af_packet.c:3552 sock_recvmsg_nosec net/socket.c:1033 [inline] sock_recvmsg+0x22f/0x280 net/socket.c:1055 ____sys_recvmsg+0x1c6/0x480 net/socket.c:2803 ___sys_recvmsg net/socket.c:2845 [inline] do_recvmmsg+0x426/0xab0 net/socket.c:2940 __sys_recvmmsg net/socket.c:3014 [inline] __do_sys_recvmmsg net/socket.c:3037 [inline] __se_sys_recvmmsg net/socket.c:3030 [inline] __x64_sys_recvmmsg+0x199/0x250 net/socket.c:3030 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: 79eecf631c14 ("af_packet: Handle outgoing VLAN packets without hardware offloading") Reported-by: syzbot+74f70bb1cb968bf09e4f@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6772c485.050a0220.2f3838.04c5.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Chengen Du Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20241230161004.2681892-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/if_vlan.h | 16 +++++++++++++--- net/packet/af_packet.c | 16 ++++------------ 2 files changed, 17 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index c1645c86eed9..d65b5d71b93b 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -585,13 +585,16 @@ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) * vlan_get_protocol - get protocol EtherType. * @skb: skbuff to query * @type: first vlan protocol + * @mac_offset: MAC offset * @depth: buffer to store length of eth and vlan tags in bytes * * Returns the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ -static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, - int *depth) +static inline __be16 __vlan_get_protocol_offset(const struct sk_buff *skb, + __be16 type, + int mac_offset, + int *depth) { unsigned int vlan_depth = skb->mac_len, parse_depth = VLAN_MAX_DEPTH; @@ -610,7 +613,8 @@ static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, do { struct vlan_hdr vhdr, *vh; - vh = skb_header_pointer(skb, vlan_depth, sizeof(vhdr), &vhdr); + vh = skb_header_pointer(skb, mac_offset + vlan_depth, + sizeof(vhdr), &vhdr); if (unlikely(!vh || !--parse_depth)) return 0; @@ -625,6 +629,12 @@ static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, return type; } +static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, + int *depth) +{ + return __vlan_get_protocol_offset(skb, type, 0, depth); +} + /** * vlan_get_protocol - get protocol EtherType. * @skb: skbuff to query diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index e2e34a49e98d..2d73769d67f4 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -568,21 +568,13 @@ static u16 vlan_get_tci(const struct sk_buff *skb, struct net_device *dev) return ntohs(vh->h_vlan_TCI); } -static __be16 vlan_get_protocol_dgram(struct sk_buff *skb) +static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb) { __be16 proto = skb->protocol; - if (unlikely(eth_type_vlan(proto))) { - u8 *skb_orig_data = skb->data; - int skb_orig_len = skb->len; - - skb_push(skb, skb->data - skb_mac_header(skb)); - proto = __vlan_get_protocol(skb, proto, NULL); - if (skb_orig_data != skb->data) { - skb->data = skb_orig_data; - skb->len = skb_orig_len; - } - } + if (unlikely(eth_type_vlan(proto))) + proto = __vlan_get_protocol_offset(skb, proto, + skb_mac_offset(skb), NULL); return proto; } -- cgit v1.2.3 From 260466b576bca0081a7d4acecc8e93687aa22d0e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Dec 2024 16:28:49 +0000 Subject: ila: serialize calls to nf_register_net_hooks() syzbot found a race in ila_add_mapping() [1] commit 031ae72825ce ("ila: call nf_unregister_net_hooks() sooner") attempted to fix a similar issue. Looking at the syzbot repro, we have concurrent ILA_CMD_ADD commands. Add a mutex to make sure at most one thread is calling nf_register_net_hooks(). [1] BUG: KASAN: slab-use-after-free in rht_key_hashfn include/linux/rhashtable.h:159 [inline] BUG: KASAN: slab-use-after-free in __rhashtable_lookup.constprop.0+0x426/0x550 include/linux/rhashtable.h:604 Read of size 4 at addr ffff888028f40008 by task dhcpcd/5501 CPU: 1 UID: 0 PID: 5501 Comm: dhcpcd Not tainted 6.13.0-rc4-syzkaller-00054-gd6ef8b40d075 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x116/0x1f0 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xc3/0x620 mm/kasan/report.c:489 kasan_report+0xd9/0x110 mm/kasan/report.c:602 rht_key_hashfn include/linux/rhashtable.h:159 [inline] __rhashtable_lookup.constprop.0+0x426/0x550 include/linux/rhashtable.h:604 rhashtable_lookup include/linux/rhashtable.h:646 [inline] rhashtable_lookup_fast include/linux/rhashtable.h:672 [inline] ila_lookup_wildcards net/ipv6/ila/ila_xlat.c:127 [inline] ila_xlat_addr net/ipv6/ila/ila_xlat.c:652 [inline] ila_nf_input+0x1ee/0x620 net/ipv6/ila/ila_xlat.c:185 nf_hook_entry_hookfn include/linux/netfilter.h:154 [inline] nf_hook_slow+0xbb/0x200 net/netfilter/core.c:626 nf_hook.constprop.0+0x42e/0x750 include/linux/netfilter.h:269 NF_HOOK include/linux/netfilter.h:312 [inline] ipv6_rcv+0xa4/0x680 net/ipv6/ip6_input.c:309 __netif_receive_skb_one_core+0x12e/0x1e0 net/core/dev.c:5672 __netif_receive_skb+0x1d/0x160 net/core/dev.c:5785 process_backlog+0x443/0x15f0 net/core/dev.c:6117 __napi_poll.constprop.0+0xb7/0x550 net/core/dev.c:6883 napi_poll net/core/dev.c:6952 [inline] net_rx_action+0xa94/0x1010 net/core/dev.c:7074 handle_softirqs+0x213/0x8f0 kernel/softirq.c:561 __do_softirq kernel/softirq.c:595 [inline] invoke_softirq kernel/softirq.c:435 [inline] __irq_exit_rcu+0x109/0x170 kernel/softirq.c:662 irq_exit_rcu+0x9/0x30 kernel/softirq.c:678 instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1049 [inline] sysvec_apic_timer_interrupt+0xa4/0xc0 arch/x86/kernel/apic/apic.c:1049 Fixes: 7f00feaf1076 ("ila: Add generic ILA translation facility") Reported-by: syzbot+47e761d22ecf745f72b9@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6772c9ae.050a0220.2f3838.04c7.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Florian Westphal Cc: Tom Herbert Link: https://patch.msgid.link/20241230162849.2795486-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv6/ila/ila_xlat.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index 7646e401c630..1d41b2ab4884 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -195,6 +195,8 @@ static const struct nf_hook_ops ila_nf_hook_ops[] = { }, }; +static DEFINE_MUTEX(ila_mutex); + static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) { struct ila_net *ilan = net_generic(net, ila_net_id); @@ -202,16 +204,20 @@ static int ila_add_mapping(struct net *net, struct ila_xlat_params *xp) spinlock_t *lock = ila_get_lock(ilan, xp->ip.locator_match); int err = 0, order; - if (!ilan->xlat.hooks_registered) { + if (!READ_ONCE(ilan->xlat.hooks_registered)) { /* We defer registering net hooks in the namespace until the * first mapping is added. */ - err = nf_register_net_hooks(net, ila_nf_hook_ops, - ARRAY_SIZE(ila_nf_hook_ops)); + mutex_lock(&ila_mutex); + if (!ilan->xlat.hooks_registered) { + err = nf_register_net_hooks(net, ila_nf_hook_ops, + ARRAY_SIZE(ila_nf_hook_ops)); + if (!err) + WRITE_ONCE(ilan->xlat.hooks_registered, true); + } + mutex_unlock(&ila_mutex); if (err) return err; - - ilan->xlat.hooks_registered = true; } ila = kzalloc(sizeof(*ila), GFP_KERNEL); -- cgit v1.2.3 From 449e6912a2522af672e99992e1201a454910864e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 30 Dec 2024 19:12:30 +0100 Subject: mptcp: fix recvbuffer adjust on sleeping rcvmsg If the recvmsg() blocks after receiving some data - i.e. due to SO_RCVLOWAT - the MPTCP code will attempt multiple times to adjust the receive buffer size, wrongly accounting every time the cumulative of received data - instead of accounting only for the delta. Address the issue moving mptcp_rcv_space_adjust just after the data reception and passing it only the just received bytes. This also removes an unneeded difference between the TCP and MPTCP RX code path implementation. Fixes: 581302298524 ("mptcp: error out earlier on disconnect") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241230-net-mptcp-rbuf-fixes-v1-1-8608af434ceb@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 08a72242428c..27afdb7e2071 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1939,6 +1939,8 @@ do_error: goto out; } +static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied); + static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, struct msghdr *msg, size_t len, int flags, @@ -1992,6 +1994,7 @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, break; } + mptcp_rcv_space_adjust(msk, copied); return copied; } @@ -2268,7 +2271,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } pr_debug("block timeout %ld\n", timeo); - mptcp_rcv_space_adjust(msk, copied); err = sk_wait_data(sk, &timeo, NULL); if (err < 0) { err = copied ? : err; @@ -2276,8 +2278,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } } - mptcp_rcv_space_adjust(msk, copied); - out_err: if (cmsg_flags && copied >= 0) { if (cmsg_flags & MPTCP_CMSG_TS) -- cgit v1.2.3 From 551844f26da2a9f76c0a698baaffa631d1178645 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 30 Dec 2024 19:12:31 +0100 Subject: mptcp: don't always assume copied data in mptcp_cleanup_rbuf() Under some corner cases the MPTCP protocol can end-up invoking mptcp_cleanup_rbuf() when no data has been copied, but such helper assumes the opposite condition. Explicitly drop such assumption and performs the costly call only when strictly needed - before releasing the msk socket lock. Fixes: fd8976790a6c ("mptcp: be careful on MPTCP-level ack.") Cc: stable@vger.kernel.org Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241230-net-mptcp-rbuf-fixes-v1-2-8608af434ceb@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 27afdb7e2071..5307fff9d995 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -528,13 +528,13 @@ static void mptcp_send_ack(struct mptcp_sock *msk) mptcp_subflow_send_ack(mptcp_subflow_tcp_sock(subflow)); } -static void mptcp_subflow_cleanup_rbuf(struct sock *ssk) +static void mptcp_subflow_cleanup_rbuf(struct sock *ssk, int copied) { bool slow; slow = lock_sock_fast(ssk); if (tcp_can_send_ack(ssk)) - tcp_cleanup_rbuf(ssk, 1); + tcp_cleanup_rbuf(ssk, copied); unlock_sock_fast(ssk, slow); } @@ -551,7 +551,7 @@ static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty) (ICSK_ACK_PUSHED2 | ICSK_ACK_PUSHED))); } -static void mptcp_cleanup_rbuf(struct mptcp_sock *msk) +static void mptcp_cleanup_rbuf(struct mptcp_sock *msk, int copied) { int old_space = READ_ONCE(msk->old_wspace); struct mptcp_subflow_context *subflow; @@ -559,14 +559,14 @@ static void mptcp_cleanup_rbuf(struct mptcp_sock *msk) int space = __mptcp_space(sk); bool cleanup, rx_empty; - cleanup = (space > 0) && (space >= (old_space << 1)); - rx_empty = !__mptcp_rmem(sk); + cleanup = (space > 0) && (space >= (old_space << 1)) && copied; + rx_empty = !__mptcp_rmem(sk) && copied; mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (cleanup || mptcp_subflow_could_cleanup(ssk, rx_empty)) - mptcp_subflow_cleanup_rbuf(ssk); + mptcp_subflow_cleanup_rbuf(ssk, copied); } } @@ -2220,9 +2220,6 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, copied += bytes_read; - /* be sure to advertise window change */ - mptcp_cleanup_rbuf(msk); - if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk)) continue; @@ -2271,6 +2268,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } pr_debug("block timeout %ld\n", timeo); + mptcp_cleanup_rbuf(msk, copied); err = sk_wait_data(sk, &timeo, NULL); if (err < 0) { err = copied ? : err; @@ -2278,6 +2276,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } } + mptcp_cleanup_rbuf(msk, copied); + out_err: if (cmsg_flags && copied >= 0) { if (cmsg_flags & MPTCP_CMSG_TS) -- cgit v1.2.3 From 56b824eb49d6258aa0bad09a406ceac3f643cdae Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 30 Dec 2024 19:12:32 +0100 Subject: mptcp: prevent excessive coalescing on receive Currently the skb size after coalescing is only limited by the skb layout (the skb must not carry frag_list). A single coalesced skb covering several MSS can potentially fill completely the receive buffer. In such a case, the snd win will zero until the receive buffer will be empty again, affecting tput badly. Fixes: 8268ed4c9d19 ("mptcp: introduce and use mptcp_try_coalesce()") Cc: stable@vger.kernel.org # please delay 2 weeks after 6.13-final release Signed-off-by: Paolo Abeni Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241230-net-mptcp-rbuf-fixes-v1-3-8608af434ceb@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 5307fff9d995..1b2e7cbb577f 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -136,6 +136,7 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, int delta; if (MPTCP_SKB_CB(from)->offset || + ((to->len + from->len) > (sk->sk_rcvbuf >> 3)) || !skb_try_coalesce(to, from, &fragstolen, &delta)) return false; -- cgit v1.2.3 From 1e9b0e1c550c42c13c111d1a31e822057232abc4 Mon Sep 17 00:00:00 2001 From: Antonio Pastor Date: Thu, 2 Jan 2025 20:23:00 -0500 Subject: net: 802: LLC+SNAP OID:PID lookup on start of skb data 802.2+LLC+SNAP frames received by napi_complete_done() with GRO and DSA have skb->transport_header set two bytes short, or pointing 2 bytes before network_header & skb->data. This was an issue as snap_rcv() expected offset to point to SNAP header (OID:PID), causing packet to be dropped. A fix at llc_fixup_skb() (a024e377efed) resets transport_header for any LLC consumers that may care about it, and stops SNAP packets from being dropped, but doesn't fix the problem which is that LLC and SNAP should not use transport_header offset. Ths patch eliminates the use of transport_header offset for SNAP lookup of OID:PID so that SNAP does not rely on the offset at all. The offset is reset after pull for any SNAP packet consumers that may (but shouldn't) use it. Fixes: fda55eca5a33 ("net: introduce skb_transport_header_was_set()") Signed-off-by: Antonio Pastor Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250103012303.746521-1-antonio.pastor@gmail.com Signed-off-by: Jakub Kicinski --- net/802/psnap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/802/psnap.c b/net/802/psnap.c index fca9d454905f..389df460c8c4 100644 --- a/net/802/psnap.c +++ b/net/802/psnap.c @@ -55,11 +55,11 @@ static int snap_rcv(struct sk_buff *skb, struct net_device *dev, goto drop; rcu_read_lock(); - proto = find_snap_client(skb_transport_header(skb)); + proto = find_snap_client(skb->data); if (proto) { /* Pass the frame on. */ - skb->transport_header += 5; skb_pull_rcsum(skb, 5); + skb_reset_transport_header(skb); rc = proto->rcvfunc(skb, dev, &snap_packet_type, orig_dev); } rcu_read_unlock(); -- cgit v1.2.3 From a039e54397c6a75b713b9ce7894a62e06956aa92 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Jan 2025 10:45:46 +0000 Subject: net_sched: cls_flow: validate TCA_FLOW_RSHIFT attribute syzbot found that TCA_FLOW_RSHIFT attribute was not validated. Right shitfing a 32bit integer is undefined for large shift values. UBSAN: shift-out-of-bounds in net/sched/cls_flow.c:329:23 shift exponent 9445 is too large for 32-bit type 'u32' (aka 'unsigned int') CPU: 1 UID: 0 PID: 54 Comm: kworker/u8:3 Not tainted 6.13.0-rc3-syzkaller-00180-g4f619d518db9 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 Workqueue: ipv6_addrconf addrconf_dad_work Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:120 ubsan_epilogue lib/ubsan.c:231 [inline] __ubsan_handle_shift_out_of_bounds+0x3c8/0x420 lib/ubsan.c:468 flow_classify+0x24d5/0x25b0 net/sched/cls_flow.c:329 tc_classify include/net/tc_wrapper.h:197 [inline] __tcf_classify net/sched/cls_api.c:1771 [inline] tcf_classify+0x420/0x1160 net/sched/cls_api.c:1867 sfb_classify net/sched/sch_sfb.c:260 [inline] sfb_enqueue+0x3ad/0x18b0 net/sched/sch_sfb.c:318 dev_qdisc_enqueue+0x4b/0x290 net/core/dev.c:3793 __dev_xmit_skb net/core/dev.c:3889 [inline] __dev_queue_xmit+0xf0e/0x3f50 net/core/dev.c:4400 dev_queue_xmit include/linux/netdevice.h:3168 [inline] neigh_hh_output include/net/neighbour.h:523 [inline] neigh_output include/net/neighbour.h:537 [inline] ip_finish_output2+0xd41/0x1390 net/ipv4/ip_output.c:236 iptunnel_xmit+0x55d/0x9b0 net/ipv4/ip_tunnel_core.c:82 udp_tunnel_xmit_skb+0x262/0x3b0 net/ipv4/udp_tunnel_core.c:173 geneve_xmit_skb drivers/net/geneve.c:916 [inline] geneve_xmit+0x21dc/0x2d00 drivers/net/geneve.c:1039 __netdev_start_xmit include/linux/netdevice.h:5002 [inline] netdev_start_xmit include/linux/netdevice.h:5011 [inline] xmit_one net/core/dev.c:3590 [inline] dev_hard_start_xmit+0x27a/0x7d0 net/core/dev.c:3606 __dev_queue_xmit+0x1b73/0x3f50 net/core/dev.c:4434 Fixes: e5dfb815181f ("[NET_SCHED]: Add flow classifier") Reported-by: syzbot+1dbb57d994e54aaa04d2@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/6777bf49.050a0220.178762.0040.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250103104546.3714168-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sched/cls_flow.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 5502998aace7..5c2580a07530 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -356,7 +356,8 @@ static const struct nla_policy flow_policy[TCA_FLOW_MAX + 1] = { [TCA_FLOW_KEYS] = { .type = NLA_U32 }, [TCA_FLOW_MODE] = { .type = NLA_U32 }, [TCA_FLOW_BASECLASS] = { .type = NLA_U32 }, - [TCA_FLOW_RSHIFT] = { .type = NLA_U32 }, + [TCA_FLOW_RSHIFT] = NLA_POLICY_MAX(NLA_U32, + 31 /* BITS_PER_U32 - 1 */), [TCA_FLOW_ADDEND] = { .type = NLA_U32 }, [TCA_FLOW_MASK] = { .type = NLA_U32 }, [TCA_FLOW_XOR] = { .type = NLA_U32 }, -- cgit v1.2.3 From fd48f071a3d6d51e737e953bb43fe69785cf59a9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 3 Jan 2025 10:32:07 -0800 Subject: net: don't dump Tx and uninitialized NAPIs We use NAPI ID as the key for continuing dumps. We also depend on the NAPIs being sorted by ID within the driver list. Tx NAPIs (which don't have an ID assigned) break this expectation, it's not currently possible to dump them reliably. Since Tx NAPIs are relatively rare, and can't be used in doit (GET or SET) hide them from the dump API as well. Fixes: 27f91aaf49b3 ("netdev-genl: Add netlink framework functions for napi") Signed-off-by: Jakub Kicinski Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250103183207.1216004-1-kuba@kernel.org Signed-off-by: Paolo Abeni --- net/core/netdev-genl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index b0772d135efb..125b660004d3 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -176,8 +176,7 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, if (!hdr) return -EMSGSIZE; - if (napi->napi_id >= MIN_NAPI_ID && - nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id)) + if (nla_put_u32(rsp, NETDEV_A_NAPI_ID, napi->napi_id)) goto nla_put_failure; if (nla_put_u32(rsp, NETDEV_A_NAPI_IFINDEX, napi->dev->ifindex)) @@ -272,6 +271,8 @@ netdev_nl_napi_dump_one(struct net_device *netdev, struct sk_buff *rsp, return err; list_for_each_entry(napi, &netdev->napi_list, dev_list) { + if (napi->napi_id < MIN_NAPI_ID) + continue; if (ctx->napi_id && napi->napi_id >= ctx->napi_id) continue; -- cgit v1.2.3 From b341ca51d2679829d26a3f6a4aa9aee9abd94f92 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Sat, 4 Jan 2025 10:29:45 -0500 Subject: tls: Fix tls_sw_sendmsg error handling We've noticed that NFS can hang when using RPC over TLS on an unstable connection, and investigation shows that the RPC layer is stuck in a tight loop attempting to transmit, but forever getting -EBADMSG back from the underlying network. The loop begins when tcp_sendmsg_locked() returns -EPIPE to tls_tx_records(), but that error is converted to -EBADMSG when calling the socket's error reporting handler. Instead of converting errors from tcp_sendmsg_locked(), let's pass them along in this path. The RPC layer handles -EPIPE by reconnecting the transport, which prevents the endless attempts to transmit on a broken connection. Signed-off-by: Benjamin Coddington Fixes: a42055e8d2c3 ("net/tls: Add support for async encryption of records for performance") Link: https://patch.msgid.link/9594185559881679d81f071b181a10eb07cd079f.1736004079.git.bcodding@redhat.com Signed-off-by: Jakub Kicinski --- net/tls/tls_sw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index bbf26cc4f6ee..7bcc9b4408a2 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -458,7 +458,7 @@ int tls_tx_records(struct sock *sk, int flags) tx_err: if (rc < 0 && rc != -EAGAIN) - tls_err_abort(sk, -EBADMSG); + tls_err_abort(sk, rc); return rc; } -- cgit v1.2.3 From cb358ff94154774d031159b018adf45e17673941 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 6 Jan 2025 16:19:11 +0900 Subject: ipvlan: Fix use-after-free in ipvlan_get_iflink(). syzbot presented an use-after-free report [0] regarding ipvlan and linkwatch. ipvlan does not hold a refcnt of the lower device unlike vlan and macvlan. If the linkwatch work is triggered for the ipvlan dev, the lower dev might have already been freed, resulting in UAF of ipvlan->phy_dev in ipvlan_get_iflink(). We can delay the lower dev unregistration like vlan and macvlan by holding the lower dev's refcnt in dev->netdev_ops->ndo_init() and releasing it in dev->priv_destructor(). Jakub pointed out calling .ndo_XXX after unregister_netdevice() has returned is error prone and suggested [1] addressing this UAF in the core by taking commit 750e51603395 ("net: avoid potential UAF in default_operstate()") further. Let's assume unregistering devices DOWN and use RCU protection in default_operstate() not to race with the device unregistration. [0]: BUG: KASAN: slab-use-after-free in ipvlan_get_iflink+0x84/0x88 drivers/net/ipvlan/ipvlan_main.c:353 Read of size 4 at addr ffff0000d768c0e0 by task kworker/u8:35/6944 CPU: 0 UID: 0 PID: 6944 Comm: kworker/u8:35 Not tainted 6.13.0-rc2-g9bc5c9515b48 #12 4c3cb9e8b4565456f6a355f312ff91f4f29b3c47 Hardware name: linux,dummy-virt (DT) Workqueue: events_unbound linkwatch_event Call trace: show_stack+0x38/0x50 arch/arm64/kernel/stacktrace.c:484 (C) __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0xbc/0x108 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0x16c/0x6f0 mm/kasan/report.c:489 kasan_report+0xc0/0x120 mm/kasan/report.c:602 __asan_report_load4_noabort+0x20/0x30 mm/kasan/report_generic.c:380 ipvlan_get_iflink+0x84/0x88 drivers/net/ipvlan/ipvlan_main.c:353 dev_get_iflink+0x7c/0xd8 net/core/dev.c:674 default_operstate net/core/link_watch.c:45 [inline] rfc2863_policy+0x144/0x360 net/core/link_watch.c:72 linkwatch_do_dev+0x60/0x228 net/core/link_watch.c:175 __linkwatch_run_queue+0x2f4/0x5b8 net/core/link_watch.c:239 linkwatch_event+0x64/0xa8 net/core/link_watch.c:282 process_one_work+0x700/0x1398 kernel/workqueue.c:3229 process_scheduled_works kernel/workqueue.c:3310 [inline] worker_thread+0x8c4/0xe10 kernel/workqueue.c:3391 kthread+0x2b0/0x360 kernel/kthread.c:389 ret_from_fork+0x10/0x20 arch/arm64/kernel/entry.S:862 Allocated by task 9303: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x30/0x68 mm/kasan/common.c:68 kasan_save_alloc_info+0x44/0x58 mm/kasan/generic.c:568 poison_kmalloc_redzone mm/kasan/common.c:377 [inline] __kasan_kmalloc+0x84/0xa0 mm/kasan/common.c:394 kasan_kmalloc include/linux/kasan.h:260 [inline] __do_kmalloc_node mm/slub.c:4283 [inline] __kmalloc_node_noprof+0x2a0/0x560 mm/slub.c:4289 __kvmalloc_node_noprof+0x9c/0x230 mm/util.c:650 alloc_netdev_mqs+0xb4/0x1118 net/core/dev.c:11209 rtnl_create_link+0x2b8/0xb60 net/core/rtnetlink.c:3595 rtnl_newlink_create+0x19c/0x868 net/core/rtnetlink.c:3771 __rtnl_newlink net/core/rtnetlink.c:3896 [inline] rtnl_newlink+0x122c/0x15c0 net/core/rtnetlink.c:4011 rtnetlink_rcv_msg+0x61c/0x918 net/core/rtnetlink.c:6901 netlink_rcv_skb+0x1dc/0x398 net/netlink/af_netlink.c:2542 rtnetlink_rcv+0x34/0x50 net/core/rtnetlink.c:6928 netlink_unicast_kernel net/netlink/af_netlink.c:1321 [inline] netlink_unicast+0x618/0x838 net/netlink/af_netlink.c:1347 netlink_sendmsg+0x5fc/0x8b0 net/netlink/af_netlink.c:1891 sock_sendmsg_nosec net/socket.c:711 [inline] __sock_sendmsg net/socket.c:726 [inline] __sys_sendto+0x2ec/0x438 net/socket.c:2197 __do_sys_sendto net/socket.c:2204 [inline] __se_sys_sendto net/socket.c:2200 [inline] __arm64_sys_sendto+0xe4/0x110 net/socket.c:2200 __invoke_syscall arch/arm64/kernel/syscall.c:35 [inline] invoke_syscall+0x90/0x278 arch/arm64/kernel/syscall.c:49 el0_svc_common+0x13c/0x250 arch/arm64/kernel/syscall.c:132 do_el0_svc+0x54/0x70 arch/arm64/kernel/syscall.c:151 el0_svc+0x4c/0xa8 arch/arm64/kernel/entry-common.c:744 el0t_64_sync_handler+0x78/0x108 arch/arm64/kernel/entry-common.c:762 el0t_64_sync+0x198/0x1a0 arch/arm64/kernel/entry.S:600 Freed by task 10200: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x30/0x68 mm/kasan/common.c:68 kasan_save_free_info+0x58/0x70 mm/kasan/generic.c:582 poison_slab_object mm/kasan/common.c:247 [inline] __kasan_slab_free+0x48/0x68 mm/kasan/common.c:264 kasan_slab_free include/linux/kasan.h:233 [inline] slab_free_hook mm/slub.c:2338 [inline] slab_free mm/slub.c:4598 [inline] kfree+0x140/0x420 mm/slub.c:4746 kvfree+0x4c/0x68 mm/util.c:693 netdev_release+0x94/0xc8 net/core/net-sysfs.c:2034 device_release+0x98/0x1c0 kobject_cleanup lib/kobject.c:689 [inline] kobject_release lib/kobject.c:720 [inline] kref_put include/linux/kref.h:65 [inline] kobject_put+0x2b0/0x438 lib/kobject.c:737 netdev_run_todo+0xdd8/0xf48 net/core/dev.c:10924 rtnl_unlock net/core/rtnetlink.c:152 [inline] rtnl_net_unlock net/core/rtnetlink.c:209 [inline] rtnl_dellink+0x484/0x680 net/core/rtnetlink.c:3526 rtnetlink_rcv_msg+0x61c/0x918 net/core/rtnetlink.c:6901 netlink_rcv_skb+0x1dc/0x398 net/netlink/af_netlink.c:2542 rtnetlink_rcv+0x34/0x50 net/core/rtnetlink.c:6928 netlink_unicast_kernel net/netlink/af_netlink.c:1321 [inline] netlink_unicast+0x618/0x838 net/netlink/af_netlink.c:1347 netlink_sendmsg+0x5fc/0x8b0 net/netlink/af_netlink.c:1891 sock_sendmsg_nosec net/socket.c:711 [inline] __sock_sendmsg net/socket.c:726 [inline] ____sys_sendmsg+0x410/0x708 net/socket.c:2583 ___sys_sendmsg+0x178/0x1d8 net/socket.c:2637 __sys_sendmsg net/socket.c:2669 [inline] __do_sys_sendmsg net/socket.c:2674 [inline] __se_sys_sendmsg net/socket.c:2672 [inline] __arm64_sys_sendmsg+0x12c/0x1c8 net/socket.c:2672 __invoke_syscall arch/arm64/kernel/syscall.c:35 [inline] invoke_syscall+0x90/0x278 arch/arm64/kernel/syscall.c:49 el0_svc_common+0x13c/0x250 arch/arm64/kernel/syscall.c:132 do_el0_svc+0x54/0x70 arch/arm64/kernel/syscall.c:151 el0_svc+0x4c/0xa8 arch/arm64/kernel/entry-common.c:744 el0t_64_sync_handler+0x78/0x108 arch/arm64/kernel/entry-common.c:762 el0t_64_sync+0x198/0x1a0 arch/arm64/kernel/entry.S:600 The buggy address belongs to the object at ffff0000d768c000 which belongs to the cache kmalloc-cg-4k of size 4096 The buggy address is located 224 bytes inside of freed 4096-byte region [ffff0000d768c000, ffff0000d768d000) The buggy address belongs to the physical page: page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x117688 head: order:3 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 memcg:ffff0000c77ef981 flags: 0xbfffe0000000040(head|node=0|zone=2|lastcpupid=0x1ffff) page_type: f5(slab) raw: 0bfffe0000000040 ffff0000c000f500 dead000000000100 dead000000000122 raw: 0000000000000000 0000000000040004 00000001f5000000 ffff0000c77ef981 head: 0bfffe0000000040 ffff0000c000f500 dead000000000100 dead000000000122 head: 0000000000000000 0000000000040004 00000001f5000000 ffff0000c77ef981 head: 0bfffe0000000003 fffffdffc35da201 ffffffffffffffff 0000000000000000 head: 0000000000000008 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff0000d768bf80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff0000d768c000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff0000d768c080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff0000d768c100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff0000d768c180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Fixes: 8c55facecd7a ("net: linkwatch: only report IF_OPER_LOWERLAYERDOWN if iflink is actually down") Reported-by: syzkaller Suggested-by: Jakub Kicinski Link: https://lore.kernel.org/netdev/20250102174400.085fd8ac@kernel.org/ [1] Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250106071911.64355-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/core/link_watch.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 1b4d39e38084..cb04ef2b9807 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -42,14 +42,18 @@ static unsigned int default_operstate(const struct net_device *dev) * first check whether lower is indeed the source of its down state. */ if (!netif_carrier_ok(dev)) { - int iflink = dev_get_iflink(dev); struct net_device *peer; + int iflink; /* If called from netdev_run_todo()/linkwatch_sync_dev(), * dev_net(dev) can be already freed, and RTNL is not held. */ - if (dev->reg_state == NETREG_UNREGISTERED || - iflink == dev->ifindex) + if (dev->reg_state <= NETREG_REGISTERED) + iflink = dev_get_iflink(dev); + else + iflink = dev->ifindex; + + if (iflink == dev->ifindex) return IF_OPER_DOWN; ASSERT_RTNL(); -- cgit v1.2.3 From c2994b008492db033d40bd767be1620229a3035e Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 25 Nov 2024 15:42:09 -0500 Subject: Bluetooth: hci_sync: Fix not setting Random Address when required This fixes errors such as the following when Own address type is set to Random Address but it has not been programmed yet due to either be advertising or connecting: < HCI Command: LE Set Exte.. (0x08|0x0041) plen 13 Own address type: Random (0x03) Filter policy: Ignore not in accept list (0x01) PHYs: 0x05 Entry 0: LE 1M Type: Passive (0x00) Interval: 60.000 msec (0x0060) Window: 30.000 msec (0x0030) Entry 1: LE Coded Type: Passive (0x00) Interval: 180.000 msec (0x0120) Window: 90.000 msec (0x0090) > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Scan Parameters (0x08|0x0041) ncmd 1 Status: Success (0x00) < HCI Command: LE Set Exten.. (0x08|0x0042) plen 6 Extended scan: Enabled (0x01) Filter duplicates: Enabled (0x01) Duration: 0 msec (0x0000) Period: 0.00 sec (0x0000) > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Scan Enable (0x08|0x0042) ncmd 1 Status: Invalid HCI Command Parameters (0x12) Fixes: c45074d68a9b ("Bluetooth: Fix not generating RPA when required") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_sync.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index c86f4e42e69c..7b2b04d6b856 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -1031,9 +1031,9 @@ static bool adv_use_rpa(struct hci_dev *hdev, uint32_t flags) static int hci_set_random_addr_sync(struct hci_dev *hdev, bdaddr_t *rpa) { - /* If we're advertising or initiating an LE connection we can't - * go ahead and change the random address at this time. This is - * because the eventual initiator address used for the + /* If a random_addr has been set we're advertising or initiating an LE + * connection we can't go ahead and change the random address at this + * time. This is because the eventual initiator address used for the * subsequently created connection will be undefined (some * controllers use the new address and others the one we had * when the operation started). @@ -1041,8 +1041,9 @@ static int hci_set_random_addr_sync(struct hci_dev *hdev, bdaddr_t *rpa) * In this kind of scenario skip the update and let the random * address be updated at the next cycle. */ - if (hci_dev_test_flag(hdev, HCI_LE_ADV) || - hci_lookup_le_connect(hdev)) { + if (bacmp(&hdev->random_addr, BDADDR_ANY) && + (hci_dev_test_flag(hdev, HCI_LE_ADV) || + hci_lookup_le_connect(hdev))) { bt_dev_dbg(hdev, "Deferring random address update"); hci_dev_set_flag(hdev, HCI_RPA_EXPIRED); return 0; -- cgit v1.2.3 From a182d9c84f9c52fb5db895ecceeee8b3a1bf661e Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 25 Nov 2024 15:42:10 -0500 Subject: Bluetooth: MGMT: Fix Add Device to responding before completing Add Device with LE type requires updating resolving/accept list which requires quite a number of commands to complete and each of them may fail, so instead of pretending it would always work this checks the return of hci_update_passive_scan_sync which indicates if everything worked as intended. Fixes: e8907f76544f ("Bluetooth: hci_sync: Make use of hci_cmd_sync_queue set 3") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index b31192d473d0..de47ad999d7b 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -7655,6 +7655,24 @@ static void device_added(struct sock *sk, struct hci_dev *hdev, mgmt_event(MGMT_EV_DEVICE_ADDED, hdev, &ev, sizeof(ev), sk); } +static void add_device_complete(struct hci_dev *hdev, void *data, int err) +{ + struct mgmt_pending_cmd *cmd = data; + struct mgmt_cp_add_device *cp = cmd->param; + + if (!err) { + device_added(cmd->sk, hdev, &cp->addr.bdaddr, cp->addr.type, + cp->action); + device_flags_changed(NULL, hdev, &cp->addr.bdaddr, + cp->addr.type, hdev->conn_flags, + PTR_UINT(cmd->user_data)); + } + + mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_ADD_DEVICE, + mgmt_status(err), &cp->addr, sizeof(cp->addr)); + mgmt_pending_free(cmd); +} + static int add_device_sync(struct hci_dev *hdev, void *data) { return hci_update_passive_scan_sync(hdev); @@ -7663,6 +7681,7 @@ static int add_device_sync(struct hci_dev *hdev, void *data) static int add_device(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { + struct mgmt_pending_cmd *cmd; struct mgmt_cp_add_device *cp = data; u8 auto_conn, addr_type; struct hci_conn_params *params; @@ -7743,9 +7762,24 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, current_flags = params->flags; } - err = hci_cmd_sync_queue(hdev, add_device_sync, NULL, NULL); - if (err < 0) + cmd = mgmt_pending_new(sk, MGMT_OP_ADD_DEVICE, hdev, data, len); + if (!cmd) { + err = -ENOMEM; goto unlock; + } + + cmd->user_data = UINT_PTR(current_flags); + + err = hci_cmd_sync_queue(hdev, add_device_sync, cmd, + add_device_complete); + if (err < 0) { + err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE, + MGMT_STATUS_FAILED, &cp->addr, + sizeof(cp->addr)); + mgmt_pending_free(cmd); + } + + goto unlock; added: device_added(sk, hdev, &cp->addr.bdaddr, cp->addr.type, cp->action); -- cgit v1.2.3 From 67dba2c28fe0af7e25ea1aeade677162ed05310a Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Wed, 8 Jan 2025 17:50:28 +0800 Subject: Bluetooth: btmtk: Fix failed to send func ctrl for MediaTek devices. Use usb_autopm_get_interface() and usb_autopm_put_interface() in btmtk_usb_shutdown(), it could send func ctrl after enabling autosuspend. Bluetooth: btmtk_usb_hci_wmt_sync() hci0: Execution of wmt command timed out Bluetooth: btmtk_usb_shutdown() hci0: Failed to send wmt func ctrl (-110) Fixes: 5c5e8c52e3ca ("Bluetooth: btmtk: move btusb_mtk_[setup, shutdown] to btmtk.c") Signed-off-by: Chris Lu Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btmtk.c | 7 +++++++ net/bluetooth/rfcomm/tty.c | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/drivers/bluetooth/btmtk.c b/drivers/bluetooth/btmtk.c index 7fd9d5ddce02..224eafc27dbe 100644 --- a/drivers/bluetooth/btmtk.c +++ b/drivers/bluetooth/btmtk.c @@ -1472,10 +1472,15 @@ EXPORT_SYMBOL_GPL(btmtk_usb_setup); int btmtk_usb_shutdown(struct hci_dev *hdev) { + struct btmtk_data *data = hci_get_priv(hdev); struct btmtk_hci_wmt_params wmt_params; u8 param = 0; int err; + err = usb_autopm_get_interface(data->intf); + if (err < 0) + return err; + /* Disable the device */ wmt_params.op = BTMTK_WMT_FUNC_CTRL; wmt_params.flag = 0; @@ -1486,9 +1491,11 @@ int btmtk_usb_shutdown(struct hci_dev *hdev) err = btmtk_usb_hci_wmt_sync(hdev, &wmt_params); if (err < 0) { bt_dev_err(hdev, "Failed to send wmt func ctrl (%d)", err); + usb_autopm_put_interface(data->intf); return err; } + usb_autopm_put_interface(data->intf); return 0; } EXPORT_SYMBOL_GPL(btmtk_usb_shutdown); diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index af80d599c337..21a5b5535ebc 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -201,14 +201,14 @@ static ssize_t address_show(struct device *tty_dev, struct device_attribute *attr, char *buf) { struct rfcomm_dev *dev = dev_get_drvdata(tty_dev); - return sprintf(buf, "%pMR\n", &dev->dst); + return sysfs_emit(buf, "%pMR\n", &dev->dst); } static ssize_t channel_show(struct device *tty_dev, struct device_attribute *attr, char *buf) { struct rfcomm_dev *dev = dev_get_drvdata(tty_dev); - return sprintf(buf, "%d\n", dev->channel); + return sysfs_emit(buf, "%d\n", dev->channel); } static DEVICE_ATTR_RO(address); -- cgit v1.2.3 From d1cacd74776895f6435941f86a1130e58f6dd226 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 6 Jan 2025 10:01:36 -0800 Subject: netdev: prevent accessing NAPI instances from another namespace The NAPI IDs were not fully exposed to user space prior to the netlink API, so they were never namespaced. The netlink API must ensure that at the very least NAPI instance belongs to the same netns as the owner of the genl sock. napi_by_id() can become static now, but it needs to move because of dev_get_by_napi_id(). Cc: stable@vger.kernel.org Fixes: 1287c1ae0fc2 ("netdev-genl: Support setting per-NAPI config values") Fixes: 27f91aaf49b3 ("netdev-genl: Add netlink framework functions for napi") Reviewed-by: Sridhar Samudrala Reviewed-by: Joe Damato Link: https://patch.msgid.link/20250106180137.1861472-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/dev.c | 43 ++++++++++++++++++++++++++++++------------- net/core/dev.h | 3 ++- net/core/netdev-genl.c | 6 ++---- 3 files changed, 34 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index faa23042df38..a9f62f5aeb84 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -753,6 +753,36 @@ int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, } EXPORT_SYMBOL_GPL(dev_fill_forward_path); +/* must be called under rcu_read_lock(), as we dont take a reference */ +static struct napi_struct *napi_by_id(unsigned int napi_id) +{ + unsigned int hash = napi_id % HASH_SIZE(napi_hash); + struct napi_struct *napi; + + hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) + if (napi->napi_id == napi_id) + return napi; + + return NULL; +} + +/* must be called under rcu_read_lock(), as we dont take a reference */ +struct napi_struct *netdev_napi_by_id(struct net *net, unsigned int napi_id) +{ + struct napi_struct *napi; + + napi = napi_by_id(napi_id); + if (!napi) + return NULL; + + if (WARN_ON_ONCE(!napi->dev)) + return NULL; + if (!net_eq(net, dev_net(napi->dev))) + return NULL; + + return napi; +} + /** * __dev_get_by_name - find a device by its name * @net: the applicable net namespace @@ -6293,19 +6323,6 @@ bool napi_complete_done(struct napi_struct *n, int work_done) } EXPORT_SYMBOL(napi_complete_done); -/* must be called under rcu_read_lock(), as we dont take a reference */ -struct napi_struct *napi_by_id(unsigned int napi_id) -{ - unsigned int hash = napi_id % HASH_SIZE(napi_hash); - struct napi_struct *napi; - - hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node) - if (napi->napi_id == napi_id) - return napi; - - return NULL; -} - static void skb_defer_free_flush(struct softnet_data *sd) { struct sk_buff *skb, *next; diff --git a/net/core/dev.h b/net/core/dev.h index d043dee25a68..deb5eae5749f 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -22,6 +22,8 @@ struct sd_flow_limit { extern int netdev_flow_limit_table_len; +struct napi_struct *netdev_napi_by_id(struct net *net, unsigned int napi_id); + #ifdef CONFIG_PROC_FS int __init dev_proc_init(void); #else @@ -269,7 +271,6 @@ void xdp_do_check_flushed(struct napi_struct *napi); static inline void xdp_do_check_flushed(struct napi_struct *napi) { } #endif -struct napi_struct *napi_by_id(unsigned int napi_id); void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); #define XMIT_RECURSION_LIMIT 8 diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 125b660004d3..a3bdaf075b6b 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -167,8 +167,6 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi, void *hdr; pid_t pid; - if (WARN_ON_ONCE(!napi->dev)) - return -EINVAL; if (!(napi->dev->flags & IFF_UP)) return 0; @@ -234,7 +232,7 @@ int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info) rtnl_lock(); rcu_read_lock(); - napi = napi_by_id(napi_id); + napi = netdev_napi_by_id(genl_info_net(info), napi_id); if (napi) { err = netdev_nl_napi_fill_one(rsp, napi, info); } else { @@ -355,7 +353,7 @@ int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info) rtnl_lock(); rcu_read_lock(); - napi = napi_by_id(napi_id); + napi = netdev_napi_by_id(genl_info_net(info), napi_id); if (napi) { err = netdev_nl_napi_set_config(napi, info); } else { -- cgit v1.2.3 From 80fb40baba19e25a1b6f3ecff6fc5c0171806bde Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 7 Jan 2025 11:14:39 +0100 Subject: tcp: Annotate data-race around sk->sk_mark in tcp_v4_send_reset This is a follow-up to 3c5b4d69c358 ("net: annotate data-races around sk->sk_mark"). sk->sk_mark can be read and written without holding the socket lock. IPv6 equivalent is already covered with READ_ONCE() annotation in tcp_v6_send_response(). Fixes: 3c5b4d69c358 ("net: annotate data-races around sk->sk_mark") Signed-off-by: Daniel Borkmann Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/f459d1fc44f205e13f6d8bdca2c8bfb9902ffac9.1736244569.git.daniel@iogearbox.net Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a38c8b1f44db..c26f6c4b7bb4 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -896,7 +896,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, sock_net_set(ctl_sk, net); if (sk) { ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ? - inet_twsk(sk)->tw_mark : sk->sk_mark; + inet_twsk(sk)->tw_mark : READ_ONCE(sk->sk_mark); ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ? inet_twsk(sk)->tw_priority : READ_ONCE(sk->sk_priority); transmit_time = tcp_transmit_time(sk); -- cgit v1.2.3 From 13210fc63f353fe78584048079343413a3cdf819 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 2 Jan 2025 13:01:13 +0100 Subject: netfilter: nf_tables: imbalance in flowtable binding All these cases cause imbalance between BIND and UNBIND calls: - Delete an interface from a flowtable with multiple interfaces - Add a (device to a) flowtable with --check flag - Delete a netns containing a flowtable - In an interactive nft session, create a table with owner flag and flowtable inside, then quit. Fix it by calling FLOW_BLOCK_UNBIND when unregistering hooks, then remove late FLOW_BLOCK_UNBIND call when destroying flowtable. Fixes: ff4bf2f42a40 ("netfilter: nf_tables: add nft_unregister_flowtable_hook()") Reported-by: Phil Sutter Tested-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 0b9f1e8dfe49..c4af283356e7 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -8822,6 +8822,7 @@ static void nft_unregister_flowtable_hook(struct net *net, } static void __nft_unregister_flowtable_net_hooks(struct net *net, + struct nft_flowtable *flowtable, struct list_head *hook_list, bool release_netdev) { @@ -8829,6 +8830,8 @@ static void __nft_unregister_flowtable_net_hooks(struct net *net, list_for_each_entry_safe(hook, next, hook_list, list) { nf_unregister_net_hook(net, &hook->ops); + flowtable->data.type->setup(&flowtable->data, hook->ops.dev, + FLOW_BLOCK_UNBIND); if (release_netdev) { list_del(&hook->list); kfree_rcu(hook, rcu); @@ -8837,9 +8840,10 @@ static void __nft_unregister_flowtable_net_hooks(struct net *net, } static void nft_unregister_flowtable_net_hooks(struct net *net, + struct nft_flowtable *flowtable, struct list_head *hook_list) { - __nft_unregister_flowtable_net_hooks(net, hook_list, false); + __nft_unregister_flowtable_net_hooks(net, flowtable, hook_list, false); } static int nft_register_flowtable_net_hooks(struct net *net, @@ -9481,8 +9485,6 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) flowtable->data.type->free(&flowtable->data); list_for_each_entry_safe(hook, next, &flowtable->hook_list, list) { - flowtable->data.type->setup(&flowtable->data, hook->ops.dev, - FLOW_BLOCK_UNBIND); list_del_rcu(&hook->list); kfree_rcu(hook, rcu); } @@ -10870,6 +10872,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) &nft_trans_flowtable_hooks(trans), trans->msg_type); nft_unregister_flowtable_net_hooks(net, + nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans)); } else { list_del_rcu(&nft_trans_flowtable(trans)->list); @@ -10878,6 +10881,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) NULL, trans->msg_type); nft_unregister_flowtable_net_hooks(net, + nft_trans_flowtable(trans), &nft_trans_flowtable(trans)->hook_list); } break; @@ -11140,11 +11144,13 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) case NFT_MSG_NEWFLOWTABLE: if (nft_trans_flowtable_update(trans)) { nft_unregister_flowtable_net_hooks(net, + nft_trans_flowtable(trans), &nft_trans_flowtable_hooks(trans)); } else { nft_use_dec_restore(&table->use); list_del_rcu(&nft_trans_flowtable(trans)->list); nft_unregister_flowtable_net_hooks(net, + nft_trans_flowtable(trans), &nft_trans_flowtable(trans)->hook_list); } break; @@ -11737,7 +11743,8 @@ static void __nft_release_hook(struct net *net, struct nft_table *table) list_for_each_entry(chain, &table->chains, list) __nf_tables_unregister_hook(net, table, chain, true); list_for_each_entry(flowtable, &table->flowtables, list) - __nft_unregister_flowtable_net_hooks(net, &flowtable->hook_list, + __nft_unregister_flowtable_net_hooks(net, flowtable, + &flowtable->hook_list, true); } -- cgit v1.2.3 From b541ba7d1f5a5b7b3e2e22dc9e40e18a7d6dbc13 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 8 Jan 2025 22:56:33 +0100 Subject: netfilter: conntrack: clamp maximum hashtable size to INT_MAX Use INT_MAX as maximum size for the conntrack hashtable. Otherwise, it is possible to hit WARN_ON_ONCE in __kvmalloc_node_noprof() when resizing hashtable because __GFP_NOWARN is unset. See: 0708a0afe291 ("mm: Consider __GFP_NOWARN flag for oversized kvmalloc() calls") Note: hashtable resize is only possible from init_netns. Fixes: 9cc1c73ad666 ("netfilter: conntrack: avoid integer overflow when resizing") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 9db3e2b0b1c3..456446d7af20 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2517,12 +2517,15 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls) struct hlist_nulls_head *hash; unsigned int nr_slots, i; - if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head))) + if (*sizep > (INT_MAX / sizeof(struct hlist_nulls_head))) return NULL; BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head)); nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head)); + if (nr_slots > (INT_MAX / sizeof(struct hlist_nulls_head))) + return NULL; + hash = kvcalloc(nr_slots, sizeof(struct hlist_nulls_head), GFP_KERNEL); if (hash && nulls) -- cgit v1.2.3 From 737d4d91d35b5f7fa5bb442651472277318b0bfd Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Tue, 7 Jan 2025 13:01:05 +0100 Subject: sched: sch_cake: add bounds checks to host bulk flow fairness counts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Even though we fixed a logic error in the commit cited below, syzbot still managed to trigger an underflow of the per-host bulk flow counters, leading to an out of bounds memory access. To avoid any such logic errors causing out of bounds memory accesses, this commit factors out all accesses to the per-host bulk flow counters to a series of helpers that perform bounds-checking before any increments and decrements. This also has the benefit of improving readability by moving the conditional checks for the flow mode into these helpers, instead of having them spread out throughout the code (which was the cause of the original logic error). As part of this change, the flow quantum calculation is consolidated into a helper function, which means that the dithering applied to the ost load scaling is now applied both in the DRR rotation and when a sparse flow's quantum is first initiated. The only user-visible effect of this is that the maximum packet size that can be sent while a flow stays sparse will now vary with +/- one byte in some cases. This should not make a noticeable difference in practice, and thus it's not worth complicating the code to preserve the old behaviour. Fixes: 546ea84d07e3 ("sched: sch_cake: fix bulk flow accounting logic for host fairness") Reported-by: syzbot+f63600d288bfb7057424@syzkaller.appspotmail.com Signed-off-by: Toke Høiland-Jørgensen Acked-by: Dave Taht Link: https://patch.msgid.link/20250107120105.70685-1-toke@redhat.com Signed-off-by: Jakub Kicinski --- net/sched/sch_cake.c | 140 +++++++++++++++++++++++++++------------------------ 1 file changed, 75 insertions(+), 65 deletions(-) (limited to 'net') diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 8d8b2db4653c..2c2e2a67f3b2 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -627,6 +627,63 @@ static bool cake_ddst(int flow_mode) return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST; } +static void cake_dec_srchost_bulk_flow_count(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + if (likely(cake_dsrc(flow_mode) && + q->hosts[flow->srchost].srchost_bulk_flow_count)) + q->hosts[flow->srchost].srchost_bulk_flow_count--; +} + +static void cake_inc_srchost_bulk_flow_count(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + if (likely(cake_dsrc(flow_mode) && + q->hosts[flow->srchost].srchost_bulk_flow_count < CAKE_QUEUES)) + q->hosts[flow->srchost].srchost_bulk_flow_count++; +} + +static void cake_dec_dsthost_bulk_flow_count(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + if (likely(cake_ddst(flow_mode) && + q->hosts[flow->dsthost].dsthost_bulk_flow_count)) + q->hosts[flow->dsthost].dsthost_bulk_flow_count--; +} + +static void cake_inc_dsthost_bulk_flow_count(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + if (likely(cake_ddst(flow_mode) && + q->hosts[flow->dsthost].dsthost_bulk_flow_count < CAKE_QUEUES)) + q->hosts[flow->dsthost].dsthost_bulk_flow_count++; +} + +static u16 cake_get_flow_quantum(struct cake_tin_data *q, + struct cake_flow *flow, + int flow_mode) +{ + u16 host_load = 1; + + if (cake_dsrc(flow_mode)) + host_load = max(host_load, + q->hosts[flow->srchost].srchost_bulk_flow_count); + + if (cake_ddst(flow_mode)) + host_load = max(host_load, + q->hosts[flow->dsthost].dsthost_bulk_flow_count); + + /* The get_random_u16() is a way to apply dithering to avoid + * accumulating roundoff errors + */ + return (q->flow_quantum * quantum_div[host_load] + + get_random_u16()) >> 16; +} + static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb, int flow_mode, u16 flow_override, u16 host_override) { @@ -773,10 +830,8 @@ skip_hash: allocate_dst = cake_ddst(flow_mode); if (q->flows[outer_hash + k].set == CAKE_SET_BULK) { - if (allocate_src) - q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--; - if (allocate_dst) - q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--; + cake_dec_srchost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode); + cake_dec_dsthost_bulk_flow_count(q, &q->flows[outer_hash + k], flow_mode); } found: /* reserve queue for future packets in same flow */ @@ -801,9 +856,10 @@ found: q->hosts[outer_hash + k].srchost_tag = srchost_hash; found_src: srchost_idx = outer_hash + k; - if (q->flows[reduced_hash].set == CAKE_SET_BULK) - q->hosts[srchost_idx].srchost_bulk_flow_count++; q->flows[reduced_hash].srchost = srchost_idx; + + if (q->flows[reduced_hash].set == CAKE_SET_BULK) + cake_inc_srchost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode); } if (allocate_dst) { @@ -824,9 +880,10 @@ found_src: q->hosts[outer_hash + k].dsthost_tag = dsthost_hash; found_dst: dsthost_idx = outer_hash + k; - if (q->flows[reduced_hash].set == CAKE_SET_BULK) - q->hosts[dsthost_idx].dsthost_bulk_flow_count++; q->flows[reduced_hash].dsthost = dsthost_idx; + + if (q->flows[reduced_hash].set == CAKE_SET_BULK) + cake_inc_dsthost_bulk_flow_count(q, &q->flows[reduced_hash], flow_mode); } } @@ -1839,10 +1896,6 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* flowchain */ if (!flow->set || flow->set == CAKE_SET_DECAYING) { - struct cake_host *srchost = &b->hosts[flow->srchost]; - struct cake_host *dsthost = &b->hosts[flow->dsthost]; - u16 host_load = 1; - if (!flow->set) { list_add_tail(&flow->flowchain, &b->new_flows); } else { @@ -1852,18 +1905,8 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, flow->set = CAKE_SET_SPARSE; b->sparse_flow_count++; - if (cake_dsrc(q->flow_mode)) - host_load = max(host_load, srchost->srchost_bulk_flow_count); - - if (cake_ddst(q->flow_mode)) - host_load = max(host_load, dsthost->dsthost_bulk_flow_count); - - flow->deficit = (b->flow_quantum * - quantum_div[host_load]) >> 16; + flow->deficit = cake_get_flow_quantum(b, flow, q->flow_mode); } else if (flow->set == CAKE_SET_SPARSE_WAIT) { - struct cake_host *srchost = &b->hosts[flow->srchost]; - struct cake_host *dsthost = &b->hosts[flow->dsthost]; - /* this flow was empty, accounted as a sparse flow, but actually * in the bulk rotation. */ @@ -1871,12 +1914,8 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, b->sparse_flow_count--; b->bulk_flow_count++; - if (cake_dsrc(q->flow_mode)) - srchost->srchost_bulk_flow_count++; - - if (cake_ddst(q->flow_mode)) - dsthost->dsthost_bulk_flow_count++; - + cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode); + cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode); } if (q->buffer_used > q->buffer_max_used) @@ -1933,13 +1972,11 @@ static struct sk_buff *cake_dequeue(struct Qdisc *sch) { struct cake_sched_data *q = qdisc_priv(sch); struct cake_tin_data *b = &q->tins[q->cur_tin]; - struct cake_host *srchost, *dsthost; ktime_t now = ktime_get(); struct cake_flow *flow; struct list_head *head; bool first_flow = true; struct sk_buff *skb; - u16 host_load; u64 delay; u32 len; @@ -2039,11 +2076,6 @@ retry: q->cur_flow = flow - b->flows; first_flow = false; - /* triple isolation (modified DRR++) */ - srchost = &b->hosts[flow->srchost]; - dsthost = &b->hosts[flow->dsthost]; - host_load = 1; - /* flow isolation (DRR++) */ if (flow->deficit <= 0) { /* Keep all flows with deficits out of the sparse and decaying @@ -2055,11 +2087,8 @@ retry: b->sparse_flow_count--; b->bulk_flow_count++; - if (cake_dsrc(q->flow_mode)) - srchost->srchost_bulk_flow_count++; - - if (cake_ddst(q->flow_mode)) - dsthost->dsthost_bulk_flow_count++; + cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode); + cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode); flow->set = CAKE_SET_BULK; } else { @@ -2071,19 +2100,7 @@ retry: } } - if (cake_dsrc(q->flow_mode)) - host_load = max(host_load, srchost->srchost_bulk_flow_count); - - if (cake_ddst(q->flow_mode)) - host_load = max(host_load, dsthost->dsthost_bulk_flow_count); - - WARN_ON(host_load > CAKE_QUEUES); - - /* The get_random_u16() is a way to apply dithering to avoid - * accumulating roundoff errors - */ - flow->deficit += (b->flow_quantum * quantum_div[host_load] + - get_random_u16()) >> 16; + flow->deficit += cake_get_flow_quantum(b, flow, q->flow_mode); list_move_tail(&flow->flowchain, &b->old_flows); goto retry; @@ -2107,11 +2124,8 @@ retry: if (flow->set == CAKE_SET_BULK) { b->bulk_flow_count--; - if (cake_dsrc(q->flow_mode)) - srchost->srchost_bulk_flow_count--; - - if (cake_ddst(q->flow_mode)) - dsthost->dsthost_bulk_flow_count--; + cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode); + cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode); b->decaying_flow_count++; } else if (flow->set == CAKE_SET_SPARSE || @@ -2129,12 +2143,8 @@ retry: else if (flow->set == CAKE_SET_BULK) { b->bulk_flow_count--; - if (cake_dsrc(q->flow_mode)) - srchost->srchost_bulk_flow_count--; - - if (cake_ddst(q->flow_mode)) - dsthost->dsthost_bulk_flow_count--; - + cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode); + cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode); } else b->decaying_flow_count--; -- cgit v1.2.3 From 771ec78dc8b48d562e6015bb535ed3cd37043d78 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:29 +0100 Subject: mptcp: sysctl: avail sched: remove write access 'net.mptcp.available_schedulers' sysctl knob is there to list available schedulers, not to modify this list. There are then no reasons to give write access to it. Nothing would have been written anyway, but no errors would have been returned, which is unexpected. Fixes: 73c900aa3660 ("mptcp: add net.mptcp.available_schedulers") Cc: stable@vger.kernel.org Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-1-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/ctrl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 38d8121331d4..d9b57fab2a13 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -228,7 +228,7 @@ static struct ctl_table mptcp_sysctl_table[] = { { .procname = "available_schedulers", .maxlen = MPTCP_SCHED_BUF_MAX, - .mode = 0644, + .mode = 0444, .proc_handler = proc_available_schedulers, }, { -- cgit v1.2.3 From d38e26e36206ae3d544d496513212ae931d1da0a Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:30 +0100 Subject: mptcp: sysctl: sched: avoid using current->nsproxy Using the 'net' structure via 'current' is not recommended for different reasons. First, if the goal is to use it to read or write per-netns data, this is inconsistent with how the "generic" sysctl entries are doing: directly by only using pointers set to the table entry, e.g. table->data. Linked to that, the per-netns data should always be obtained from the table linked to the netns it had been created for, which may not coincide with the reader's or writer's netns. Another reason is that access to current->nsproxy->netns can oops if attempted when current->nsproxy had been dropped when the current task is exiting. This is what syzbot found, when using acct(2): Oops: general protection fault, probably for non-canonical address 0xdffffc0000000005: 0000 [#1] PREEMPT SMP KASAN PTI KASAN: null-ptr-deref in range [0x0000000000000028-0x000000000000002f] CPU: 1 UID: 0 PID: 5924 Comm: syz-executor Not tainted 6.13.0-rc5-syzkaller-00004-gccb98ccef0e5 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 RIP: 0010:proc_scheduler+0xc6/0x3c0 net/mptcp/ctrl.c:125 Code: 03 42 80 3c 38 00 0f 85 fe 02 00 00 4d 8b a4 24 08 09 00 00 48 b8 00 00 00 00 00 fc ff df 49 8d 7c 24 28 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 cc 02 00 00 4d 8b 7c 24 28 48 8d 84 24 c8 00 00 RSP: 0018:ffffc900034774e8 EFLAGS: 00010206 RAX: dffffc0000000000 RBX: 1ffff9200068ee9e RCX: ffffc90003477620 RDX: 0000000000000005 RSI: ffffffff8b08f91e RDI: 0000000000000028 RBP: 0000000000000001 R08: ffffc90003477710 R09: 0000000000000040 R10: 0000000000000040 R11: 00000000726f7475 R12: 0000000000000000 R13: ffffc90003477620 R14: ffffc90003477710 R15: dffffc0000000000 FS: 0000000000000000(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fee3cd452d8 CR3: 000000007d116000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: proc_sys_call_handler+0x403/0x5d0 fs/proc/proc_sysctl.c:601 __kernel_write_iter+0x318/0xa80 fs/read_write.c:612 __kernel_write+0xf6/0x140 fs/read_write.c:632 do_acct_process+0xcb0/0x14a0 kernel/acct.c:539 acct_pin_kill+0x2d/0x100 kernel/acct.c:192 pin_kill+0x194/0x7c0 fs/fs_pin.c:44 mnt_pin_kill+0x61/0x1e0 fs/fs_pin.c:81 cleanup_mnt+0x3ac/0x450 fs/namespace.c:1366 task_work_run+0x14e/0x250 kernel/task_work.c:239 exit_task_work include/linux/task_work.h:43 [inline] do_exit+0xad8/0x2d70 kernel/exit.c:938 do_group_exit+0xd3/0x2a0 kernel/exit.c:1087 get_signal+0x2576/0x2610 kernel/signal.c:3017 arch_do_signal_or_restart+0x90/0x7e0 arch/x86/kernel/signal.c:337 exit_to_user_mode_loop kernel/entry/common.c:111 [inline] exit_to_user_mode_prepare include/linux/entry-common.h:329 [inline] __syscall_exit_to_user_mode_work kernel/entry/common.c:207 [inline] syscall_exit_to_user_mode+0x150/0x2a0 kernel/entry/common.c:218 do_syscall_64+0xda/0x250 arch/x86/entry/common.c:89 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fee3cb87a6a Code: Unable to access opcode bytes at 0x7fee3cb87a40. RSP: 002b:00007fffcccac688 EFLAGS: 00000202 ORIG_RAX: 0000000000000037 RAX: 0000000000000000 RBX: 00007fffcccac710 RCX: 00007fee3cb87a6a RDX: 0000000000000041 RSI: 0000000000000000 RDI: 0000000000000003 RBP: 0000000000000003 R08: 00007fffcccac6ac R09: 00007fffcccacac7 R10: 00007fffcccac710 R11: 0000000000000202 R12: 00007fee3cd49500 R13: 00007fffcccac6ac R14: 0000000000000000 R15: 00007fee3cd4b000 Modules linked in: ---[ end trace 0000000000000000 ]--- RIP: 0010:proc_scheduler+0xc6/0x3c0 net/mptcp/ctrl.c:125 Code: 03 42 80 3c 38 00 0f 85 fe 02 00 00 4d 8b a4 24 08 09 00 00 48 b8 00 00 00 00 00 fc ff df 49 8d 7c 24 28 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 cc 02 00 00 4d 8b 7c 24 28 48 8d 84 24 c8 00 00 RSP: 0018:ffffc900034774e8 EFLAGS: 00010206 RAX: dffffc0000000000 RBX: 1ffff9200068ee9e RCX: ffffc90003477620 RDX: 0000000000000005 RSI: ffffffff8b08f91e RDI: 0000000000000028 RBP: 0000000000000001 R08: ffffc90003477710 R09: 0000000000000040 R10: 0000000000000040 R11: 00000000726f7475 R12: 0000000000000000 R13: ffffc90003477620 R14: ffffc90003477710 R15: dffffc0000000000 FS: 0000000000000000(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fee3cd452d8 CR3: 000000007d116000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 ---------------- Code disassembly (best guess), 1 bytes skipped: 0: 42 80 3c 38 00 cmpb $0x0,(%rax,%r15,1) 5: 0f 85 fe 02 00 00 jne 0x309 b: 4d 8b a4 24 08 09 00 mov 0x908(%r12),%r12 12: 00 13: 48 b8 00 00 00 00 00 movabs $0xdffffc0000000000,%rax 1a: fc ff df 1d: 49 8d 7c 24 28 lea 0x28(%r12),%rdi 22: 48 89 fa mov %rdi,%rdx 25: 48 c1 ea 03 shr $0x3,%rdx * 29: 80 3c 02 00 cmpb $0x0,(%rdx,%rax,1) <-- trapping instruction 2d: 0f 85 cc 02 00 00 jne 0x2ff 33: 4d 8b 7c 24 28 mov 0x28(%r12),%r15 38: 48 rex.W 39: 8d .byte 0x8d 3a: 84 24 c8 test %ah,(%rax,%rcx,8) Here with 'net.mptcp.scheduler', the 'net' structure is not really needed, because the table->data already has a pointer to the current scheduler, the only thing needed from the per-netns data. Simply use 'data', instead of getting (most of the time) the same thing, but from a longer and indirect way. Fixes: 6963c508fd7a ("mptcp: only allow set existing scheduler for net.mptcp.scheduler") Cc: stable@vger.kernel.org Reported-by: syzbot+e364f774c6f57f2c86d1@syzkaller.appspotmail.com Closes: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com Suggested-by: Al Viro Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-2-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/ctrl.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index d9b57fab2a13..81c30aa02196 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -102,16 +102,15 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet) } #ifdef CONFIG_SYSCTL -static int mptcp_set_scheduler(const struct net *net, const char *name) +static int mptcp_set_scheduler(char *scheduler, const char *name) { - struct mptcp_pernet *pernet = mptcp_get_pernet(net); struct mptcp_sched_ops *sched; int ret = 0; rcu_read_lock(); sched = mptcp_sched_find(name); if (sched) - strscpy(pernet->scheduler, name, MPTCP_SCHED_NAME_MAX); + strscpy(scheduler, name, MPTCP_SCHED_NAME_MAX); else ret = -ENOENT; rcu_read_unlock(); @@ -122,7 +121,7 @@ static int mptcp_set_scheduler(const struct net *net, const char *name) static int proc_scheduler(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - const struct net *net = current->nsproxy->net_ns; + char (*scheduler)[MPTCP_SCHED_NAME_MAX] = ctl->data; char val[MPTCP_SCHED_NAME_MAX]; struct ctl_table tbl = { .data = val, @@ -130,11 +129,11 @@ static int proc_scheduler(const struct ctl_table *ctl, int write, }; int ret; - strscpy(val, mptcp_get_scheduler(net), MPTCP_SCHED_NAME_MAX); + strscpy(val, *scheduler, MPTCP_SCHED_NAME_MAX); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) - ret = mptcp_set_scheduler(net, val); + ret = mptcp_set_scheduler(*scheduler, val); return ret; } -- cgit v1.2.3 From 92cf7a51bdae24a32c592adcdd59a773ae149289 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:31 +0100 Subject: mptcp: sysctl: blackhole timeout: avoid using current->nsproxy As mentioned in the previous commit, using the 'net' structure via 'current' is not recommended for different reasons: - Inconsistency: getting info from the reader's/writer's netns vs only from the opener's netns. - current->nsproxy can be NULL in some cases, resulting in an 'Oops' (null-ptr-deref), e.g. when the current task is exiting, as spotted by syzbot [1] using acct(2). The 'pernet' structure can be obtained from the table->data using container_of(). Fixes: 27069e7cb3d1 ("mptcp: disable active MPTCP in case of blackhole") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com [1] Suggested-by: Al Viro Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-3-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/ctrl.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index 81c30aa02196..b0dd008e2114 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -160,7 +160,9 @@ static int proc_blackhole_detect_timeout(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct mptcp_pernet *pernet = mptcp_get_pernet(current->nsproxy->net_ns); + struct mptcp_pernet *pernet = container_of(table->data, + struct mptcp_pernet, + blackhole_timeout); int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); -- cgit v1.2.3 From ea62dd1383913b5999f3d16ae99d411f41b528d4 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:32 +0100 Subject: sctp: sysctl: cookie_hmac_alg: avoid using current->nsproxy As mentioned in a previous commit of this series, using the 'net' structure via 'current' is not recommended for different reasons: - Inconsistency: getting info from the reader's/writer's netns vs only from the opener's netns. - current->nsproxy can be NULL in some cases, resulting in an 'Oops' (null-ptr-deref), e.g. when the current task is exiting, as spotted by syzbot [1] using acct(2). The 'net' structure can be obtained from the table->data using container_of(). Note that table->data could also be used directly, as this is the only member needed from the 'net' structure, but that would increase the size of this fix, to use '*data' everywhere 'net->sctp.sctp_hmac_alg' is used. Fixes: 3c68198e7511 ("sctp: Make hmac algorithm selection for cookie generation dynamic") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com [1] Suggested-by: Al Viro Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-4-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/sctp/sysctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index e5a5af343c4c..9848d19630a4 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -387,7 +387,8 @@ static struct ctl_table sctp_net_table[] = { static int proc_sctp_do_hmac_alg(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct net *net = container_of(ctl->data, struct net, + sctp.sctp_hmac_alg); struct ctl_table tbl; bool changed = false; char *none = "none"; -- cgit v1.2.3 From 9fc17b76fc70763780aa78b38fcf4742384044a5 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:33 +0100 Subject: sctp: sysctl: rto_min/max: avoid using current->nsproxy As mentioned in a previous commit of this series, using the 'net' structure via 'current' is not recommended for different reasons: - Inconsistency: getting info from the reader's/writer's netns vs only from the opener's netns. - current->nsproxy can be NULL in some cases, resulting in an 'Oops' (null-ptr-deref), e.g. when the current task is exiting, as spotted by syzbot [1] using acct(2). The 'net' structure can be obtained from the table->data using container_of(). Note that table->data could also be used directly, as this is the only member needed from the 'net' structure, but that would increase the size of this fix, to use '*data' everywhere 'net->sctp.rto_min/max' is used. Fixes: 4f3fdf3bc59c ("sctp: add check rto_min and rto_max in sysctl") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com [1] Suggested-by: Al Viro Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-5-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/sctp/sysctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 9848d19630a4..a5285815264d 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -433,7 +433,7 @@ static int proc_sctp_do_hmac_alg(const struct ctl_table *ctl, int write, static int proc_sctp_do_rto_min(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct net *net = container_of(ctl->data, struct net, sctp.rto_min); unsigned int min = *(unsigned int *) ctl->extra1; unsigned int max = *(unsigned int *) ctl->extra2; struct ctl_table tbl; @@ -461,7 +461,7 @@ static int proc_sctp_do_rto_min(const struct ctl_table *ctl, int write, static int proc_sctp_do_rto_max(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct net *net = container_of(ctl->data, struct net, sctp.rto_max); unsigned int min = *(unsigned int *) ctl->extra1; unsigned int max = *(unsigned int *) ctl->extra2; struct ctl_table tbl; -- cgit v1.2.3 From 15649fd5415eda664ef35780c2013adeb5d9c695 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:34 +0100 Subject: sctp: sysctl: auth_enable: avoid using current->nsproxy As mentioned in a previous commit of this series, using the 'net' structure via 'current' is not recommended for different reasons: - Inconsistency: getting info from the reader's/writer's netns vs only from the opener's netns. - current->nsproxy can be NULL in some cases, resulting in an 'Oops' (null-ptr-deref), e.g. when the current task is exiting, as spotted by syzbot [1] using acct(2). The 'net' structure can be obtained from the table->data using container_of(). Note that table->data could also be used directly, but that would increase the size of this fix, while 'sctp.ctl_sock' still needs to be retrieved from 'net' structure. Fixes: b14878ccb7fa ("net: sctp: cache auth_enable per endpoint") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com [1] Suggested-by: Al Viro Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-6-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/sctp/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index a5285815264d..9d29611621fe 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -499,7 +499,7 @@ static int proc_sctp_do_alpha_beta(const struct ctl_table *ctl, int write, static int proc_sctp_do_auth(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct net *net = container_of(ctl->data, struct net, sctp.auth_enable); struct ctl_table tbl; int new_value, ret; -- cgit v1.2.3 From c10377bbc1972d858eaf0ab366a311b39f8ef1b6 Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:35 +0100 Subject: sctp: sysctl: udp_port: avoid using current->nsproxy As mentioned in a previous commit of this series, using the 'net' structure via 'current' is not recommended for different reasons: - Inconsistency: getting info from the reader's/writer's netns vs only from the opener's netns. - current->nsproxy can be NULL in some cases, resulting in an 'Oops' (null-ptr-deref), e.g. when the current task is exiting, as spotted by syzbot [1] using acct(2). The 'net' structure can be obtained from the table->data using container_of(). Note that table->data could also be used directly, but that would increase the size of this fix, while 'sctp.ctl_sock' still needs to be retrieved from 'net' structure. Fixes: 046c052b475e ("sctp: enable udp tunneling socks") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com [1] Suggested-by: Al Viro Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-7-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/sctp/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 9d29611621fe..18fa4f44e8ec 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -528,7 +528,7 @@ static int proc_sctp_do_auth(const struct ctl_table *ctl, int write, static int proc_sctp_do_udp_port(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct net *net = container_of(ctl->data, struct net, sctp.udp_port); unsigned int min = *(unsigned int *)ctl->extra1; unsigned int max = *(unsigned int *)ctl->extra2; struct ctl_table tbl; -- cgit v1.2.3 From 6259d2484d0ceff42245d1f09cc8cb6ee72d847a Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:36 +0100 Subject: sctp: sysctl: plpmtud_probe_interval: avoid using current->nsproxy As mentioned in a previous commit of this series, using the 'net' structure via 'current' is not recommended for different reasons: - Inconsistency: getting info from the reader's/writer's netns vs only from the opener's netns. - current->nsproxy can be NULL in some cases, resulting in an 'Oops' (null-ptr-deref), e.g. when the current task is exiting, as spotted by syzbot [1] using acct(2). The 'net' structure can be obtained from the table->data using container_of(). Note that table->data could also be used directly, as this is the only member needed from the 'net' structure, but that would increase the size of this fix, to use '*data' everywhere 'net->sctp.probe_interval' is used. Fixes: d1e462a7a5f3 ("sctp: add probe_interval in sysctl and sock/asoc/transport") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com [1] Suggested-by: Al Viro Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-8-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/sctp/sysctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 18fa4f44e8ec..8e1e97be4df7 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -569,7 +569,8 @@ static int proc_sctp_do_udp_port(const struct ctl_table *ctl, int write, static int proc_sctp_do_probe_interval(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct net *net = current->nsproxy->net_ns; + struct net *net = container_of(ctl->data, struct net, + sctp.probe_interval); struct ctl_table tbl; int ret, new_value; -- cgit v1.2.3 From 7f5611cbc4871c7fb1ad36c2e5a9edad63dca95c Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Wed, 8 Jan 2025 16:34:37 +0100 Subject: rds: sysctl: rds_tcp_{rcv,snd}buf: avoid using current->nsproxy As mentioned in a previous commit of this series, using the 'net' structure via 'current' is not recommended for different reasons: - Inconsistency: getting info from the reader's/writer's netns vs only from the opener's netns. - current->nsproxy can be NULL in some cases, resulting in an 'Oops' (null-ptr-deref), e.g. when the current task is exiting, as spotted by syzbot [1] using acct(2). The per-netns structure can be obtained from the table->data using container_of(), then the 'net' one can be retrieved from the listen socket (if available). Fixes: c6a58ffed536 ("RDS: TCP: Add sysctl tunables for sndbuf/rcvbuf on rds-tcp socket") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/67769ecb.050a0220.3a8527.003f.GAE@google.com [1] Suggested-by: Al Viro Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250108-net-sysctl-current-nsproxy-v1-9-5df34b2083e8@kernel.org Signed-off-by: Jakub Kicinski --- net/rds/tcp.c | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 351ac1747224..0581c53e6517 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -61,8 +61,10 @@ static atomic_t rds_tcp_unloading = ATOMIC_INIT(0); static struct kmem_cache *rds_tcp_conn_slab; -static int rds_tcp_skbuf_handler(const struct ctl_table *ctl, int write, - void *buffer, size_t *lenp, loff_t *fpos); +static int rds_tcp_sndbuf_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *fpos); +static int rds_tcp_rcvbuf_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *fpos); static int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF; static int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF; @@ -74,7 +76,7 @@ static struct ctl_table rds_tcp_sysctl_table[] = { /* data is per-net pointer */ .maxlen = sizeof(int), .mode = 0644, - .proc_handler = rds_tcp_skbuf_handler, + .proc_handler = rds_tcp_sndbuf_handler, .extra1 = &rds_tcp_min_sndbuf, }, #define RDS_TCP_RCVBUF 1 @@ -83,7 +85,7 @@ static struct ctl_table rds_tcp_sysctl_table[] = { /* data is per-net pointer */ .maxlen = sizeof(int), .mode = 0644, - .proc_handler = rds_tcp_skbuf_handler, + .proc_handler = rds_tcp_rcvbuf_handler, .extra1 = &rds_tcp_min_rcvbuf, }, }; @@ -682,10 +684,10 @@ static void rds_tcp_sysctl_reset(struct net *net) spin_unlock_irq(&rds_tcp_conn_lock); } -static int rds_tcp_skbuf_handler(const struct ctl_table *ctl, int write, +static int rds_tcp_skbuf_handler(struct rds_tcp_net *rtn, + const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *fpos) { - struct net *net = current->nsproxy->net_ns; int err; err = proc_dointvec_minmax(ctl, write, buffer, lenp, fpos); @@ -694,11 +696,34 @@ static int rds_tcp_skbuf_handler(const struct ctl_table *ctl, int write, *(int *)(ctl->extra1)); return err; } - if (write) + + if (write && rtn->rds_tcp_listen_sock && rtn->rds_tcp_listen_sock->sk) { + struct net *net = sock_net(rtn->rds_tcp_listen_sock->sk); + rds_tcp_sysctl_reset(net); + } + return 0; } +static int rds_tcp_sndbuf_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *fpos) +{ + struct rds_tcp_net *rtn = container_of(ctl->data, struct rds_tcp_net, + sndbuf_size); + + return rds_tcp_skbuf_handler(rtn, ctl, write, buffer, lenp, fpos); +} + +static int rds_tcp_rcvbuf_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *fpos) +{ + struct rds_tcp_net *rtn = container_of(ctl->data, struct rds_tcp_net, + rcvbuf_size); + + return rds_tcp_skbuf_handler(rtn, ctl, write, buffer, lenp, fpos); +} + static void rds_tcp_exit(void) { rds_tcp_set_unloading(); -- cgit v1.2.3