From 54f89b3178d5448dd4457afbb98fc1ab99090a65 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 10 Dec 2024 01:20:38 +0000 Subject: tcp_bpf: Charge receive socket buffer in bpf_tcp_ingress() When bpf_tcp_ingress() is called, the skmsg is being redirected to the ingress of the destination socket. Therefore, we should charge its receive socket buffer, instead of sending socket buffer. Because sk_rmem_schedule() tests pfmemalloc of skb, we need to introduce a wrapper and call it for skmsg. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241210012039.1669389-2-zijianzhang@bytedance.com --- net/ipv4/tcp_bpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 99cef92e6290..b21ea634909c 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -49,7 +49,7 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, sge = sk_msg_elem(msg, i); size = (apply && apply_bytes < sge->length) ? apply_bytes : sge->length; - if (!sk_wmem_schedule(sk, size)) { + if (!__sk_rmem_schedule(sk, size, false)) { if (!copied) ret = -ENOMEM; break; -- cgit v1.2.3 From d888b7af7c149c115dd6ac772cc11c375da3e17c Mon Sep 17 00:00:00 2001 From: Zijian Zhang Date: Tue, 10 Dec 2024 01:20:39 +0000 Subject: tcp_bpf: Add sk_rmem_alloc related logic for tcp_bpf ingress redirection When we do sk_psock_verdict_apply->sk_psock_skb_ingress, an sk_msg will be created out of the skb, and the rmem accounting of the sk_msg will be handled by the skb. For skmsgs in __SK_REDIRECT case of tcp_bpf_send_verdict, when redirecting to the ingress of a socket, although we sk_rmem_schedule and add sk_msg to the ingress_msg of sk_redir, we do not update sk_rmem_alloc. As a result, except for the global memory limit, the rmem of sk_redir is nearly unlimited. Thus, add sk_rmem_alloc related logic to limit the recv buffer. Since the function sk_msg_recvmsg and __sk_psock_purge_ingress_msg are used in these two paths. We use "msg->skb" to test whether the sk_msg is skb backed up. If it's not, we shall do the memory accounting explicitly. Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: Zijian Zhang Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241210012039.1669389-3-zijianzhang@bytedance.com --- include/linux/skmsg.h | 11 ++++++++--- net/core/skmsg.c | 6 +++++- net/ipv4/tcp_bpf.c | 4 +++- 3 files changed, 16 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index d9b03e0746e7..2cbe0c22a32f 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -317,17 +317,22 @@ static inline void sock_drop(struct sock *sk, struct sk_buff *skb) kfree_skb(skb); } -static inline void sk_psock_queue_msg(struct sk_psock *psock, +static inline bool sk_psock_queue_msg(struct sk_psock *psock, struct sk_msg *msg) { + bool ret; + spin_lock_bh(&psock->ingress_lock); - if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { list_add_tail(&msg->list, &psock->ingress_msg); - else { + ret = true; + } else { sk_msg_free(psock->sk, msg); kfree(msg); + ret = false; } spin_unlock_bh(&psock->ingress_lock); + return ret; } static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index e90fbab703b2..8ad7e6755fd6 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -445,8 +445,10 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, if (likely(!peek)) { sge->offset += copy; sge->length -= copy; - if (!msg_rx->skb) + if (!msg_rx->skb) { sk_mem_uncharge(sk, copy); + atomic_sub(copy, &sk->sk_rmem_alloc); + } msg_rx->sg.size -= copy; if (!sge->length) { @@ -772,6 +774,8 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) { list_del(&msg->list); + if (!msg->skb) + atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc); sk_msg_free(psock->sk, msg); kfree(msg); } diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index b21ea634909c..392678ae80f4 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -56,6 +56,7 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, } sk_mem_charge(sk, size); + atomic_add(size, &sk->sk_rmem_alloc); sk_msg_xfer(tmp, msg, i, size); copied += size; if (sge->length) @@ -74,7 +75,8 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, if (!ret) { msg->sg.start = i; - sk_psock_queue_msg(psock, tmp); + if (!sk_psock_queue_msg(psock, tmp)) + atomic_sub(copied, &sk->sk_rmem_alloc); sk_psock_data_ready(sk, psock); } else { sk_msg_free(sk, tmp); -- cgit v1.2.3 From fdf478d236dcf0f1f68534df5d456ced625195bd Mon Sep 17 00:00:00 2001 From: Levi Zim Date: Sat, 30 Nov 2024 21:38:22 +0800 Subject: skmsg: Return copied bytes in sk_msg_memcopy_from_iter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously sk_msg_memcopy_from_iter returns the copied bytes from the last copy_from_iter{,_nocache} call upon success. This commit changes it to return the total number of copied bytes on success. Signed-off-by: Levi Zim Signed-off-by: Daniel Borkmann Tested-by: Björn Töpel Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241130-tcp-bpf-sendmsg-v1-1-bae583d014f3@outlook.com --- net/core/skmsg.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 8ad7e6755fd6..61f3f3d4e528 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -369,8 +369,8 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, struct sk_msg *msg, u32 bytes) { int ret = -ENOSPC, i = msg->sg.curr; + u32 copy, buf_size, copied = 0; struct scatterlist *sge; - u32 copy, buf_size; void *to; do { @@ -397,6 +397,7 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, goto out; } bytes -= copy; + copied += copy; if (!bytes) break; msg->sg.copybreak = 0; @@ -404,7 +405,7 @@ int sk_msg_memcopy_from_iter(struct sock *sk, struct iov_iter *from, } while (i != msg->sg.end); out: msg->sg.curr = i; - return ret; + return (ret < 0) ? ret : copied; } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); -- cgit v1.2.3 From 5153a75ef34b3f7478ca918044d0f05eed8fb3f9 Mon Sep 17 00:00:00 2001 From: Levi Zim Date: Sat, 30 Nov 2024 21:38:23 +0800 Subject: tcp_bpf: Fix copied value in tcp_bpf_sendmsg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bpf kselftest sockhash::test_txmsg_cork_hangs in test_sockmap.c triggers a kernel NULL pointer dereference: BUG: kernel NULL pointer dereference, address: 0000000000000008 ? __die_body+0x6e/0xb0 ? __die+0x8b/0xa0 ? page_fault_oops+0x358/0x3c0 ? local_clock+0x19/0x30 ? lock_release+0x11b/0x440 ? kernelmode_fixup_or_oops+0x54/0x60 ? __bad_area_nosemaphore+0x4f/0x210 ? mmap_read_unlock+0x13/0x30 ? bad_area_nosemaphore+0x16/0x20 ? do_user_addr_fault+0x6fd/0x740 ? prb_read_valid+0x1d/0x30 ? exc_page_fault+0x55/0xd0 ? asm_exc_page_fault+0x2b/0x30 ? splice_to_socket+0x52e/0x630 ? shmem_file_splice_read+0x2b1/0x310 direct_splice_actor+0x47/0x70 splice_direct_to_actor+0x133/0x300 ? do_splice_direct+0x90/0x90 do_splice_direct+0x64/0x90 ? __ia32_sys_tee+0x30/0x30 do_sendfile+0x214/0x300 __se_sys_sendfile64+0x8e/0xb0 __x64_sys_sendfile64+0x25/0x30 x64_sys_call+0xb82/0x2840 do_syscall_64+0x75/0x110 entry_SYSCALL_64_after_hwframe+0x4b/0x53 This is caused by tcp_bpf_sendmsg() returning a larger value(12289) than size (8192), which causes the while loop in splice_to_socket() to release an uninitialized pipe buf. The underlying cause is that this code assumes sk_msg_memcopy_from_iter() will copy all bytes upon success but it actually might only copy part of it. This commit changes it to use the real copied bytes. Signed-off-by: Levi Zim Signed-off-by: Daniel Borkmann Tested-by: Björn Töpel Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20241130-tcp-bpf-sendmsg-v1-2-bae583d014f3@outlook.com --- net/ipv4/tcp_bpf.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 392678ae80f4..47f65b1b70ca 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -495,7 +495,7 @@ more_data: static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct sk_msg tmp, *msg_tx = NULL; - int copied = 0, err = 0; + int copied = 0, err = 0, ret = 0; struct sk_psock *psock; long timeo; int flags; @@ -538,14 +538,14 @@ static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) copy = msg_tx->sg.size - osize; } - err = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx, + ret = sk_msg_memcopy_from_iter(sk, &msg->msg_iter, msg_tx, copy); - if (err < 0) { + if (ret < 0) { sk_msg_trim(sk, msg_tx, osize); goto out_err; } - copied += copy; + copied += ret; if (psock->cork_bytes) { if (size > psock->cork_bytes) psock->cork_bytes = 0; -- cgit v1.2.3 From 9ecc4d858b92c1bb0673ad9c327298e600c55659 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 12 Dec 2024 19:40:54 -0800 Subject: bpf: Check negative offsets in __bpf_skb_min_len() skb_network_offset() and skb_transport_offset() can be negative when they are called after we pull the transport header, for example, when we use eBPF sockmap at the point of ->sk_data_ready(). __bpf_skb_min_len() uses an unsigned int to get these offsets, this leads to a very large number which then causes bpf_skb_change_tail() failed unexpectedly. Fix this by using a signed int to get these offsets and ensure the minimum is at least zero. Fixes: 5293efe62df8 ("bpf: add bpf_skb_change_tail helper") Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20241213034057.246437-2-xiyou.wangcong@gmail.com --- net/core/filter.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 21131ec25f24..834614071727 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3734,13 +3734,22 @@ static const struct bpf_func_proto bpf_skb_adjust_room_proto = { static u32 __bpf_skb_min_len(const struct sk_buff *skb) { - u32 min_len = skb_network_offset(skb); + int offset = skb_network_offset(skb); + u32 min_len = 0; - if (skb_transport_header_was_set(skb)) - min_len = skb_transport_offset(skb); - if (skb->ip_summed == CHECKSUM_PARTIAL) - min_len = skb_checksum_start_offset(skb) + - skb->csum_offset + sizeof(__sum16); + if (offset > 0) + min_len = offset; + if (skb_transport_header_was_set(skb)) { + offset = skb_transport_offset(skb); + if (offset > 0) + min_len = offset; + } + if (skb->ip_summed == CHECKSUM_PARTIAL) { + offset = skb_checksum_start_offset(skb) + + skb->csum_offset + sizeof(__sum16); + if (offset > 0) + min_len = offset; + } return min_len; } -- cgit v1.2.3