summary | refs | log | tree | commit | diff
path: root/net/ipv6/tcp_ipv6.c
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@linux-foundation.org> 2025-10-02 15:17:01 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2025-10-02 15:17:01 -0700
commit: 07fdad3a93756b872da7b53647715c48d0f4a2d0 (patch)
tree: 133af559ac91e6b24358b57a025abc060a782129 /net/ipv6/tcp_ipv6.c
parent: f79e772258df311c2cb21594ca0996318e720d28 (diff)
parent: f1455695d2d99894b65db233877acac9a0e120b9 (diff)
Merge tag 'net-next-6.18' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Paolo Abeni: "Core & protocols: - Improve drop account scalability on NUMA hosts for RAW and UDP sockets and the backlog, almost doubling the Pps capacity under DoS - Optimize the UDP RX performance under stress, reducing contention, revisiting the binary layout of the involved data structs and implementing NUMA-aware locking. This improves UDP RX performance by an additional 50%, even more under extreme conditions - Add support for PSP encryption of TCP connections; this mechanism has some similarities with IPsec and TLS, but offers superior HW offload capabilities - Ongoing work to support Accurate ECN for TCP. AccECN allows more than one congestion notification signal per RTT and is a building block for Low Latency, Low Loss, and Scalable Throughput (L4S) - Reorganize the TCP socket binary layout for data locality, reducing the number of touched cachelines in the fastpath - Refactor skb deferral free to better scale on large multi-NUMA hosts, this improves TCP and UDP RX performances significantly on such HW - Increase the default socket memory buffer limits from 256K to 4M to better fit modern link speeds - Improve handling of setups with a large number of nexthops, making dump operations scale linearly and avoiding unneeded synchronize_rcu() on delete - Improve bridge handling of VLAN FDB, storing a single entry per bridge instead of one entry per port; this makes the dump an order of magnitude faster on large switches - Restore IP ID correctly for encapsulated packets at GSO segmentation time, allowing GRO to merge packets in more scenarios - Improve netfilter matching performance on large sets - Improve MPTCP receive path performance by leveraging recently introduced core infrastructure (skb deferral free) and adopting recent TCP autotuning changes - Allow bridges to redirect to a backup port when the bridge port is administratively down - Introduce MPTCP 'laminar' endpoint that can be used only once per connection and simplify 
common MPTCP setups - Add RCU safety to dst->dev, closing a lot of possible races - A significant crypto library API for SCTP, MPTCP and IPv6 SR, reducing code duplication - Supports pulling data from an skb frag into the linear area of an XDP buffer Things we sprinkled into general kernel code: - Generate netlink documentation from YAML using an integrated YAML parser Driver API: - Support using IPv6 Flow Label in Rx hash computation and RSS queue selection - Introduce API for fetching the DMA device for a given queue, allowing TCP zerocopy RX on more H/W setups - Make XDP helpers compatible with unreadable memory, allowing more easily building DevMem-enabled drivers with a unified XDP/skbs datapath - Add a new dedicated ethtool callback enabling drivers to provide the number of RX rings directly, improving efficiency and clarity in RX ring queries and RSS configuration - Introduce a burst period for the health reporter, allowing better handling of multiple errors due to the same root cause - Support for DPLL phase offset exponential moving average, controlling the average smoothing factor Device drivers: - Add a new Huawei driver for 3rd gen NIC (hinic3) - Add a new SpacemiT driver for K1 ethernet MAC - Add a generic abstraction for shared memory communication devices (dibps) - Ethernet high-speed NICs: - nVidia/Mellanox: - Use multiple per-queue doorbell, to avoid MMIO contention issues - support adjacent functions, allowing them to delegate their SR-IOV VFs to sibling PFs - support RSS for IPSec offload - support exposing raw cycle counters in PTP and mlx5 - support for disabling host PFs. 
- Intel (100G, ice, idpf): - ice: support for SRIOV VFs over an Active-Active link aggregate - ice: support for firmware logging via debugfs - ice: support for Earliest TxTime First (ETF) hardware offload - idpf: support basic XDP functionalities and XSk - Broadcom (bnxt): - support Hyper-V VF ID - dynamic SRIOV resource allocations for RoCE - Meta (fbnic): - support queue API, zero-copy Rx and Tx - support basic XDP functionalities - devlink health support for FW crashes and OTP mem corruptions - expand hardware stats coverage to FEC, PHY, and Pause - Wangxun: - support ethtool coalesce options - support for multiple RSS contexts - Ethernet virtual: - Macsec: - replace custom netlink attribute checks with policy-level checks - Bonding: - support aggregator selection based on port priority - Microsoft vNIC: - use page pool fragments for RX buffers instead of full pages to improve memory efficiency - Ethernet NICs consumer, and embedded: - Qualcomm: support Ethernet function for IPQ9574 SoC - Airoha: implement wlan offloading via NPU - Freescale - enetc: add NETC timer PTP driver and add PTP support - fec: enable the Jumbo frame support for i.MX8QM - Renesas (R-Car S4): - support HW offloading for layer 2 switching - support for RZ/{T2H, N2H} SoCs - Cadence (macb): support TAPRIO traffic scheduling - TI: - support for Gigabit ICSS ethernet SoC (icssm-prueth) - Synopsys (stmmac): a lot of cleanups - Ethernet PHYs: - Support 10g-qxgmi phy-mode for AQR412C, Felix DSA and Lynx PCS driver - Support bcm63268 GPHY power control - Support for Micrel lan8842 PHY and PTP - Support for Aquantia AQR412 and AQR115 - CAN: - a large CAN-XL preparation work - reorganize raw_sock and uniqframe struct to minimize memory usage - rcar_canfd: update the CAN-FD handling - WiFi: - extended Neighbor Awareness Networking (NAN) support - S1G channel representation cleanup - improve S1G support - WiFi drivers: - Intel (iwlwifi): - major refactor and cleanup - Broadcom (brcm80211): - support 
for AP isolation - RealTek (rtw88/89) rtw88/89: - preparation work for RTL8922DE support - MediaTek (mt76): - HW restart improvements - MLO support - Qualcomm/Atheros (ath10k): - GTK rekey fixes - Bluetooth drivers: - btusb: support for several new IDs for MT7925 - btintel: support for BlazarIW core - btintel_pcie: support for _suspend() / _resume() - btintel_pcie: support for Scorpious, Panther Lake-H484 IDs" * tag 'net-next-6.18' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1536 commits) net: stmmac: Add support for Allwinner A523 GMAC200 dt-bindings: net: sun8i-emac: Add A523 GMAC200 compatible Revert "Documentation: net: add flow control guide and document ethtool API" octeontx2-pf: fix bitmap leak octeontx2-vf: fix bitmap leak net/mlx5e: Use extack in set rxfh callback net/mlx5e: Introduce mlx5e_rss_params for RSS configuration net/mlx5e: Introduce mlx5e_rss_init_params net/mlx5e: Remove unused mdev param from RSS indir init net/mlx5: Improve QoS error messages with actual depth values net/mlx5e: Prevent entering switchdev mode with inconsistent netns net/mlx5: HWS, Generalize complex matchers net/mlx5: Improve write-combining test reliability for ARM64 Grace CPUs selftests/net: add tcp_port_share to .gitignore Revert "net/mlx5e: Update and set Xon/Xoff upon MTU set" net: add NUMA awareness to skb_attempt_defer_free() net: use llist for sd->defer_list net: make softnet_data.defer_count an atomic selftests: drv-net: psp: add tests for destroying devices selftests: drv-net: psp: add test for auto-adjusting TCP MSS ...
Diffstat (limited to 'net/ipv6/tcp_ipv6.c')
-rw-r--r-- net/ipv6/tcp_ipv6.c 56
1 files changed, 35 insertions, 21 deletions
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index e885629312a4..9622c2776ade 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -62,6 +62,7 @@
#include <net/hotdata.h>
#include <net/busy_poll.h>
#include <net/rstreason.h>
+#include <net/psp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
@@ -299,12 +300,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
inet->inet_rcv_saddr = LOOPBACK4_IPV6;
sk->sk_gso_type = SKB_GSO_TCPV6;
- ip6_dst_store(sk, dst, NULL, NULL);
+ ip6_dst_store(sk, dst, false, false);
- icsk->icsk_ext_hdr_len = 0;
+ icsk->icsk_ext_hdr_len = psp_sk_overhead(sk);
if (opt)
- icsk->icsk_ext_hdr_len = opt->opt_flen +
- opt->opt_nflen;
+ icsk->icsk_ext_hdr_len += opt->opt_flen +
+ opt->opt_nflen;
tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
@@ -388,8 +389,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
bool fatal;
int err;
- sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
- &hdr->daddr, th->dest,
+ sk = __inet6_lookup_established(net, &hdr->daddr, th->dest,
&hdr->saddr, ntohs(th->source),
skb->dev->ifindex, inet6_sdif(skb));
@@ -545,6 +545,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
if (skb) {
+ tcp_rsk(req)->syn_ect_snt = np->tclass & INET_ECN_MASK;
__tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr,
&ireq->ir_v6_rmt_addr);
@@ -973,6 +974,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
if (sk) {
/* unconstify the socket only to attach it to buff with care. */
skb_set_owner_edemux(buff, (struct sock *)sk);
+ psp_reply_set_decrypted(buff);
if (sk->sk_state == TCP_TIME_WAIT)
mark = inet_twsk(sk)->tw_mark;
@@ -1073,8 +1075,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb,
* Incoming packet is checked with md5 hash with finding key,
* no RST generated if md5 hash doesn't match.
*/
- sk1 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
- NULL, 0, &ipv6h->saddr, th->source,
+ sk1 = inet6_lookup_listener(net, NULL, 0, &ipv6h->saddr, th->source,
&ipv6h->daddr, ntohs(th->source),
dif, sdif);
if (!sk1)
@@ -1460,7 +1461,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
memcpy(newnp, np, sizeof(struct ipv6_pinfo));
- ip6_dst_store(newsk, dst, NULL, NULL);
+ ip6_dst_store(newsk, dst, false, false);
newnp->saddr = ireq->ir_v6_loc_addr;
@@ -1606,6 +1607,10 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
if (skb->protocol == htons(ETH_P_IP))
return tcp_v4_do_rcv(sk, skb);
+ reason = psp_sk_rx_policy_check(sk, skb);
+ if (reason)
+ goto err_discard;
+
/*
* socket locking is here for SMP purposes as backlog rcv
* is currently called with bh processing disabled.
@@ -1685,6 +1690,7 @@ csum_err:
reason = SKB_DROP_REASON_TCP_CSUM;
trace_tcp_bad_csum(skb);
TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+err_discard:
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
goto discard;
@@ -1787,7 +1793,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
hdr = ipv6_hdr(skb);
lookup:
- sk = __inet6_lookup_skb(net->ipv4.tcp_death_row.hashinfo, skb, __tcp_hdrlen(th),
+ sk = __inet6_lookup_skb(skb, __tcp_hdrlen(th),
th->source, th->dest, inet6_iif(skb), sdif,
&refcounted);
if (!sk)
@@ -1809,7 +1815,7 @@ lookup:
&hdr->saddr, &hdr->daddr,
AF_INET6, dif, sdif);
if (drop_reason) {
- sk_drops_add(sk, skb);
+ sk_drops_skbadd(sk, skb);
reqsk_put(req);
goto discard_it;
}
@@ -1948,7 +1954,7 @@ discard_it:
return 0;
discard_and_relse:
- sk_drops_add(sk, skb);
+ sk_drops_skbadd(sk, skb);
if (refcounted)
sock_put(sk);
goto discard_it;
@@ -1974,8 +1980,7 @@ do_time_wait:
{
struct sock *sk2;
- sk2 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
- skb, __tcp_hdrlen(th),
+ sk2 = inet6_lookup_listener(net, skb, __tcp_hdrlen(th),
&ipv6_hdr(skb)->saddr, th->source,
&ipv6_hdr(skb)->daddr,
ntohs(th->dest),
@@ -1990,6 +1995,10 @@ do_time_wait:
__this_cpu_write(tcp_tw_isn, isn);
goto process;
}
+
+ drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb);
+ if (drop_reason)
+ break;
}
/* to ACK */
fallthrough;
@@ -2027,8 +2036,7 @@ void tcp_v6_early_demux(struct sk_buff *skb)
return;
/* Note : We use inet6_iif() here, not tcp_v6_iif() */
- sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
- &hdr->saddr, th->source,
+ sk = __inet6_lookup_established(net, &hdr->saddr, th->source,
&hdr->daddr, ntohs(th->dest),
inet6_iif(skb), inet6_sdif(skb));
if (sk) {
@@ -2048,7 +2056,6 @@ void tcp_v6_early_demux(struct sk_buff *skb)
static struct timewait_sock_ops tcp6_timewait_sock_ops = {
.twsk_obj_size = sizeof(struct tcp6_timewait_sock),
- .twsk_destructor = tcp_twsk_destructor,
};
INDIRECT_CALLABLE_SCOPE void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb)
@@ -2115,6 +2122,13 @@ static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = {
.ao_calc_key_sk = tcp_v4_ao_calc_key_sk,
#endif
};
+
+static void tcp6_destruct_sock(struct sock *sk)
+{
+ tcp_md5_destruct_sock(sk);
+ tcp_ao_destroy_sock(sk, false);
+ inet6_sock_destruct(sk);
+}
#endif
/* NOTE: A lot of things set to zero explicitly by call to
@@ -2130,6 +2144,7 @@ static int tcp_v6_init_sock(struct sock *sk)
#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific;
+ sk->sk_destruct = tcp6_destruct_sock;
#endif
return 0;
@@ -2228,9 +2243,9 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
rx_queue,
timer_active,
jiffies_delta_to_clock_t(timer_expires - jiffies),
- icsk->icsk_retransmits,
+ READ_ONCE(icsk->icsk_retransmits),
from_kuid_munged(seq_user_ns(seq), sk_uid(sp)),
- icsk->icsk_probes_out,
+ READ_ONCE(icsk->icsk_probes_out),
sock_i_ino(sp),
refcount_read(&sp->sk_refcnt), sp,
jiffies_to_clock_t(icsk->icsk_rto),
@@ -2340,7 +2355,7 @@ struct proto tcpv6_prot = {
.splice_eof = tcp_splice_eof,
.backlog_rcv = tcp_v6_do_rcv,
.release_cb = tcp_release_cb,
- .hash = inet6_hash,
+ .hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
.put_port = inet_put_port,
@@ -2356,7 +2371,6 @@ struct proto tcpv6_prot = {
.per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
.memory_pressure = &tcp_memory_pressure,
- .orphan_count = &tcp_orphan_count,
.sysctl_mem = sysctl_tcp_mem,
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),