summaryrefslogtreecommitdiff
path: root/net/ipv4/tcp_minisocks.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-10-02 15:17:01 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2025-10-02 15:17:01 -0700
commit07fdad3a93756b872da7b53647715c48d0f4a2d0 (patch)
tree133af559ac91e6b24358b57a025abc060a782129 /net/ipv4/tcp_minisocks.c
parentf79e772258df311c2cb21594ca0996318e720d28 (diff)
parentf1455695d2d99894b65db233877acac9a0e120b9 (diff)
Merge tag 'net-next-6.18' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Paolo Abeni: "Core & protocols: - Improve drop account scalability on NUMA hosts for RAW and UDP sockets and the backlog, almost doubling the Pps capacity under DoS - Optimize the UDP RX performance under stress, reducing contention, revisiting the binary layout of the involved data structs and implementing NUMA-aware locking. This improves UDP RX performance by an additional 50%, even more under extreme conditions - Add support for PSP encryption of TCP connections; this mechanism has some similarities with IPsec and TLS, but offers superior HW offloads capabilities - Ongoing work to support Accurate ECN for TCP. AccECN allows more than one congestion notification signal per RTT and is a building block for Low Latency, Low Loss, and Scalable Throughput (L4S) - Reorganize the TCP socket binary layout for data locality, reducing the number of touched cachelines in the fastpath - Refactor skb deferral free to better scale on large multi-NUMA hosts, this improves TCP and UDP RX performances significantly on such HW - Increase the default socket memory buffer limits from 256K to 4M to better fit modern link speeds - Improve handling of setups with a large number of nexthop, making dump operating scaling linearly and avoiding unneeded synchronize_rcu() on delete - Improve bridge handling of VLAN FDB, storing a single entry per bridge instead of one entry per port; this makes the dump order of magnitude faster on large switches - Restore IP ID correctly for encapsulated packets at GSO segmentation time, allowing GRO to merge packets in more scenarios - Improve netfilter matching performance on large sets - Improve MPTCP receive path performance by leveraging recently introduced core infrastructure (skb deferral free) and adopting recent TCP autotuning changes - Allow bridges to redirect to a backup port when the bridge port is administratively down - Introduce MPTCP 'laminar' endpoint that con be used only once per connection and simplify common MPTCP setups - Add RCU safety to dst->dev, closing a lot of possible races - A significant crypto library API for SCTP, MPTCP and IPv6 SR, reducing code duplication - Supports pulling data from an skb frag into the linear area of an XDP buffer Things we sprinkled into general kernel code: - Generate netlink documentation from YAML using an integrated YAML parser Driver API: - Support using IPv6 Flow Label in Rx hash computation and RSS queue selection - Introduce API for fetching the DMA device for a given queue, allowing TCP zerocopy RX on more H/W setups - Make XDP helpers compatible with unreadable memory, allowing more easily building DevMem-enabled drivers with a unified XDP/skbs datapath - Add a new dedicated ethtool callback enabling drivers to provide the number of RX rings directly, improving efficiency and clarity in RX ring queries and RSS configuration - Introduce a burst period for the health reporter, allowing better handling of multiple errors due to the same root cause - Support for DPLL phase offset exponential moving average, controlling the average smoothing factor Device drivers: - Add a new Huawei driver for 3rd gen NIC (hinic3) - Add a new SpacemiT driver for K1 ethernet MAC - Add a generic abstraction for shared memory communication devices (dibps) - Ethernet high-speed NICs: - nVidia/Mellanox: - Use multiple per-queue doorbell, to avoid MMIO contention issues - support adjacent functions, allowing them to delegate their SR-IOV VFs to sibling PFs - support RSS for IPSec offload - support exposing raw cycle counters in PTP and mlx5 - support for disabling host PFs. - Intel (100G, ice, idpf): - ice: support for SRIOV VFs over an Active-Active link aggregate - ice: support for firmware logging via debugfs - ice: support for Earliest TxTime First (ETF) hardware offload - idpf: support basic XDP functionalities and XSk - Broadcom (bnxt): - support Hyper-V VF ID - dynamic SRIOV resource allocations for RoCE - Meta (fbnic): - support queue API, zero-copy Rx and Tx - support basic XDP functionalities - devlink health support for FW crashes and OTP mem corruptions - expand hardware stats coverage to FEC, PHY, and Pause - Wangxun: - support ethtool coalesce options - support for multiple RSS contexts - Ethernet virtual: - Macsec: - replace custom netlink attribute checks with policy-level checks - Bonding: - support aggregator selection based on port priority - Microsoft vNIC: - use page pool fragments for RX buffers instead of full pages to improve memory efficiency - Ethernet NICs consumer, and embedded: - Qualcomm: support Ethernet function for IPQ9574 SoC - Airoha: implement wlan offloading via NPU - Freescale - enetc: add NETC timer PTP driver and add PTP support - fec: enable the Jumbo frame support for i.MX8QM - Renesas (R-Car S4): - support HW offloading for layer 2 switching - support for RZ/{T2H, N2H} SoCs - Cadence (macb): support TAPRIO traffic scheduling - TI: - support for Gigabit ICSS ethernet SoC (icssm-prueth) - Synopsys (stmmac): a lot of cleanups - Ethernet PHYs: - Support 10g-qxgmi phy-mode for AQR412C, Felix DSA and Lynx PCS driver - Support bcm63268 GPHY power control - Support for Micrel lan8842 PHY and PTP - Support for Aquantia AQR412 and AQR115 - CAN: - a large CAN-XL preparation work - reorganize raw_sock and uniqframe struct to minimize memory usage - rcar_canfd: update the CAN-FD handling - WiFi: - extended Neighbor Awareness Networking (NAN) support - S1G channel representation cleanup - improve S1G support - WiFi drivers: - Intel (iwlwifi): - major refactor and cleanup - Broadcom (brcm80211): - support for AP isolation - RealTek (rtw88/89) rtw88/89: - preparation work for RTL8922DE support - MediaTek (mt76): - HW restart improvements - MLO support - Qualcomm/Atheros (ath10k): - GTK rekey fixes - Bluetooth drivers: - btusb: support for several new IDs for MT7925 - btintel: support for BlazarIW core - btintel_pcie: support for _suspend() / _resume() - btintel_pcie: support for Scorpious, Panther Lake-H484 IDs" * tag 'net-next-6.18' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1536 commits) net: stmmac: Add support for Allwinner A523 GMAC200 dt-bindings: net: sun8i-emac: Add A523 GMAC200 compatible Revert "Documentation: net: add flow control guide and document ethtool API" octeontx2-pf: fix bitmap leak octeontx2-vf: fix bitmap leak net/mlx5e: Use extack in set rxfh callback net/mlx5e: Introduce mlx5e_rss_params for RSS configuration net/mlx5e: Introduce mlx5e_rss_init_params net/mlx5e: Remove unused mdev param from RSS indir init net/mlx5: Improve QoS error messages with actual depth values net/mlx5e: Prevent entering switchdev mode with inconsistent netns net/mlx5: HWS, Generalize complex matchers net/mlx5: Improve write-combining test reliability for ARM64 Grace CPUs selftests/net: add tcp_port_share to .gitignore Revert "net/mlx5e: Update and set Xon/Xoff upon MTU set" net: add NUMA awareness to skb_attempt_defer_free() net: use llist for sd->defer_list net: make softnet_data.defer_count an atomic selftests: drv-net: psp: add tests for destroying devices selftests: drv-net: psp: add test for auto-adjusting TCP MSS ...
Diffstat (limited to 'net/ipv4/tcp_minisocks.c')
-rw-r--r--net/ipv4/tcp_minisocks.c80
1 files changed, 59 insertions, 21 deletions
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 2994c9222c9c..2ec8c6f1cdcc 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -20,9 +20,11 @@
*/
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
#include <net/xfrm.h>
#include <net/busy_poll.h>
#include <net/rstreason.h>
+#include <net/psp.h>
static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
@@ -103,9 +105,16 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
u32 rcv_nxt = READ_ONCE(tcptw->tw_rcv_nxt);
struct tcp_options_received tmp_opt;
+ enum skb_drop_reason psp_drop;
bool paws_reject = false;
int ts_recent_stamp;
+ /* Instead of dropping immediately, wait to see what value is
+ * returned. We will accept a non psp-encapsulated syn in the
+ * case where TCP_TW_SYN is returned.
+ */
+ psp_drop = psp_twsk_rx_policy_check(tw, skb);
+
tmp_opt.saw_tstamp = 0;
ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
if (th->doff > (sizeof(*th) >> 2) && ts_recent_stamp) {
@@ -123,6 +132,9 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) {
/* Just repeat all the checks of tcp_rcv_state_process() */
+ if (psp_drop)
+ goto out_put;
+
/* Out of window, send ACK */
if (paws_reject ||
!tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
@@ -193,6 +205,9 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
(TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
/* In window segment, it may be only reset or bare ack. */
+ if (psp_drop)
+ goto out_put;
+
if (th->rst) {
/* This is TIME_WAIT assassination, in two flavors.
* Oh well... nobody has a sufficient solution to this
@@ -246,6 +261,9 @@ kill:
return TCP_TW_SYN;
}
+ if (psp_drop)
+ goto out_put;
+
if (paws_reject) {
*drop_reason = SKB_DROP_REASON_TCP_RFC7323_TW_PAWS;
__NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWS_TW_REJECTED);
@@ -264,6 +282,8 @@ kill:
return tcp_timewait_check_oow_rate_limit(
tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT);
}
+
+out_put:
inet_twsk_put(tw);
return TCP_TW_SUCCESS;
}
@@ -377,31 +397,22 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
}
EXPORT_SYMBOL(tcp_time_wait);
-#ifdef CONFIG_TCP_MD5SIG
-static void tcp_md5_twsk_free_rcu(struct rcu_head *head)
-{
- struct tcp_md5sig_key *key;
-
- key = container_of(head, struct tcp_md5sig_key, rcu);
- kfree(key);
- static_branch_slow_dec_deferred(&tcp_md5_needed);
- tcp_md5_release_sigpool();
-}
-#endif
-
void tcp_twsk_destructor(struct sock *sk)
{
#ifdef CONFIG_TCP_MD5SIG
if (static_branch_unlikely(&tcp_md5_needed.key)) {
struct tcp_timewait_sock *twsk = tcp_twsk(sk);
- if (twsk->tw_md5_key)
- call_rcu(&twsk->tw_md5_key->rcu, tcp_md5_twsk_free_rcu);
+ if (twsk->tw_md5_key) {
+ kfree(twsk->tw_md5_key);
+ static_branch_slow_dec_deferred(&tcp_md5_needed);
+ tcp_md5_release_sigpool();
+ }
}
#endif
tcp_ao_destroy_sock(sk, true);
+ psp_twsk_assoc_free(inet_twsk(sk));
}
-EXPORT_IPV6_MOD_GPL(tcp_twsk_destructor);
void tcp_twsk_purge(struct list_head *net_exit_list)
{
@@ -461,12 +472,26 @@ void tcp_openreq_init_rwin(struct request_sock *req,
ireq->rcv_wscale = rcv_wscale;
}
-static void tcp_ecn_openreq_child(struct tcp_sock *tp,
- const struct request_sock *req)
+static void tcp_ecn_openreq_child(struct sock *sk,
+ const struct request_sock *req,
+ const struct sk_buff *skb)
{
- tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ?
- TCP_ECN_MODE_RFC3168 :
- TCP_ECN_DISABLED);
+ const struct tcp_request_sock *treq = tcp_rsk(req);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (treq->accecn_ok) {
+ tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
+ tp->syn_ect_snt = treq->syn_ect_snt;
+ tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt);
+ tp->saw_accecn_opt = treq->saw_accecn_opt;
+ tp->prev_ecnfield = treq->syn_ect_rcv;
+ tp->accecn_opt_demand = 1;
+ tcp_ecn_received_counters_payload(sk, skb);
+ } else {
+ tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ?
+ TCP_ECN_MODE_RFC3168 :
+ TCP_ECN_DISABLED);
+ }
}
void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
@@ -631,7 +656,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
newtp->rx_opt.mss_clamp = req->mss;
- tcp_ecn_openreq_child(newtp, req);
+ tcp_ecn_openreq_child(newsk, req, skb);
newtp->fastopen_req = NULL;
RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);
@@ -674,6 +699,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
bool own_req;
tmp_opt.saw_tstamp = 0;
+ tmp_opt.accecn = 0;
if (th->doff > (sizeof(struct tcphdr)>>2)) {
tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL);
@@ -851,6 +877,18 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
if (!(flg & TCP_FLAG_ACK))
return NULL;
+ if (tcp_rsk(req)->accecn_ok && tmp_opt.accecn &&
+ tcp_rsk(req)->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
+ u8 saw_opt = tcp_accecn_option_init(skb, tmp_opt.accecn);
+
+ tcp_rsk(req)->saw_accecn_opt = saw_opt;
+ if (tcp_rsk(req)->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) {
+ u8 fail_mode = TCP_ACCECN_OPT_FAIL_RECV;
+
+ tcp_rsk(req)->accecn_fail_mode |= fail_mode;
+ }
+ }
+
/* For Fast Open no more processing is needed (sk is the
* child socket).
*/