diff options
| author | Jakub Kicinski <kuba@kernel.org> | 2025-09-11 17:37:09 -0700 | 
|---|---|---|
| committer | Jakub Kicinski <kuba@kernel.org> | 2025-09-11 17:40:13 -0700 | 
| commit | fc3a2810412c163b5df1b377d332e048860f45db (patch) | |
| tree | 9eeb81c7f965176a32ca3062aefcc3532c637b01 /net | |
| parent | 5f790208d68fe1526c751dc2af366c7b552b8631 (diff) | |
| parent | db87bd2ad1f736c2f7ab231f9b40c885934f6b2c (diff) | |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Cross-merge networking fixes after downstream PR (net-6.17-rc6).
Conflicts:
net/netfilter/nft_set_pipapo.c
net/netfilter/nft_set_pipapo_avx2.c
  c4eaca2e1052 ("netfilter: nft_set_pipapo: don't check genbit from packetpath lookups")
  84c1da7b38d9 ("netfilter: nft_set_pipapo: use avx2 algorithm for insertions too")
Only trivial adjacent changes (in a doc and a Makefile).
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'net')
| -rw-r--r-- | net/bridge/br.c | 7 | ||||
| -rw-r--r-- | net/can/j1939/bus.c | 5 | ||||
| -rw-r--r-- | net/can/j1939/j1939-priv.h | 1 | ||||
| -rw-r--r-- | net/can/j1939/main.c | 3 | ||||
| -rw-r--r-- | net/can/j1939/socket.c | 52 | ||||
| -rw-r--r-- | net/core/dev_ioctl.c | 22 | ||||
| -rw-r--r-- | net/hsr/hsr_device.c | 28 | ||||
| -rw-r--r-- | net/hsr/hsr_main.c | 4 | ||||
| -rw-r--r-- | net/hsr/hsr_main.h | 3 | ||||
| -rw-r--r-- | net/ipv4/ip_tunnel_core.c | 6 | ||||
| -rw-r--r-- | net/ipv4/tcp_bpf.c | 5 | ||||
| -rw-r--r-- | net/mptcp/sockopt.c | 11 | ||||
| -rw-r--r-- | net/netfilter/nf_tables_api.c | 66 | ||||
| -rw-r--r-- | net/netfilter/nft_lookup.c | 46 | ||||
| -rw-r--r-- | net/netfilter/nft_set_bitmap.c | 3 | ||||
| -rw-r--r-- | net/netfilter/nft_set_pipapo.c | 20 | ||||
| -rw-r--r-- | net/netfilter/nft_set_pipapo_avx2.c | 3 | ||||
| -rw-r--r-- | net/netfilter/nft_set_rbtree.c | 6 | ||||
| -rw-r--r-- | net/netlink/genetlink.c | 3 | ||||
| -rw-r--r-- | net/sunrpc/sched.c | 2 | ||||
| -rw-r--r-- | net/sunrpc/xprtsock.c | 6 | ||||
| -rw-r--r-- | net/wireless/nl80211.c | 13 | ||||
| -rw-r--r-- | net/xdp/xsk.c | 113 | ||||
| -rw-r--r-- | net/xdp/xsk_queue.h | 12 | 
24 files changed, 346 insertions, 94 deletions
| diff --git a/net/bridge/br.c b/net/bridge/br.c index 1885d0c315f0..c683baa3847f 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -324,6 +324,13 @@ int br_boolopt_multi_toggle(struct net_bridge *br,  	int err = 0;  	int opt_id; +	opt_id = find_next_bit(&bitmap, BITS_PER_LONG, BR_BOOLOPT_MAX); +	if (opt_id != BITS_PER_LONG) { +		NL_SET_ERR_MSG_FMT_MOD(extack, "Unknown boolean option %d", +				       opt_id); +		return -EINVAL; +	} +  	for_each_set_bit(opt_id, &bitmap, BR_BOOLOPT_MAX) {  		bool on = !!(bm->optval & BIT(opt_id)); diff --git a/net/can/j1939/bus.c b/net/can/j1939/bus.c index 39844f14eed8..797719cb227e 100644 --- a/net/can/j1939/bus.c +++ b/net/can/j1939/bus.c @@ -290,8 +290,11 @@ int j1939_local_ecu_get(struct j1939_priv *priv, name_t name, u8 sa)  	if (!ecu)  		ecu = j1939_ecu_create_locked(priv, name);  	err = PTR_ERR_OR_ZERO(ecu); -	if (err) +	if (err) { +		if (j1939_address_is_unicast(sa)) +			priv->ents[sa].nusers--;  		goto done; +	}  	ecu->nusers++;  	/* TODO: do we care if ecu->addr != sa? 
*/ diff --git a/net/can/j1939/j1939-priv.h b/net/can/j1939/j1939-priv.h index 31a93cae5111..81f58924b4ac 100644 --- a/net/can/j1939/j1939-priv.h +++ b/net/can/j1939/j1939-priv.h @@ -212,6 +212,7 @@ void j1939_priv_get(struct j1939_priv *priv);  /* notify/alert all j1939 sockets bound to ifindex */  void j1939_sk_netdev_event_netdown(struct j1939_priv *priv); +void j1939_sk_netdev_event_unregister(struct j1939_priv *priv);  int j1939_cancel_active_session(struct j1939_priv *priv, struct sock *sk);  void j1939_tp_init(struct j1939_priv *priv); diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index 7e8a20f2fc42..3706a872ecaf 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -377,6 +377,9 @@ static int j1939_netdev_notify(struct notifier_block *nb,  		j1939_sk_netdev_event_netdown(priv);  		j1939_ecu_unmap_all(priv);  		break; +	case NETDEV_UNREGISTER: +		j1939_sk_netdev_event_unregister(priv); +		break;  	}  	j1939_priv_put(priv); diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 3d8b588822f9..88e7160d4248 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -521,6 +521,9 @@ static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len)  	ret = j1939_local_ecu_get(priv, jsk->addr.src_name, jsk->addr.sa);  	if (ret) {  		j1939_netdev_stop(priv); +		jsk->priv = NULL; +		synchronize_rcu(); +		j1939_priv_put(priv);  		goto out_release_sock;  	} @@ -1300,6 +1303,55 @@ void j1939_sk_netdev_event_netdown(struct j1939_priv *priv)  	read_unlock_bh(&priv->j1939_socks_lock);  } +void j1939_sk_netdev_event_unregister(struct j1939_priv *priv) +{ +	struct sock *sk; +	struct j1939_sock *jsk; +	bool wait_rcu = false; + +rescan: /* The caller is holding a ref on this "priv" via j1939_priv_get_by_ndev(). */ +	read_lock_bh(&priv->j1939_socks_lock); +	list_for_each_entry(jsk, &priv->j1939_socks, list) { +		/* Skip if j1939_jsk_add() is not called on this socket. 
*/ +		if (!(jsk->state & J1939_SOCK_BOUND)) +			continue; +		sk = &jsk->sk; +		sock_hold(sk); +		read_unlock_bh(&priv->j1939_socks_lock); +		/* Check if j1939_jsk_del() is not yet called on this socket after holding +		 * socket's lock, for both j1939_sk_bind() and j1939_sk_release() call +		 * j1939_jsk_del() with socket's lock held. +		 */ +		lock_sock(sk); +		if (jsk->state & J1939_SOCK_BOUND) { +			/* Neither j1939_sk_bind() nor j1939_sk_release() called j1939_jsk_del(). +			 * Make this socket no longer bound, by pretending as if j1939_sk_bind() +			 * dropped old references but did not get new references. +			 */ +			j1939_jsk_del(priv, jsk); +			j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa); +			j1939_netdev_stop(priv); +			/* Call j1939_priv_put() now and prevent j1939_sk_sock_destruct() from +			 * calling the corresponding j1939_priv_put(). +			 * +			 * j1939_sk_sock_destruct() is supposed to call j1939_priv_put() after +			 * an RCU grace period. But since the caller is holding a ref on this +			 * "priv", we can defer synchronize_rcu() until immediately before +			 * the caller calls j1939_priv_put(). 
+			 */ +			j1939_priv_put(priv); +			jsk->priv = NULL; +			wait_rcu = true; +		} +		release_sock(sk); +		sock_put(sk); +		goto rescan; +	} +	read_unlock_bh(&priv->j1939_socks_lock); +	if (wait_rcu) +		synchronize_rcu(); +} +  static int j1939_sk_no_ioctlcmd(struct socket *sock, unsigned int cmd,  				unsigned long arg)  { diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 9c0ad7f4b5d8..ad54b12d4b4c 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -464,8 +464,15 @@ int generic_hwtstamp_get_lower(struct net_device *dev,  	if (!netif_device_present(dev))  		return -ENODEV; -	if (ops->ndo_hwtstamp_get) -		return dev_get_hwtstamp_phylib(dev, kernel_cfg); +	if (ops->ndo_hwtstamp_get) { +		int err; + +		netdev_lock_ops(dev); +		err = dev_get_hwtstamp_phylib(dev, kernel_cfg); +		netdev_unlock_ops(dev); + +		return err; +	}  	/* Legacy path: unconverted lower driver */  	return generic_hwtstamp_ioctl_lower(dev, SIOCGHWTSTAMP, kernel_cfg); @@ -481,8 +488,15 @@ int generic_hwtstamp_set_lower(struct net_device *dev,  	if (!netif_device_present(dev))  		return -ENODEV; -	if (ops->ndo_hwtstamp_set) -		return dev_set_hwtstamp_phylib(dev, kernel_cfg, extack); +	if (ops->ndo_hwtstamp_set) { +		int err; + +		netdev_lock_ops(dev); +		err = dev_set_hwtstamp_phylib(dev, kernel_cfg, extack); +		netdev_unlock_ops(dev); + +		return err; +	}  	/* Legacy path: unconverted lower driver */  	return generic_hwtstamp_ioctl_lower(dev, SIOCSHWTSTAMP, kernel_cfg); diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 88657255fec1..fbbc3ccf9df6 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -49,7 +49,7 @@ static bool hsr_check_carrier(struct hsr_port *master)  	ASSERT_RTNL(); -	hsr_for_each_port(master->hsr, port) { +	hsr_for_each_port_rtnl(master->hsr, port) {  		if (port->type != HSR_PT_MASTER && is_slave_up(port->dev)) {  			netif_carrier_on(master->dev);  			return true; @@ -105,7 +105,7 @@ int hsr_get_max_mtu(struct hsr_priv *hsr)  	struct 
hsr_port *port;  	mtu_max = ETH_DATA_LEN; -	hsr_for_each_port(hsr, port) +	hsr_for_each_port_rtnl(hsr, port)  		if (port->type != HSR_PT_MASTER)  			mtu_max = min(port->dev->mtu, mtu_max); @@ -139,7 +139,7 @@ static int hsr_dev_open(struct net_device *dev)  	hsr = netdev_priv(dev); -	hsr_for_each_port(hsr, port) { +	hsr_for_each_port_rtnl(hsr, port) {  		if (port->type == HSR_PT_MASTER)  			continue;  		switch (port->type) { @@ -172,7 +172,7 @@ static int hsr_dev_close(struct net_device *dev)  	struct hsr_priv *hsr;  	hsr = netdev_priv(dev); -	hsr_for_each_port(hsr, port) { +	hsr_for_each_port_rtnl(hsr, port) {  		if (port->type == HSR_PT_MASTER)  			continue;  		switch (port->type) { @@ -205,7 +205,7 @@ static netdev_features_t hsr_features_recompute(struct hsr_priv *hsr,  	 * may become enabled.  	 */  	features &= ~NETIF_F_ONE_FOR_ALL; -	hsr_for_each_port(hsr, port) +	hsr_for_each_port_rtnl(hsr, port)  		features = netdev_increment_features(features,  						     port->dev->features,  						     mask); @@ -226,6 +226,7 @@ static netdev_tx_t hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev)  	struct hsr_priv *hsr = netdev_priv(dev);  	struct hsr_port *master; +	rcu_read_lock();  	master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);  	if (master) {  		skb->dev = master->dev; @@ -238,6 +239,8 @@ static netdev_tx_t hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev)  		dev_core_stats_tx_dropped_inc(dev);  		dev_kfree_skb_any(skb);  	} +	rcu_read_unlock(); +  	return NETDEV_TX_OK;  } @@ -484,7 +487,7 @@ static void hsr_set_rx_mode(struct net_device *dev)  	hsr = netdev_priv(dev); -	hsr_for_each_port(hsr, port) { +	hsr_for_each_port_rtnl(hsr, port) {  		if (port->type == HSR_PT_MASTER)  			continue;  		switch (port->type) { @@ -506,7 +509,7 @@ static void hsr_change_rx_flags(struct net_device *dev, int change)  	hsr = netdev_priv(dev); -	hsr_for_each_port(hsr, port) { +	hsr_for_each_port_rtnl(hsr, port) {  		if (port->type == HSR_PT_MASTER)  			continue;  		
switch (port->type) { @@ -534,7 +537,7 @@ static int hsr_ndo_vlan_rx_add_vid(struct net_device *dev,  	hsr = netdev_priv(dev); -	hsr_for_each_port(hsr, port) { +	hsr_for_each_port_rtnl(hsr, port) {  		if (port->type == HSR_PT_MASTER ||  		    port->type == HSR_PT_INTERLINK)  			continue; @@ -580,7 +583,7 @@ static int hsr_ndo_vlan_rx_kill_vid(struct net_device *dev,  	hsr = netdev_priv(dev); -	hsr_for_each_port(hsr, port) { +	hsr_for_each_port_rtnl(hsr, port) {  		switch (port->type) {  		case HSR_PT_SLAVE_A:  		case HSR_PT_SLAVE_B: @@ -672,9 +675,14 @@ struct net_device *hsr_get_port_ndev(struct net_device *ndev,  	struct hsr_priv *hsr = netdev_priv(ndev);  	struct hsr_port *port; +	rcu_read_lock();  	hsr_for_each_port(hsr, port) -		if (port->type == pt) +		if (port->type == pt) { +			dev_hold(port->dev); +			rcu_read_unlock();  			return port->dev; +		} +	rcu_read_unlock();  	return NULL;  }  EXPORT_SYMBOL(hsr_get_port_ndev); diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index 192893c3f2ec..bc94b07101d8 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -22,7 +22,7 @@ static bool hsr_slave_empty(struct hsr_priv *hsr)  {  	struct hsr_port *port; -	hsr_for_each_port(hsr, port) +	hsr_for_each_port_rtnl(hsr, port)  		if (port->type != HSR_PT_MASTER)  			return false;  	return true; @@ -134,7 +134,7 @@ struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt)  {  	struct hsr_port *port; -	hsr_for_each_port(hsr, port) +	hsr_for_each_port_rtnl(hsr, port)  		if (port->type == pt)  			return port;  	return NULL; diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 135ec5fce019..33b0d2460c9b 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -224,6 +224,9 @@ struct hsr_priv {  #define hsr_for_each_port(hsr, port) \  	list_for_each_entry_rcu((port), &(hsr)->ports, port_list) +#define hsr_for_each_port_rtnl(hsr, port) \ +	list_for_each_entry_rcu((port), &(hsr)->ports, port_list, lockdep_rtnl_is_held()) +  struct hsr_port 
*hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt);  /* Caller must ensure skb is a valid HSR frame */ diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index cc9915543637..2e61ac137128 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -206,6 +206,9 @@ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu)  	if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr)))  		return -EINVAL; +	if (skb_is_gso(skb)) +		skb_gso_reset(skb); +  	skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);  	pskb_pull(skb, ETH_HLEN);  	skb_reset_network_header(skb); @@ -300,6 +303,9 @@ static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu)  	if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr)))  		return -EINVAL; +	if (skb_is_gso(skb)) +		skb_gso_reset(skb); +  	skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);  	pskb_pull(skb, ETH_HLEN);  	skb_reset_network_header(skb); diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index ba581785adb4..a268e1595b22 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -408,8 +408,11 @@ more_data:  		if (!psock->cork) {  			psock->cork = kzalloc(sizeof(*psock->cork),  					      GFP_ATOMIC | __GFP_NOWARN); -			if (!psock->cork) +			if (!psock->cork) { +				sk_msg_free(sk, msg); +				*copied = 0;  				return -ENOMEM; +			}  		}  		memcpy(psock->cork, msg, sizeof(*msg));  		return 0; diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 2c267aff95be..2abe6f1e9940 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -1532,13 +1532,12 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk)  {  	static const unsigned int tx_rx_locks = SOCK_RCVBUF_LOCK | SOCK_SNDBUF_LOCK;  	struct sock *sk = (struct sock *)msk; +	bool keep_open; -	if (ssk->sk_prot->keepalive) { -		if (sock_flag(sk, SOCK_KEEPOPEN)) -			ssk->sk_prot->keepalive(ssk, 1); -		else -			ssk->sk_prot->keepalive(ssk, 0); -	} +	keep_open = sock_flag(sk, 
SOCK_KEEPOPEN); +	if (ssk->sk_prot->keepalive) +		ssk->sk_prot->keepalive(ssk, keep_open); +	sock_valbool_flag(ssk, SOCK_KEEPOPEN, keep_open);  	ssk->sk_priority = sk->sk_priority;  	ssk->sk_bound_dev_if = sk->sk_bound_dev_if; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 68e273d8821a..eed434e0a970 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1123,11 +1123,14 @@ nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla,  	return ERR_PTR(-ENOENT);  } -static __be16 nft_base_seq(const struct net *net) +static unsigned int nft_base_seq(const struct net *net)  { -	struct nftables_pernet *nft_net = nft_pernet(net); +	return READ_ONCE(net->nft.base_seq); +} -	return htons(nft_net->base_seq & 0xffff); +static __be16 nft_base_seq_be16(const struct net *net) +{ +	return htons(nft_base_seq(net) & 0xffff);  }  static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { @@ -1147,7 +1150,7 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net,  	nlh = nfnl_msg_put(skb, portid, seq,  			   nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), -			   flags, family, NFNETLINK_V0, nft_base_seq(net)); +			   flags, family, NFNETLINK_V0, nft_base_seq_be16(net));  	if (!nlh)  		goto nla_put_failure; @@ -1240,7 +1243,7 @@ static int nf_tables_dump_tables(struct sk_buff *skb,  	rcu_read_lock();  	nft_net = nft_pernet(net); -	cb->seq = READ_ONCE(nft_net->base_seq); +	cb->seq = nft_base_seq(net);  	list_for_each_entry_rcu(table, &nft_net->tables, list) {  		if (family != NFPROTO_UNSPEC && family != table->family) @@ -2022,7 +2025,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,  	nlh = nfnl_msg_put(skb, portid, seq,  			   nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), -			   flags, family, NFNETLINK_V0, nft_base_seq(net)); +			   flags, family, NFNETLINK_V0, nft_base_seq_be16(net));  	if (!nlh)  		goto nla_put_failure; @@ -2125,7 +2128,7 @@ static int 
nf_tables_dump_chains(struct sk_buff *skb,  	rcu_read_lock();  	nft_net = nft_pernet(net); -	cb->seq = READ_ONCE(nft_net->base_seq); +	cb->seq = nft_base_seq(net);  	list_for_each_entry_rcu(table, &nft_net->tables, list) {  		if (family != NFPROTO_UNSPEC && family != table->family) @@ -3663,7 +3666,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net,  	u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);  	nlh = nfnl_msg_put(skb, portid, seq, type, flags, family, NFNETLINK_V0, -			   nft_base_seq(net)); +			   nft_base_seq_be16(net));  	if (!nlh)  		goto nla_put_failure; @@ -3831,7 +3834,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb,  	rcu_read_lock();  	nft_net = nft_pernet(net); -	cb->seq = READ_ONCE(nft_net->base_seq); +	cb->seq = nft_base_seq(net);  	list_for_each_entry_rcu(table, &nft_net->tables, list) {  		if (family != NFPROTO_UNSPEC && family != table->family) @@ -4042,7 +4045,7 @@ static int nf_tables_getrule_reset(struct sk_buff *skb,  	buf = kasprintf(GFP_ATOMIC, "%.*s:%u",  			nla_len(nla[NFTA_RULE_TABLE]),  			(char *)nla_data(nla[NFTA_RULE_TABLE]), -			nft_net->base_seq); +			nft_base_seq(net));  	audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1,  			AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC);  	kfree(buf); @@ -4879,7 +4882,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,  	nlh = nfnl_msg_put(skb, portid, seq,  			   nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),  			   flags, ctx->family, NFNETLINK_V0, -			   nft_base_seq(ctx->net)); +			   nft_base_seq_be16(ctx->net));  	if (!nlh)  		goto nla_put_failure; @@ -5024,7 +5027,7 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb)  	rcu_read_lock();  	nft_net = nft_pernet(net); -	cb->seq = READ_ONCE(nft_net->base_seq); +	cb->seq = nft_base_seq(net);  	list_for_each_entry_rcu(table, &nft_net->tables, list) {  		if (ctx->family != NFPROTO_UNSPEC && @@ -6201,7 +6204,7 @@ static int nf_tables_dump_set(struct 
sk_buff *skb, struct netlink_callback *cb)  	rcu_read_lock();  	nft_net = nft_pernet(net); -	cb->seq = READ_ONCE(nft_net->base_seq); +	cb->seq = nft_base_seq(net);  	list_for_each_entry_rcu(table, &nft_net->tables, list) {  		if (dump_ctx->ctx.family != NFPROTO_UNSPEC && @@ -6230,7 +6233,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)  	seq    = cb->nlh->nlmsg_seq;  	nlh = nfnl_msg_put(skb, portid, seq, event, NLM_F_MULTI, -			   table->family, NFNETLINK_V0, nft_base_seq(net)); +			   table->family, NFNETLINK_V0, nft_base_seq_be16(net));  	if (!nlh)  		goto nla_put_failure; @@ -6323,7 +6326,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb,  	event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);  	nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family, -			   NFNETLINK_V0, nft_base_seq(ctx->net)); +			   NFNETLINK_V0, nft_base_seq_be16(ctx->net));  	if (!nlh)  		goto nla_put_failure; @@ -6622,7 +6625,7 @@ static int nf_tables_getsetelem_reset(struct sk_buff *skb,  		}  		nelems++;  	} -	audit_log_nft_set_reset(dump_ctx.ctx.table, nft_net->base_seq, nelems); +	audit_log_nft_set_reset(dump_ctx.ctx.table, nft_base_seq(info->net), nelems);  out_unlock:  	rcu_read_unlock(); @@ -8372,7 +8375,7 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net,  	nlh = nfnl_msg_put(skb, portid, seq,  			   nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), -			   flags, family, NFNETLINK_V0, nft_base_seq(net)); +			   flags, family, NFNETLINK_V0, nft_base_seq_be16(net));  	if (!nlh)  		goto nla_put_failure; @@ -8437,7 +8440,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)  	rcu_read_lock();  	nft_net = nft_pernet(net); -	cb->seq = READ_ONCE(nft_net->base_seq); +	cb->seq = nft_base_seq(net);  	list_for_each_entry_rcu(table, &nft_net->tables, list) {  		if (family != NFPROTO_UNSPEC && family != table->family) @@ -8471,7 +8474,7 @@ cont:  			idx++;  		}  		if (ctx->reset && entries) 
-			audit_log_obj_reset(table, nft_net->base_seq, entries); +			audit_log_obj_reset(table, nft_base_seq(net), entries);  		if (rc < 0)  			break;  	} @@ -8640,7 +8643,7 @@ static int nf_tables_getobj_reset(struct sk_buff *skb,  	buf = kasprintf(GFP_ATOMIC, "%.*s:%u",  			nla_len(nla[NFTA_OBJ_TABLE]),  			(char *)nla_data(nla[NFTA_OBJ_TABLE]), -			nft_net->base_seq); +			nft_base_seq(net));  	audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1,  			AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC);  	kfree(buf); @@ -8745,9 +8748,8 @@ void nft_obj_notify(struct net *net, const struct nft_table *table,  		    struct nft_object *obj, u32 portid, u32 seq, int event,  		    u16 flags, int family, int report, gfp_t gfp)  { -	struct nftables_pernet *nft_net = nft_pernet(net);  	char *buf = kasprintf(gfp, "%s:%u", -			      table->name, nft_net->base_seq); +			      table->name, nft_base_seq(net));  	audit_log_nfcfg(buf,  			family, @@ -9433,7 +9435,7 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,  	nlh = nfnl_msg_put(skb, portid, seq,  			   nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), -			   flags, family, NFNETLINK_V0, nft_base_seq(net)); +			   flags, family, NFNETLINK_V0, nft_base_seq_be16(net));  	if (!nlh)  		goto nla_put_failure; @@ -9502,7 +9504,7 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb,  	rcu_read_lock();  	nft_net = nft_pernet(net); -	cb->seq = READ_ONCE(nft_net->base_seq); +	cb->seq = nft_base_seq(net);  	list_for_each_entry_rcu(table, &nft_net->tables, list) {  		if (family != NFPROTO_UNSPEC && family != table->family) @@ -9687,17 +9689,16 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)  static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net,  				   u32 portid, u32 seq)  { -	struct nftables_pernet *nft_net = nft_pernet(net);  	struct nlmsghdr *nlh;  	char buf[TASK_COMM_LEN];  	int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN);  	nlh = nfnl_msg_put(skb, portid, seq, 
event, 0, AF_UNSPEC, -			   NFNETLINK_V0, nft_base_seq(net)); +			   NFNETLINK_V0, nft_base_seq_be16(net));  	if (!nlh)  		goto nla_put_failure; -	if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_net->base_seq)) || +	if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_base_seq(net))) ||  	    nla_put_be32(skb, NFTA_GEN_PROC_PID, htonl(task_pid_nr(current))) ||  	    nla_put_string(skb, NFTA_GEN_PROC_NAME, get_task_comm(buf, current)))  		goto nla_put_failure; @@ -10959,11 +10960,12 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)  	 * Bump generation counter, invalidate any dump in progress.  	 * Cannot fail after this point.  	 */ -	base_seq = READ_ONCE(nft_net->base_seq); +	base_seq = nft_base_seq(net);  	while (++base_seq == 0)  		; -	WRITE_ONCE(nft_net->base_seq, base_seq); +	/* pairs with smp_load_acquire in nft_lookup_eval */ +	smp_store_release(&net->nft.base_seq, base_seq);  	gc_seq = nft_gc_seq_begin(nft_net); @@ -11172,7 +11174,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)  	nft_commit_notify(net, NETLINK_CB(skb).portid);  	nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); -	nf_tables_commit_audit_log(&adl, nft_net->base_seq); +	nf_tables_commit_audit_log(&adl, nft_base_seq(net));  	nft_gc_seq_end(nft_net, gc_seq);  	nft_net->validate_state = NFT_VALIDATE_SKIP; @@ -11497,7 +11499,7 @@ static bool nf_tables_valid_genid(struct net *net, u32 genid)  	mutex_lock(&nft_net->commit_mutex);  	nft_net->tstamp = get_jiffies_64(); -	genid_ok = genid == 0 || nft_net->base_seq == genid; +	genid_ok = genid == 0 || nft_base_seq(net) == genid;  	if (!genid_ok)  		mutex_unlock(&nft_net->commit_mutex); @@ -12134,7 +12136,7 @@ static int __net_init nf_tables_init_net(struct net *net)  	INIT_LIST_HEAD(&nft_net->module_list);  	INIT_LIST_HEAD(&nft_net->notify_list);  	mutex_init(&nft_net->commit_mutex); -	nft_net->base_seq = 1; +	net->nft.base_seq = 1;  	nft_net->gc_seq = 0;  	nft_net->validate_state = NFT_VALIDATE_SKIP;  	
INIT_WORK(&nft_net->destroy_work, nf_tables_trans_destroy_work); diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 40c602ffbcba..58c5b14889c4 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -24,11 +24,11 @@ struct nft_lookup {  	struct nft_set_binding		binding;  }; -#ifdef CONFIG_MITIGATION_RETPOLINE -const struct nft_set_ext * -nft_set_do_lookup(const struct net *net, const struct nft_set *set, -		  const u32 *key) +static const struct nft_set_ext * +__nft_set_do_lookup(const struct net *net, const struct nft_set *set, +		    const u32 *key)  { +#ifdef CONFIG_MITIGATION_RETPOLINE  	if (set->ops == &nft_set_hash_fast_type.ops)  		return nft_hash_lookup_fast(net, set, key);  	if (set->ops == &nft_set_hash_type.ops) @@ -51,10 +51,46 @@ nft_set_do_lookup(const struct net *net, const struct nft_set *set,  		return nft_rbtree_lookup(net, set, key);  	WARN_ON_ONCE(1); +#endif  	return set->ops->lookup(net, set, key);  } + +static unsigned int nft_base_seq(const struct net *net) +{ +	/* pairs with smp_store_release() in nf_tables_commit() */ +	return smp_load_acquire(&net->nft.base_seq); +} + +static bool nft_lookup_should_retry(const struct net *net, unsigned int seq) +{ +	return unlikely(seq != nft_base_seq(net)); +} + +const struct nft_set_ext * +nft_set_do_lookup(const struct net *net, const struct nft_set *set, +		  const u32 *key) +{ +	const struct nft_set_ext *ext; +	unsigned int base_seq; + +	do { +		base_seq = nft_base_seq(net); + +		ext = __nft_set_do_lookup(net, set, key); +		if (ext) +			break; +		/* No match?  There is a small chance that lookup was +		 * performed in the old generation, but nf_tables_commit() +		 * already unlinked a (matching) element. +		 * +		 * We need to repeat the lookup to make sure that we didn't +		 * miss a matching element in the new generation. 
+		 */ +	} while (nft_lookup_should_retry(net, base_seq)); + +	return ext; +}  EXPORT_SYMBOL_GPL(nft_set_do_lookup); -#endif  void nft_lookup_eval(const struct nft_expr *expr,  		     struct nft_regs *regs, diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index c24c922f895d..8d3f040a904a 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -226,7 +226,8 @@ static void nft_bitmap_walk(const struct nft_ctx *ctx,  	const struct nft_bitmap *priv = nft_set_priv(set);  	struct nft_bitmap_elem *be; -	list_for_each_entry_rcu(be, &priv->list, head) { +	list_for_each_entry_rcu(be, &priv->list, head, +				lockdep_is_held(&nft_pernet(ctx->net)->commit_mutex)) {  		if (iter->count < iter->skip)  			goto cont; diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 4b64c3bd8e70..a7b8fa8cab7c 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -549,6 +549,23 @@ static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m,   *   * This function is called from the data path.  It will search for   * an element matching the given key in the current active copy. + * Unlike other set types, this uses NFT_GENMASK_ANY instead of + * nft_genmask_cur(). + * + * This is because new (future) elements are not reachable from + * priv->match, they get added to priv->clone instead. + * When the commit phase flips the generation bitmask, the + * 'now old' entries are skipped but without the 'now current' + * elements becoming visible. Using nft_genmask_cur() thus creates + * inconsistent state: matching old entries get skipped but the + * newly matching entries are unreachable. + * + * GENMASK will still find the 'now old' entries which ensures consistent + * priv->match view. + * + * nft_pipapo_commit swaps ->clone and ->match shortly after the + * genbit flip.  
As ->clone doesn't contain the old entries in the first + * place, lookup will only find the now-current ones.   *   * Return: nftables API extension pointer or NULL if no match.   */ @@ -557,12 +574,11 @@ nft_pipapo_lookup(const struct net *net, const struct nft_set *set,  		  const u32 *key)  {  	struct nft_pipapo *priv = nft_set_priv(set); -	u8 genmask = nft_genmask_cur(net);  	const struct nft_pipapo_match *m;  	const struct nft_pipapo_elem *e;  	m = rcu_dereference(priv->match); -	e = pipapo_get_slow(m, (const u8 *)key, genmask, get_jiffies_64()); +	e = pipapo_get_slow(m, (const u8 *)key, NFT_GENMASK_ANY, get_jiffies_64());  	return e ? &e->ext : NULL;  } diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index 7559306d0aed..27dab3667548 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -1275,7 +1275,6 @@ nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,  		       const u32 *key)  {  	struct nft_pipapo *priv = nft_set_priv(set); -	u8 genmask = nft_genmask_cur(net);  	const struct nft_pipapo_match *m;  	const u8 *rp = (const u8 *)key;  	const struct nft_pipapo_elem *e; @@ -1293,7 +1292,7 @@ nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,  	m = rcu_dereference(priv->match); -	e = pipapo_get_avx2(m, rp, genmask, get_jiffies_64()); +	e = pipapo_get_avx2(m, rp, NFT_GENMASK_ANY, get_jiffies_64());  	local_bh_enable();  	return e ? 
&e->ext : NULL; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index b311b66df3e9..ca594161b840 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -77,7 +77,9 @@ __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,  			    nft_rbtree_interval_end(rbe) &&  			    nft_rbtree_interval_start(interval))  				continue; -			interval = rbe; +			if (nft_set_elem_active(&rbe->ext, genmask) && +			    !nft_rbtree_elem_expired(rbe)) +				interval = rbe;  		} else if (d > 0)  			parent = rcu_dereference_raw(parent->rb_right);  		else { @@ -102,8 +104,6 @@ __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,  	}  	if (set->flags & NFT_SET_INTERVAL && interval != NULL && -	    nft_set_elem_active(&interval->ext, genmask) && -	    !nft_rbtree_elem_expired(interval) &&  	    nft_rbtree_interval_start(interval))  		return &interval->ext; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 104732d34543..978c129c6095 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1836,6 +1836,9 @@ static int genl_bind(struct net *net, int group)  		    !ns_capable(net->user_ns, CAP_SYS_ADMIN))  			ret = -EPERM; +		if (ret) +			break; +  		if (family->bind)  			family->bind(i); diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 73bc39281ef5..9b45fbdc90ca 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -276,8 +276,6 @@ EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue);  static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode)  { -	if (unlikely(current->flags & PF_EXITING)) -		return -EINTR;  	schedule();  	if (signal_pending_state(mode, current))  		return -ERESTARTSYS; diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index c5f7bbf5775f..3aa987e7f072 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -407,9 +407,9 @@ xs_sock_recv_cmsg(struct socket *sock, unsigned int *msg_flags, int flags)  	
iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec, 1,  		      alert_kvec.iov_len);  	ret = sock_recvmsg(sock, &msg, flags); -	if (ret > 0 && -	    tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) { -		iov_iter_revert(&msg.msg_iter, ret); +	if (ret > 0) { +		if (tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) +			iov_iter_revert(&msg.msg_iter, ret);  		ret = xs_sock_process_cmsg(sock, &msg, msg_flags, &u.cmsg,  					   -EAGAIN);  	} diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 89519aa52893..852573423e52 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -7062,7 +7062,8 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,  				u32 seq, int flags,  				struct cfg80211_registered_device *rdev,  				struct net_device *dev, -				const u8 *mac_addr, struct station_info *sinfo) +				const u8 *mac_addr, struct station_info *sinfo, +				bool link_stats)  {  	void *hdr;  	struct nlattr *sinfoattr, *bss_param; @@ -7283,7 +7284,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid,  			goto nla_put_failure;  	} -	if (sinfo->valid_links) { +	if (link_stats && sinfo->valid_links) {  		links = nla_nest_start(msg, NL80211_ATTR_MLO_LINKS);  		if (!links)  			goto nla_put_failure; @@ -7574,7 +7575,7 @@ static int nl80211_dump_station(struct sk_buff *skb,  				NETLINK_CB(cb->skb).portid,  				cb->nlh->nlmsg_seq, NLM_F_MULTI,  				rdev, wdev->netdev, mac_addr, -				&sinfo) < 0) +				&sinfo, false) < 0)  			goto out;  		sta_idx++; @@ -7635,7 +7636,7 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info)  	if (nl80211_send_station(msg, NL80211_CMD_NEW_STATION,  				 info->snd_portid, info->snd_seq, 0, -				 rdev, dev, mac_addr, &sinfo) < 0) { +				 rdev, dev, mac_addr, &sinfo, false) < 0) {  		nlmsg_free(msg);  		return -ENOBUFS;  	} @@ -19680,7 +19681,7 @@ void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr,  		return;  	if 
(nl80211_send_station(msg, NL80211_CMD_NEW_STATION, 0, 0, 0, -				 rdev, dev, mac_addr, sinfo) < 0) { +				 rdev, dev, mac_addr, sinfo, false) < 0) {  		nlmsg_free(msg);  		return;  	} @@ -19710,7 +19711,7 @@ void cfg80211_del_sta_sinfo(struct net_device *dev, const u8 *mac_addr,  	}  	if (nl80211_send_station(msg, NL80211_CMD_DEL_STATION, 0, 0, 0, -				 rdev, dev, mac_addr, sinfo) < 0) { +				 rdev, dev, mac_addr, sinfo, false) < 0) {  		nlmsg_free(msg);  		return;  	} diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 9c3acecc14b1..72e34bd2d925 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -36,6 +36,20 @@  #define TX_BATCH_SIZE 32  #define MAX_PER_SOCKET_BUDGET 32 +struct xsk_addr_node { +	u64 addr; +	struct list_head addr_node; +}; + +struct xsk_addr_head { +	u32 num_descs; +	struct list_head addrs_list; +}; + +static struct kmem_cache *xsk_tx_generic_cache; + +#define XSKCB(skb) ((struct xsk_addr_head *)((skb)->cb)) +  void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)  {  	if (pool->cached_need_wakeup & XDP_WAKEUP_RX) @@ -532,24 +546,43 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)  	return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);  } -static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr) +static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)  {  	unsigned long flags;  	int ret;  	spin_lock_irqsave(&pool->cq_lock, flags); -	ret = xskq_prod_reserve_addr(pool->cq, addr); +	ret = xskq_prod_reserve(pool->cq);  	spin_unlock_irqrestore(&pool->cq_lock, flags);  	return ret;  } -static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n) +static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool, +				      struct sk_buff *skb)  { +	struct xsk_addr_node *pos, *tmp; +	u32 descs_processed = 0;  	unsigned long flags; +	u32 idx;  	spin_lock_irqsave(&pool->cq_lock, flags); -	xskq_prod_submit_n(pool->cq, n); +	idx = xskq_get_prod(pool->cq); + +	xskq_prod_write_addr(pool->cq, idx, +			     
(u64)(uintptr_t)skb_shinfo(skb)->destructor_arg); +	descs_processed++; + +	if (unlikely(XSKCB(skb)->num_descs > 1)) { +		list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) { +			xskq_prod_write_addr(pool->cq, idx + descs_processed, +					     pos->addr); +			descs_processed++; +			list_del(&pos->addr_node); +			kmem_cache_free(xsk_tx_generic_cache, pos); +		} +	} +	xskq_prod_submit_n(pool->cq, descs_processed);  	spin_unlock_irqrestore(&pool->cq_lock, flags);  } @@ -562,9 +595,14 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)  	spin_unlock_irqrestore(&pool->cq_lock, flags);  } +static void xsk_inc_num_desc(struct sk_buff *skb) +{ +	XSKCB(skb)->num_descs++; +} +  static u32 xsk_get_num_desc(struct sk_buff *skb)  { -	return skb ? (long)skb_shinfo(skb)->destructor_arg : 0; +	return XSKCB(skb)->num_descs;  }  static void xsk_destruct_skb(struct sk_buff *skb) @@ -576,23 +614,33 @@ static void xsk_destruct_skb(struct sk_buff *skb)  		*compl->tx_timestamp = ktime_get_tai_fast_ns();  	} -	xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb)); +	xsk_cq_submit_addr_locked(xdp_sk(skb->sk)->pool, skb);  	sock_wfree(skb);  } -static void xsk_set_destructor_arg(struct sk_buff *skb) +static void xsk_set_destructor_arg(struct sk_buff *skb, u64 addr)  { -	long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1; - -	skb_shinfo(skb)->destructor_arg = (void *)num; +	BUILD_BUG_ON(sizeof(struct xsk_addr_head) > sizeof(skb->cb)); +	INIT_LIST_HEAD(&XSKCB(skb)->addrs_list); +	XSKCB(skb)->num_descs = 0; +	skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)addr;  }  static void xsk_consume_skb(struct sk_buff *skb)  {  	struct xdp_sock *xs = xdp_sk(skb->sk); +	u32 num_descs = xsk_get_num_desc(skb); +	struct xsk_addr_node *pos, *tmp; + +	if (unlikely(num_descs > 1)) { +		list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) { +			list_del(&pos->addr_node); +			kmem_cache_free(xsk_tx_generic_cache, pos); +		} +	}  	
skb->destructor = sock_wfree; -	xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb)); +	xsk_cq_cancel_locked(xs->pool, num_descs);  	/* Free skb without triggering the perf drop trace */  	consume_skb(skb);  	xs->skb = NULL; @@ -609,6 +657,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,  {  	struct xsk_buff_pool *pool = xs->pool;  	u32 hr, len, ts, offset, copy, copied; +	struct xsk_addr_node *xsk_addr;  	struct sk_buff *skb = xs->skb;  	struct page *page;  	void *buffer; @@ -623,6 +672,19 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,  			return ERR_PTR(err);  		skb_reserve(skb, hr); + +		xsk_set_destructor_arg(skb, desc->addr); +	} else { +		xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL); +		if (!xsk_addr) +			return ERR_PTR(-ENOMEM); + +		/* in case of -EOVERFLOW that could happen below, +		 * xsk_consume_skb() will release this node as whole skb +		 * would be dropped, which implies freeing all list elements +		 */ +		xsk_addr->addr = desc->addr; +		list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);  	}  	addr = desc->addr; @@ -694,8 +756,11 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,  			err = skb_store_bits(skb, 0, buffer, len);  			if (unlikely(err))  				goto free_err; + +			xsk_set_destructor_arg(skb, desc->addr);  		} else {  			int nr_frags = skb_shinfo(skb)->nr_frags; +			struct xsk_addr_node *xsk_addr;  			struct page *page;  			u8 *vaddr; @@ -710,12 +775,22 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,  				goto free_err;  			} +			xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL); +			if (!xsk_addr) { +				__free_page(page); +				err = -ENOMEM; +				goto free_err; +			} +  			vaddr = kmap_local_page(page);  			memcpy(vaddr, buffer, len);  			kunmap_local(vaddr);  			skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);  			refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc); + +			xsk_addr->addr = desc->addr; +			
list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);  		}  		if (first_frag && desc->options & XDP_TX_METADATA) { @@ -759,7 +834,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,  	skb->mark = READ_ONCE(xs->sk.sk_mark);  	skb->destructor = xsk_destruct_skb;  	xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta); -	xsk_set_destructor_arg(skb); +	xsk_inc_num_desc(skb);  	return skb; @@ -769,7 +844,7 @@ free_err:  	if (err == -EOVERFLOW) {  		/* Drop the packet */ -		xsk_set_destructor_arg(xs->skb); +		xsk_inc_num_desc(xs->skb);  		xsk_drop_skb(xs->skb);  		xskq_cons_release(xs->tx);  	} else { @@ -812,7 +887,7 @@ static int __xsk_generic_xmit(struct sock *sk)  		 * if there is space in it. This avoids having to implement  		 * any buffering in the Tx path.  		 */ -		err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr); +		err = xsk_cq_reserve_locked(xs->pool);  		if (err) {  			err = -EAGAIN;  			goto out; @@ -1815,8 +1890,18 @@ static int __init xsk_init(void)  	if (err)  		goto out_pernet; +	xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache", +						 sizeof(struct xsk_addr_node), +						 0, SLAB_HWCACHE_ALIGN, NULL); +	if (!xsk_tx_generic_cache) { +		err = -ENOMEM; +		goto out_unreg_notif; +	} +  	return 0; +out_unreg_notif: +	unregister_netdevice_notifier(&xsk_netdev_notifier);  out_pernet:  	unregister_pernet_subsys(&xsk_net_ops);  out_sk: diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 46d87e961ad6..f16f390370dc 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -344,6 +344,11 @@ static inline u32 xskq_cons_present_entries(struct xsk_queue *q)  /* Functions for producers */ +static inline u32 xskq_get_prod(struct xsk_queue *q) +{ +	return READ_ONCE(q->ring->producer); +} +  static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max)  {  	u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons); @@ -390,6 +395,13 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 
addr)  	return 0;  } +static inline void xskq_prod_write_addr(struct xsk_queue *q, u32 idx, u64 addr) +{ +	struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + +	ring->desc[idx & q->ring_mask] = addr; +} +  static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs,  					      u32 nb_entries)  { | 
