summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>2025-09-19 16:37:39 +0200
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2025-09-19 16:37:39 +0200
commit362f34fba309536598a02bc0541ccaeae2bc9052 (patch)
tree909959d943cf9f3afe80ba67333359828660e27c /net
parente4c42304a8cd3b7d0b9755e80dd6835c20b8e0dc (diff)
parent62dae019823123ce3baa50e680219e2beb9a63a5 (diff)
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Diffstat (limited to 'net')
-rw-r--r--net/bluetooth/hci_conn.c14
-rw-r--r--net/bluetooth/hci_event.c7
-rw-r--r--net/bluetooth/iso.c2
-rw-r--r--net/bridge/br.c7
-rw-r--r--net/can/j1939/bus.c5
-rw-r--r--net/can/j1939/j1939-priv.h1
-rw-r--r--net/can/j1939/main.c3
-rw-r--r--net/can/j1939/socket.c52
-rw-r--r--net/ceph/messenger.c7
-rw-r--r--net/core/dev_ioctl.c22
-rw-r--r--net/hsr/hsr_device.c28
-rw-r--r--net/hsr/hsr_main.c4
-rw-r--r--net/hsr/hsr_main.h3
-rw-r--r--net/ipv4/ip_tunnel_core.c6
-rw-r--r--net/ipv4/tcp_bpf.c5
-rw-r--r--net/mptcp/sockopt.c11
-rw-r--r--net/netfilter/nf_tables_api.c123
-rw-r--r--net/netfilter/nft_dynset.c5
-rw-r--r--net/netfilter/nft_lookup.c67
-rw-r--r--net/netfilter/nft_objref.c5
-rw-r--r--net/netfilter/nft_set_bitmap.c14
-rw-r--r--net/netfilter/nft_set_hash.c54
-rw-r--r--net/netfilter/nft_set_pipapo.c214
-rw-r--r--net/netfilter/nft_set_pipapo_avx2.c27
-rw-r--r--net/netfilter/nft_set_rbtree.c46
-rw-r--r--net/netlink/genetlink.c3
-rw-r--r--net/sunrpc/sched.c2
-rw-r--r--net/sunrpc/xprtsock.c6
-rw-r--r--net/xdp/xsk.c113
-rw-r--r--net/xdp/xsk_queue.h12
30 files changed, 547 insertions, 321 deletions
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index ad5574e9a93e..ce17e489c67c 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -829,7 +829,17 @@ static void bis_cleanup(struct hci_conn *conn)
/* Check if ISO connection is a BIS and terminate advertising
* set and BIG if there are no other connections using it.
*/
- bis = hci_conn_hash_lookup_big(hdev, conn->iso_qos.bcast.big);
+ bis = hci_conn_hash_lookup_big_state(hdev,
+ conn->iso_qos.bcast.big,
+ BT_CONNECTED,
+ HCI_ROLE_MASTER);
+ if (bis)
+ return;
+
+ bis = hci_conn_hash_lookup_big_state(hdev,
+ conn->iso_qos.bcast.big,
+ BT_CONNECT,
+ HCI_ROLE_MASTER);
if (bis)
return;
@@ -2274,7 +2284,7 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst,
* the start periodic advertising and create BIG commands have
* been queued
*/
- hci_conn_hash_list_state(hdev, bis_mark_per_adv, PA_LINK,
+ hci_conn_hash_list_state(hdev, bis_mark_per_adv, BIS_LINK,
BT_BOUND, &data);
/* Queue start periodic advertising and create BIG */
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 0ffdbe249f5d..090c7ffa5152 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -6973,9 +6973,14 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data,
continue;
}
- if (ev->status != 0x42)
+ if (ev->status != 0x42) {
/* Mark PA sync as established */
set_bit(HCI_CONN_PA_SYNC, &bis->flags);
+ /* Reset cleanup callback of PA Sync so it doesn't
+ * terminate the sync when deleting the connection.
+ */
+ conn->cleanup = NULL;
+ }
bis->sync_handle = conn->sync_handle;
bis->iso_qos.bcast.big = ev->handle;
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index 14a4215352d5..c21566e1494a 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -1347,7 +1347,7 @@ static int iso_sock_getname(struct socket *sock, struct sockaddr *addr,
bacpy(&sa->iso_bdaddr, &iso_pi(sk)->dst);
sa->iso_bdaddr_type = iso_pi(sk)->dst_type;
- if (hcon && hcon->type == BIS_LINK) {
+ if (hcon && (hcon->type == BIS_LINK || hcon->type == PA_LINK)) {
sa->iso_bc->bc_sid = iso_pi(sk)->bc_sid;
sa->iso_bc->bc_num_bis = iso_pi(sk)->bc_num_bis;
memcpy(sa->iso_bc->bc_bis, iso_pi(sk)->bc_bis,
diff --git a/net/bridge/br.c b/net/bridge/br.c
index 0adeafe11a36..ad2d8f59fc7b 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -324,6 +324,13 @@ int br_boolopt_multi_toggle(struct net_bridge *br,
int err = 0;
int opt_id;
+ opt_id = find_next_bit(&bitmap, BITS_PER_LONG, BR_BOOLOPT_MAX);
+ if (opt_id != BITS_PER_LONG) {
+ NL_SET_ERR_MSG_FMT_MOD(extack, "Unknown boolean option %d",
+ opt_id);
+ return -EINVAL;
+ }
+
for_each_set_bit(opt_id, &bitmap, BR_BOOLOPT_MAX) {
bool on = !!(bm->optval & BIT(opt_id));
diff --git a/net/can/j1939/bus.c b/net/can/j1939/bus.c
index 39844f14eed8..797719cb227e 100644
--- a/net/can/j1939/bus.c
+++ b/net/can/j1939/bus.c
@@ -290,8 +290,11 @@ int j1939_local_ecu_get(struct j1939_priv *priv, name_t name, u8 sa)
if (!ecu)
ecu = j1939_ecu_create_locked(priv, name);
err = PTR_ERR_OR_ZERO(ecu);
- if (err)
+ if (err) {
+ if (j1939_address_is_unicast(sa))
+ priv->ents[sa].nusers--;
goto done;
+ }
ecu->nusers++;
/* TODO: do we care if ecu->addr != sa? */
diff --git a/net/can/j1939/j1939-priv.h b/net/can/j1939/j1939-priv.h
index 31a93cae5111..81f58924b4ac 100644
--- a/net/can/j1939/j1939-priv.h
+++ b/net/can/j1939/j1939-priv.h
@@ -212,6 +212,7 @@ void j1939_priv_get(struct j1939_priv *priv);
/* notify/alert all j1939 sockets bound to ifindex */
void j1939_sk_netdev_event_netdown(struct j1939_priv *priv);
+void j1939_sk_netdev_event_unregister(struct j1939_priv *priv);
int j1939_cancel_active_session(struct j1939_priv *priv, struct sock *sk);
void j1939_tp_init(struct j1939_priv *priv);
diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c
index 7e8a20f2fc42..3706a872ecaf 100644
--- a/net/can/j1939/main.c
+++ b/net/can/j1939/main.c
@@ -377,6 +377,9 @@ static int j1939_netdev_notify(struct notifier_block *nb,
j1939_sk_netdev_event_netdown(priv);
j1939_ecu_unmap_all(priv);
break;
+ case NETDEV_UNREGISTER:
+ j1939_sk_netdev_event_unregister(priv);
+ break;
}
j1939_priv_put(priv);
diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c
index 6fefe7a68761..785b883a1319 100644
--- a/net/can/j1939/socket.c
+++ b/net/can/j1939/socket.c
@@ -520,6 +520,9 @@ static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len)
ret = j1939_local_ecu_get(priv, jsk->addr.src_name, jsk->addr.sa);
if (ret) {
j1939_netdev_stop(priv);
+ jsk->priv = NULL;
+ synchronize_rcu();
+ j1939_priv_put(priv);
goto out_release_sock;
}
@@ -1299,6 +1302,55 @@ void j1939_sk_netdev_event_netdown(struct j1939_priv *priv)
read_unlock_bh(&priv->j1939_socks_lock);
}
+void j1939_sk_netdev_event_unregister(struct j1939_priv *priv)
+{
+ struct sock *sk;
+ struct j1939_sock *jsk;
+ bool wait_rcu = false;
+
+rescan: /* The caller is holding a ref on this "priv" via j1939_priv_get_by_ndev(). */
+ read_lock_bh(&priv->j1939_socks_lock);
+ list_for_each_entry(jsk, &priv->j1939_socks, list) {
+ /* Skip if j1939_jsk_add() is not called on this socket. */
+ if (!(jsk->state & J1939_SOCK_BOUND))
+ continue;
+ sk = &jsk->sk;
+ sock_hold(sk);
+ read_unlock_bh(&priv->j1939_socks_lock);
+ /* Check if j1939_jsk_del() is not yet called on this socket after holding
+ * socket's lock, for both j1939_sk_bind() and j1939_sk_release() call
+ * j1939_jsk_del() with socket's lock held.
+ */
+ lock_sock(sk);
+ if (jsk->state & J1939_SOCK_BOUND) {
+ /* Neither j1939_sk_bind() nor j1939_sk_release() called j1939_jsk_del().
+ * Make this socket no longer bound, by pretending as if j1939_sk_bind()
+ * dropped old references but did not get new references.
+ */
+ j1939_jsk_del(priv, jsk);
+ j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa);
+ j1939_netdev_stop(priv);
+ /* Call j1939_priv_put() now and prevent j1939_sk_sock_destruct() from
+ * calling the corresponding j1939_priv_put().
+ *
+ * j1939_sk_sock_destruct() is supposed to call j1939_priv_put() after
+ * an RCU grace period. But since the caller is holding a ref on this
+ * "priv", we can defer synchronize_rcu() until immediately before
+ * the caller calls j1939_priv_put().
+ */
+ j1939_priv_put(priv);
+ jsk->priv = NULL;
+ wait_rcu = true;
+ }
+ release_sock(sk);
+ sock_put(sk);
+ goto rescan;
+ }
+ read_unlock_bh(&priv->j1939_socks_lock);
+ if (wait_rcu)
+ synchronize_rcu();
+}
+
static int j1939_sk_no_ioctlcmd(struct socket *sock, unsigned int cmd,
unsigned long arg)
{
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index d1b5705dc0c6..9f6d860411cb 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1524,7 +1524,7 @@ static void con_fault_finish(struct ceph_connection *con)
* in case we faulted due to authentication, invalidate our
* current tickets so that we can get new ones.
*/
- if (con->v1.auth_retry) {
+ if (!ceph_msgr2(from_msgr(con->msgr)) && con->v1.auth_retry) {
dout("auth_retry %d, invalidating\n", con->v1.auth_retry);
if (con->ops->invalidate_authorizer)
con->ops->invalidate_authorizer(con);
@@ -1714,9 +1714,10 @@ static void clear_standby(struct ceph_connection *con)
{
/* come back from STANDBY? */
if (con->state == CEPH_CON_S_STANDBY) {
- dout("clear_standby %p and ++connect_seq\n", con);
+ dout("clear_standby %p\n", con);
con->state = CEPH_CON_S_PREOPEN;
- con->v1.connect_seq++;
+ if (!ceph_msgr2(from_msgr(con->msgr)))
+ con->v1.connect_seq++;
WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING));
WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING));
}
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 616479e71466..9447065d01af 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -464,8 +464,15 @@ int generic_hwtstamp_get_lower(struct net_device *dev,
if (!netif_device_present(dev))
return -ENODEV;
- if (ops->ndo_hwtstamp_get)
- return dev_get_hwtstamp_phylib(dev, kernel_cfg);
+ if (ops->ndo_hwtstamp_get) {
+ int err;
+
+ netdev_lock_ops(dev);
+ err = dev_get_hwtstamp_phylib(dev, kernel_cfg);
+ netdev_unlock_ops(dev);
+
+ return err;
+ }
/* Legacy path: unconverted lower driver */
return generic_hwtstamp_ioctl_lower(dev, SIOCGHWTSTAMP, kernel_cfg);
@@ -481,8 +488,15 @@ int generic_hwtstamp_set_lower(struct net_device *dev,
if (!netif_device_present(dev))
return -ENODEV;
- if (ops->ndo_hwtstamp_set)
- return dev_set_hwtstamp_phylib(dev, kernel_cfg, extack);
+ if (ops->ndo_hwtstamp_set) {
+ int err;
+
+ netdev_lock_ops(dev);
+ err = dev_set_hwtstamp_phylib(dev, kernel_cfg, extack);
+ netdev_unlock_ops(dev);
+
+ return err;
+ }
/* Legacy path: unconverted lower driver */
return generic_hwtstamp_ioctl_lower(dev, SIOCSHWTSTAMP, kernel_cfg);
diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
index 88657255fec1..fbbc3ccf9df6 100644
--- a/net/hsr/hsr_device.c
+++ b/net/hsr/hsr_device.c
@@ -49,7 +49,7 @@ static bool hsr_check_carrier(struct hsr_port *master)
ASSERT_RTNL();
- hsr_for_each_port(master->hsr, port) {
+ hsr_for_each_port_rtnl(master->hsr, port) {
if (port->type != HSR_PT_MASTER && is_slave_up(port->dev)) {
netif_carrier_on(master->dev);
return true;
@@ -105,7 +105,7 @@ int hsr_get_max_mtu(struct hsr_priv *hsr)
struct hsr_port *port;
mtu_max = ETH_DATA_LEN;
- hsr_for_each_port(hsr, port)
+ hsr_for_each_port_rtnl(hsr, port)
if (port->type != HSR_PT_MASTER)
mtu_max = min(port->dev->mtu, mtu_max);
@@ -139,7 +139,7 @@ static int hsr_dev_open(struct net_device *dev)
hsr = netdev_priv(dev);
- hsr_for_each_port(hsr, port) {
+ hsr_for_each_port_rtnl(hsr, port) {
if (port->type == HSR_PT_MASTER)
continue;
switch (port->type) {
@@ -172,7 +172,7 @@ static int hsr_dev_close(struct net_device *dev)
struct hsr_priv *hsr;
hsr = netdev_priv(dev);
- hsr_for_each_port(hsr, port) {
+ hsr_for_each_port_rtnl(hsr, port) {
if (port->type == HSR_PT_MASTER)
continue;
switch (port->type) {
@@ -205,7 +205,7 @@ static netdev_features_t hsr_features_recompute(struct hsr_priv *hsr,
* may become enabled.
*/
features &= ~NETIF_F_ONE_FOR_ALL;
- hsr_for_each_port(hsr, port)
+ hsr_for_each_port_rtnl(hsr, port)
features = netdev_increment_features(features,
port->dev->features,
mask);
@@ -226,6 +226,7 @@ static netdev_tx_t hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev)
struct hsr_priv *hsr = netdev_priv(dev);
struct hsr_port *master;
+ rcu_read_lock();
master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
if (master) {
skb->dev = master->dev;
@@ -238,6 +239,8 @@ static netdev_tx_t hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev)
dev_core_stats_tx_dropped_inc(dev);
dev_kfree_skb_any(skb);
}
+ rcu_read_unlock();
+
return NETDEV_TX_OK;
}
@@ -484,7 +487,7 @@ static void hsr_set_rx_mode(struct net_device *dev)
hsr = netdev_priv(dev);
- hsr_for_each_port(hsr, port) {
+ hsr_for_each_port_rtnl(hsr, port) {
if (port->type == HSR_PT_MASTER)
continue;
switch (port->type) {
@@ -506,7 +509,7 @@ static void hsr_change_rx_flags(struct net_device *dev, int change)
hsr = netdev_priv(dev);
- hsr_for_each_port(hsr, port) {
+ hsr_for_each_port_rtnl(hsr, port) {
if (port->type == HSR_PT_MASTER)
continue;
switch (port->type) {
@@ -534,7 +537,7 @@ static int hsr_ndo_vlan_rx_add_vid(struct net_device *dev,
hsr = netdev_priv(dev);
- hsr_for_each_port(hsr, port) {
+ hsr_for_each_port_rtnl(hsr, port) {
if (port->type == HSR_PT_MASTER ||
port->type == HSR_PT_INTERLINK)
continue;
@@ -580,7 +583,7 @@ static int hsr_ndo_vlan_rx_kill_vid(struct net_device *dev,
hsr = netdev_priv(dev);
- hsr_for_each_port(hsr, port) {
+ hsr_for_each_port_rtnl(hsr, port) {
switch (port->type) {
case HSR_PT_SLAVE_A:
case HSR_PT_SLAVE_B:
@@ -672,9 +675,14 @@ struct net_device *hsr_get_port_ndev(struct net_device *ndev,
struct hsr_priv *hsr = netdev_priv(ndev);
struct hsr_port *port;
+ rcu_read_lock();
hsr_for_each_port(hsr, port)
- if (port->type == pt)
+ if (port->type == pt) {
+ dev_hold(port->dev);
+ rcu_read_unlock();
return port->dev;
+ }
+ rcu_read_unlock();
return NULL;
}
EXPORT_SYMBOL(hsr_get_port_ndev);
diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c
index 192893c3f2ec..bc94b07101d8 100644
--- a/net/hsr/hsr_main.c
+++ b/net/hsr/hsr_main.c
@@ -22,7 +22,7 @@ static bool hsr_slave_empty(struct hsr_priv *hsr)
{
struct hsr_port *port;
- hsr_for_each_port(hsr, port)
+ hsr_for_each_port_rtnl(hsr, port)
if (port->type != HSR_PT_MASTER)
return false;
return true;
@@ -134,7 +134,7 @@ struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt)
{
struct hsr_port *port;
- hsr_for_each_port(hsr, port)
+ hsr_for_each_port_rtnl(hsr, port)
if (port->type == pt)
return port;
return NULL;
diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h
index 135ec5fce019..33b0d2460c9b 100644
--- a/net/hsr/hsr_main.h
+++ b/net/hsr/hsr_main.h
@@ -224,6 +224,9 @@ struct hsr_priv {
#define hsr_for_each_port(hsr, port) \
list_for_each_entry_rcu((port), &(hsr)->ports, port_list)
+#define hsr_for_each_port_rtnl(hsr, port) \
+ list_for_each_entry_rcu((port), &(hsr)->ports, port_list, lockdep_rtnl_is_held())
+
struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt);
/* Caller must ensure skb is a valid HSR frame */
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index f65d2f727381..8392d304a72e 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -204,6 +204,9 @@ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu)
if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr)))
return -EINVAL;
+ if (skb_is_gso(skb))
+ skb_gso_reset(skb);
+
skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);
pskb_pull(skb, ETH_HLEN);
skb_reset_network_header(skb);
@@ -298,6 +301,9 @@ static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu)
if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr)))
return -EINVAL;
+ if (skb_is_gso(skb))
+ skb_gso_reset(skb);
+
skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);
pskb_pull(skb, ETH_HLEN);
skb_reset_network_header(skb);
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index ba581785adb4..a268e1595b22 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -408,8 +408,11 @@ more_data:
if (!psock->cork) {
psock->cork = kzalloc(sizeof(*psock->cork),
GFP_ATOMIC | __GFP_NOWARN);
- if (!psock->cork)
+ if (!psock->cork) {
+ sk_msg_free(sk, msg);
+ *copied = 0;
return -ENOMEM;
+ }
}
memcpy(psock->cork, msg, sizeof(*msg));
return 0;
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index 3caa0a9d3b38..25d2b65653cd 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -1508,13 +1508,12 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk)
{
static const unsigned int tx_rx_locks = SOCK_RCVBUF_LOCK | SOCK_SNDBUF_LOCK;
struct sock *sk = (struct sock *)msk;
+ bool keep_open;
- if (ssk->sk_prot->keepalive) {
- if (sock_flag(sk, SOCK_KEEPOPEN))
- ssk->sk_prot->keepalive(ssk, 1);
- else
- ssk->sk_prot->keepalive(ssk, 0);
- }
+ keep_open = sock_flag(sk, SOCK_KEEPOPEN);
+ if (ssk->sk_prot->keepalive)
+ ssk->sk_prot->keepalive(ssk, keep_open);
+ sock_valbool_flag(ssk, SOCK_KEEPOPEN, keep_open);
ssk->sk_priority = sk->sk_priority;
ssk->sk_bound_dev_if = sk->sk_bound_dev_if;
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 0e86434ca13b..cde63e5f18d8 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -1131,11 +1131,14 @@ nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla,
return ERR_PTR(-ENOENT);
}
-static __be16 nft_base_seq(const struct net *net)
+static unsigned int nft_base_seq(const struct net *net)
{
- struct nftables_pernet *nft_net = nft_pernet(net);
+ return READ_ONCE(net->nft.base_seq);
+}
- return htons(nft_net->base_seq & 0xffff);
+static __be16 nft_base_seq_be16(const struct net *net)
+{
+ return htons(nft_base_seq(net) & 0xffff);
}
static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = {
@@ -1153,9 +1156,9 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net,
{
struct nlmsghdr *nlh;
- event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
- nlh = nfnl_msg_put(skb, portid, seq, event, flags, family,
- NFNETLINK_V0, nft_base_seq(net));
+ nlh = nfnl_msg_put(skb, portid, seq,
+ nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
+ flags, family, NFNETLINK_V0, nft_base_seq_be16(net));
if (!nlh)
goto nla_put_failure;
@@ -1165,6 +1168,12 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net,
NFTA_TABLE_PAD))
goto nla_put_failure;
+ if (event == NFT_MSG_DELTABLE ||
+ event == NFT_MSG_DESTROYTABLE) {
+ nlmsg_end(skb, nlh);
+ return 0;
+ }
+
if (nla_put_be32(skb, NFTA_TABLE_FLAGS,
htonl(table->flags & NFT_TABLE_F_MASK)))
goto nla_put_failure;
@@ -1242,7 +1251,7 @@ static int nf_tables_dump_tables(struct sk_buff *skb,
rcu_read_lock();
nft_net = nft_pernet(net);
- cb->seq = READ_ONCE(nft_net->base_seq);
+ cb->seq = nft_base_seq(net);
list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
@@ -2022,9 +2031,9 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
{
struct nlmsghdr *nlh;
- event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
- nlh = nfnl_msg_put(skb, portid, seq, event, flags, family,
- NFNETLINK_V0, nft_base_seq(net));
+ nlh = nfnl_msg_put(skb, portid, seq,
+ nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
+ flags, family, NFNETLINK_V0, nft_base_seq_be16(net));
if (!nlh)
goto nla_put_failure;
@@ -2034,6 +2043,13 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net,
NFTA_CHAIN_PAD))
goto nla_put_failure;
+ if (!hook_list &&
+ (event == NFT_MSG_DELCHAIN ||
+ event == NFT_MSG_DESTROYCHAIN)) {
+ nlmsg_end(skb, nlh);
+ return 0;
+ }
+
if (nft_is_base_chain(chain)) {
const struct nft_base_chain *basechain = nft_base_chain(chain);
struct nft_stats __percpu *stats;
@@ -2120,7 +2136,7 @@ static int nf_tables_dump_chains(struct sk_buff *skb,
rcu_read_lock();
nft_net = nft_pernet(net);
- cb->seq = READ_ONCE(nft_net->base_seq);
+ cb->seq = nft_base_seq(net);
list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
@@ -3658,7 +3674,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net,
u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
nlh = nfnl_msg_put(skb, portid, seq, type, flags, family, NFNETLINK_V0,
- nft_base_seq(net));
+ nft_base_seq_be16(net));
if (!nlh)
goto nla_put_failure;
@@ -3826,7 +3842,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb,
rcu_read_lock();
nft_net = nft_pernet(net);
- cb->seq = READ_ONCE(nft_net->base_seq);
+ cb->seq = nft_base_seq(net);
list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
@@ -4037,7 +4053,7 @@ static int nf_tables_getrule_reset(struct sk_buff *skb,
buf = kasprintf(GFP_ATOMIC, "%.*s:%u",
nla_len(nla[NFTA_RULE_TABLE]),
(char *)nla_data(nla[NFTA_RULE_TABLE]),
- nft_net->base_seq);
+ nft_base_seq(net));
audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1,
AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC);
kfree(buf);
@@ -4871,9 +4887,10 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
u32 seq = ctx->seq;
int i;
- event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
- nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family,
- NFNETLINK_V0, nft_base_seq(ctx->net));
+ nlh = nfnl_msg_put(skb, portid, seq,
+ nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
+ flags, ctx->family, NFNETLINK_V0,
+ nft_base_seq_be16(ctx->net));
if (!nlh)
goto nla_put_failure;
@@ -4885,6 +4902,12 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
NFTA_SET_PAD))
goto nla_put_failure;
+ if (event == NFT_MSG_DELSET ||
+ event == NFT_MSG_DESTROYSET) {
+ nlmsg_end(skb, nlh);
+ return 0;
+ }
+
if (set->flags != 0)
if (nla_put_be32(skb, NFTA_SET_FLAGS, htonl(set->flags)))
goto nla_put_failure;
@@ -5012,7 +5035,7 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb)
rcu_read_lock();
nft_net = nft_pernet(net);
- cb->seq = READ_ONCE(nft_net->base_seq);
+ cb->seq = nft_base_seq(net);
list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (ctx->family != NFPROTO_UNSPEC &&
@@ -6189,7 +6212,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
rcu_read_lock();
nft_net = nft_pernet(net);
- cb->seq = READ_ONCE(nft_net->base_seq);
+ cb->seq = nft_base_seq(net);
list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (dump_ctx->ctx.family != NFPROTO_UNSPEC &&
@@ -6218,7 +6241,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
seq = cb->nlh->nlmsg_seq;
nlh = nfnl_msg_put(skb, portid, seq, event, NLM_F_MULTI,
- table->family, NFNETLINK_V0, nft_base_seq(net));
+ table->family, NFNETLINK_V0, nft_base_seq_be16(net));
if (!nlh)
goto nla_put_failure;
@@ -6311,7 +6334,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb,
event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family,
- NFNETLINK_V0, nft_base_seq(ctx->net));
+ NFNETLINK_V0, nft_base_seq_be16(ctx->net));
if (!nlh)
goto nla_put_failure;
@@ -6610,7 +6633,7 @@ static int nf_tables_getsetelem_reset(struct sk_buff *skb,
}
nelems++;
}
- audit_log_nft_set_reset(dump_ctx.ctx.table, nft_net->base_seq, nelems);
+ audit_log_nft_set_reset(dump_ctx.ctx.table, nft_base_seq(info->net), nelems);
out_unlock:
rcu_read_unlock();
@@ -8359,20 +8382,26 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net,
{
struct nlmsghdr *nlh;
- event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
- nlh = nfnl_msg_put(skb, portid, seq, event, flags, family,
- NFNETLINK_V0, nft_base_seq(net));
+ nlh = nfnl_msg_put(skb, portid, seq,
+ nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
+ flags, family, NFNETLINK_V0, nft_base_seq_be16(net));
if (!nlh)
goto nla_put_failure;
if (nla_put_string(skb, NFTA_OBJ_TABLE, table->name) ||
nla_put_string(skb, NFTA_OBJ_NAME, obj->key.name) ||
+ nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->ops->type->type)) ||
nla_put_be64(skb, NFTA_OBJ_HANDLE, cpu_to_be64(obj->handle),
NFTA_OBJ_PAD))
goto nla_put_failure;
- if (nla_put_be32(skb, NFTA_OBJ_TYPE, htonl(obj->ops->type->type)) ||
- nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) ||
+ if (event == NFT_MSG_DELOBJ ||
+ event == NFT_MSG_DESTROYOBJ) {
+ nlmsg_end(skb, nlh);
+ return 0;
+ }
+
+ if (nla_put_be32(skb, NFTA_OBJ_USE, htonl(obj->use)) ||
nft_object_dump(skb, NFTA_OBJ_DATA, obj, reset))
goto nla_put_failure;
@@ -8420,7 +8449,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb)
rcu_read_lock();
nft_net = nft_pernet(net);
- cb->seq = READ_ONCE(nft_net->base_seq);
+ cb->seq = nft_base_seq(net);
list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
@@ -8454,7 +8483,7 @@ cont:
idx++;
}
if (ctx->reset && entries)
- audit_log_obj_reset(table, nft_net->base_seq, entries);
+ audit_log_obj_reset(table, nft_base_seq(net), entries);
if (rc < 0)
break;
}
@@ -8623,7 +8652,7 @@ static int nf_tables_getobj_reset(struct sk_buff *skb,
buf = kasprintf(GFP_ATOMIC, "%.*s:%u",
nla_len(nla[NFTA_OBJ_TABLE]),
(char *)nla_data(nla[NFTA_OBJ_TABLE]),
- nft_net->base_seq);
+ nft_base_seq(net));
audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1,
AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC);
kfree(buf);
@@ -8728,9 +8757,8 @@ void nft_obj_notify(struct net *net, const struct nft_table *table,
struct nft_object *obj, u32 portid, u32 seq, int event,
u16 flags, int family, int report, gfp_t gfp)
{
- struct nftables_pernet *nft_net = nft_pernet(net);
char *buf = kasprintf(gfp, "%s:%u",
- table->name, nft_net->base_seq);
+ table->name, nft_base_seq(net));
audit_log_nfcfg(buf,
family,
@@ -9413,9 +9441,9 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
struct nft_hook *hook;
struct nlmsghdr *nlh;
- event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event);
- nlh = nfnl_msg_put(skb, portid, seq, event, flags, family,
- NFNETLINK_V0, nft_base_seq(net));
+ nlh = nfnl_msg_put(skb, portid, seq,
+ nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event),
+ flags, family, NFNETLINK_V0, nft_base_seq_be16(net));
if (!nlh)
goto nla_put_failure;
@@ -9425,6 +9453,13 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
NFTA_FLOWTABLE_PAD))
goto nla_put_failure;
+ if (!hook_list &&
+ (event == NFT_MSG_DELFLOWTABLE ||
+ event == NFT_MSG_DESTROYFLOWTABLE)) {
+ nlmsg_end(skb, nlh);
+ return 0;
+ }
+
if (nla_put_be32(skb, NFTA_FLOWTABLE_USE, htonl(flowtable->use)) ||
nla_put_be32(skb, NFTA_FLOWTABLE_FLAGS, htonl(flowtable->data.flags)))
goto nla_put_failure;
@@ -9477,7 +9512,7 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb,
rcu_read_lock();
nft_net = nft_pernet(net);
- cb->seq = READ_ONCE(nft_net->base_seq);
+ cb->seq = nft_base_seq(net);
list_for_each_entry_rcu(table, &nft_net->tables, list) {
if (family != NFPROTO_UNSPEC && family != table->family)
@@ -9662,17 +9697,16 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net,
u32 portid, u32 seq)
{
- struct nftables_pernet *nft_net = nft_pernet(net);
struct nlmsghdr *nlh;
char buf[TASK_COMM_LEN];
int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN);
nlh = nfnl_msg_put(skb, portid, seq, event, 0, AF_UNSPEC,
- NFNETLINK_V0, nft_base_seq(net));
+ NFNETLINK_V0, nft_base_seq_be16(net));
if (!nlh)
goto nla_put_failure;
- if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_net->base_seq)) ||
+ if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_base_seq(net))) ||
nla_put_be32(skb, NFTA_GEN_PROC_PID, htonl(task_pid_nr(current))) ||
nla_put_string(skb, NFTA_GEN_PROC_NAME, get_task_comm(buf, current)))
goto nla_put_failure;
@@ -10933,11 +10967,12 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
* Bump generation counter, invalidate any dump in progress.
* Cannot fail after this point.
*/
- base_seq = READ_ONCE(nft_net->base_seq);
+ base_seq = nft_base_seq(net);
while (++base_seq == 0)
;
- WRITE_ONCE(nft_net->base_seq, base_seq);
+ /* pairs with smp_load_acquire in nft_lookup_eval */
+ smp_store_release(&net->nft.base_seq, base_seq);
gc_seq = nft_gc_seq_begin(nft_net);
@@ -11146,7 +11181,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
nft_commit_notify(net, NETLINK_CB(skb).portid);
nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN);
- nf_tables_commit_audit_log(&adl, nft_net->base_seq);
+ nf_tables_commit_audit_log(&adl, nft_base_seq(net));
nft_gc_seq_end(nft_net, gc_seq);
nft_net->validate_state = NFT_VALIDATE_SKIP;
@@ -11471,7 +11506,7 @@ static bool nf_tables_valid_genid(struct net *net, u32 genid)
mutex_lock(&nft_net->commit_mutex);
nft_net->tstamp = get_jiffies_64();
- genid_ok = genid == 0 || nft_net->base_seq == genid;
+ genid_ok = genid == 0 || nft_base_seq(net) == genid;
if (!genid_ok)
mutex_unlock(&nft_net->commit_mutex);
@@ -12108,7 +12143,7 @@ static int __net_init nf_tables_init_net(struct net *net)
INIT_LIST_HEAD(&nft_net->module_list);
INIT_LIST_HEAD(&nft_net->notify_list);
mutex_init(&nft_net->commit_mutex);
- nft_net->base_seq = 1;
+ net->nft.base_seq = 1;
nft_net->gc_seq = 0;
nft_net->validate_state = NFT_VALIDATE_SKIP;
INIT_WORK(&nft_net->destroy_work, nf_tables_trans_destroy_work);
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 88922e0e8e83..e24493d9e776 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -91,8 +91,9 @@ void nft_dynset_eval(const struct nft_expr *expr,
return;
}
- if (set->ops->update(set, &regs->data[priv->sreg_key], nft_dynset_new,
- expr, regs, &ext)) {
+ ext = set->ops->update(set, &regs->data[priv->sreg_key], nft_dynset_new,
+ expr, regs);
+ if (ext) {
if (priv->op == NFT_DYNSET_OP_UPDATE &&
nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) &&
READ_ONCE(nft_set_ext_timeout(ext)->timeout) != 0) {
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index 63ef832b8aa7..58c5b14889c4 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -24,36 +24,73 @@ struct nft_lookup {
struct nft_set_binding binding;
};
-#ifdef CONFIG_MITIGATION_RETPOLINE
-bool nft_set_do_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+static const struct nft_set_ext *
+__nft_set_do_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
{
+#ifdef CONFIG_MITIGATION_RETPOLINE
if (set->ops == &nft_set_hash_fast_type.ops)
- return nft_hash_lookup_fast(net, set, key, ext);
+ return nft_hash_lookup_fast(net, set, key);
if (set->ops == &nft_set_hash_type.ops)
- return nft_hash_lookup(net, set, key, ext);
+ return nft_hash_lookup(net, set, key);
if (set->ops == &nft_set_rhash_type.ops)
- return nft_rhash_lookup(net, set, key, ext);
+ return nft_rhash_lookup(net, set, key);
if (set->ops == &nft_set_bitmap_type.ops)
- return nft_bitmap_lookup(net, set, key, ext);
+ return nft_bitmap_lookup(net, set, key);
if (set->ops == &nft_set_pipapo_type.ops)
- return nft_pipapo_lookup(net, set, key, ext);
+ return nft_pipapo_lookup(net, set, key);
#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
if (set->ops == &nft_set_pipapo_avx2_type.ops)
- return nft_pipapo_avx2_lookup(net, set, key, ext);
+ return nft_pipapo_avx2_lookup(net, set, key);
#endif
if (set->ops == &nft_set_rbtree_type.ops)
- return nft_rbtree_lookup(net, set, key, ext);
+ return nft_rbtree_lookup(net, set, key);
WARN_ON_ONCE(1);
- return set->ops->lookup(net, set, key, ext);
+#endif
+ return set->ops->lookup(net, set, key);
+}
+
+static unsigned int nft_base_seq(const struct net *net)
+{
+ /* pairs with smp_store_release() in nf_tables_commit() */
+ return smp_load_acquire(&net->nft.base_seq);
+}
+
+static bool nft_lookup_should_retry(const struct net *net, unsigned int seq)
+{
+ return unlikely(seq != nft_base_seq(net));
+}
+
+const struct nft_set_ext *
+nft_set_do_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
+{
+ const struct nft_set_ext *ext;
+ unsigned int base_seq;
+
+ do {
+ base_seq = nft_base_seq(net);
+
+ ext = __nft_set_do_lookup(net, set, key);
+ if (ext)
+ break;
+ /* No match? There is a small chance that lookup was
+ * performed in the old generation, but nf_tables_commit()
+ * already unlinked a (matching) element.
+ *
+ * We need to repeat the lookup to make sure that we didn't
+ * miss a matching element in the new generation.
+ */
+ } while (nft_lookup_should_retry(net, base_seq));
+
+ return ext;
}
EXPORT_SYMBOL_GPL(nft_set_do_lookup);
-#endif
void nft_lookup_eval(const struct nft_expr *expr,
struct nft_regs *regs,
@@ -61,12 +98,12 @@ void nft_lookup_eval(const struct nft_expr *expr,
{
const struct nft_lookup *priv = nft_expr_priv(expr);
const struct nft_set *set = priv->set;
- const struct nft_set_ext *ext = NULL;
const struct net *net = nft_net(pkt);
+ const struct nft_set_ext *ext;
bool found;
- found = nft_set_do_lookup(net, set, &regs->data[priv->sreg], &ext) ^
- priv->invert;
+ ext = nft_set_do_lookup(net, set, &regs->data[priv->sreg]);
+ found = !!ext ^ priv->invert;
if (!found) {
ext = nft_set_catchall_lookup(net, set);
if (!ext) {
diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c
index 09da7a3f9f96..8ee66a86c3bc 100644
--- a/net/netfilter/nft_objref.c
+++ b/net/netfilter/nft_objref.c
@@ -111,10 +111,9 @@ void nft_objref_map_eval(const struct nft_expr *expr,
struct net *net = nft_net(pkt);
const struct nft_set_ext *ext;
struct nft_object *obj;
- bool found;
- found = nft_set_do_lookup(net, set, &regs->data[priv->sreg], &ext);
- if (!found) {
+ ext = nft_set_do_lookup(net, set, &regs->data[priv->sreg]);
+ if (!ext) {
ext = nft_set_catchall_lookup(net, set);
if (!ext) {
regs->verdict.code = NFT_BREAK;
diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
index 12390d2e994f..8d3f040a904a 100644
--- a/net/netfilter/nft_set_bitmap.c
+++ b/net/netfilter/nft_set_bitmap.c
@@ -75,16 +75,21 @@ nft_bitmap_active(const u8 *bitmap, u32 idx, u32 off, u8 genmask)
}
INDIRECT_CALLABLE_SCOPE
-bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+const struct nft_set_ext *
+nft_bitmap_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
{
const struct nft_bitmap *priv = nft_set_priv(set);
+ static const struct nft_set_ext found;
u8 genmask = nft_genmask_cur(net);
u32 idx, off;
nft_bitmap_location(set, key, &idx, &off);
- return nft_bitmap_active(priv->bitmap, idx, off, genmask);
+ if (nft_bitmap_active(priv->bitmap, idx, off, genmask))
+ return &found;
+
+ return NULL;
}
static struct nft_bitmap_elem *
@@ -221,7 +226,8 @@ static void nft_bitmap_walk(const struct nft_ctx *ctx,
const struct nft_bitmap *priv = nft_set_priv(set);
struct nft_bitmap_elem *be;
- list_for_each_entry_rcu(be, &priv->list, head) {
+ list_for_each_entry_rcu(be, &priv->list, head,
+ lockdep_is_held(&nft_pernet(ctx->net)->commit_mutex)) {
if (iter->count < iter->skip)
goto cont;
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index abb0c8ec6371..9903c737c9f0 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -81,8 +81,9 @@ static const struct rhashtable_params nft_rhash_params = {
};
INDIRECT_CALLABLE_SCOPE
-bool nft_rhash_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+const struct nft_set_ext *
+nft_rhash_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
{
struct nft_rhash *priv = nft_set_priv(set);
const struct nft_rhash_elem *he;
@@ -95,9 +96,9 @@ bool nft_rhash_lookup(const struct net *net, const struct nft_set *set,
he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
if (he != NULL)
- *ext = &he->ext;
+ return &he->ext;
- return !!he;
+ return NULL;
}
static struct nft_elem_priv *
@@ -120,14 +121,11 @@ nft_rhash_get(const struct net *net, const struct nft_set *set,
return ERR_PTR(-ENOENT);
}
-static bool nft_rhash_update(struct nft_set *set, const u32 *key,
- struct nft_elem_priv *
- (*new)(struct nft_set *,
- const struct nft_expr *,
- struct nft_regs *regs),
- const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_set_ext **ext)
+static const struct nft_set_ext *
+nft_rhash_update(struct nft_set *set, const u32 *key,
+ struct nft_elem_priv *(*new)(struct nft_set *, const struct nft_expr *,
+ struct nft_regs *regs),
+ const struct nft_expr *expr, struct nft_regs *regs)
{
struct nft_rhash *priv = nft_set_priv(set);
struct nft_rhash_elem *he, *prev;
@@ -161,14 +159,13 @@ static bool nft_rhash_update(struct nft_set *set, const u32 *key,
}
out:
- *ext = &he->ext;
- return true;
+ return &he->ext;
err2:
nft_set_elem_destroy(set, &he->priv, true);
atomic_dec(&set->nelems);
err1:
- return false;
+ return NULL;
}
static int nft_rhash_insert(const struct net *net, const struct nft_set *set,
@@ -507,8 +504,9 @@ struct nft_hash_elem {
};
INDIRECT_CALLABLE_SCOPE
-bool nft_hash_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+const struct nft_set_ext *
+nft_hash_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
{
struct nft_hash *priv = nft_set_priv(set);
u8 genmask = nft_genmask_cur(net);
@@ -519,12 +517,10 @@ bool nft_hash_lookup(const struct net *net, const struct nft_set *set,
hash = reciprocal_scale(hash, priv->buckets);
hlist_for_each_entry_rcu(he, &priv->table[hash], node) {
if (!memcmp(nft_set_ext_key(&he->ext), key, set->klen) &&
- nft_set_elem_active(&he->ext, genmask)) {
- *ext = &he->ext;
- return true;
- }
+ nft_set_elem_active(&he->ext, genmask))
+ return &he->ext;
}
- return false;
+ return NULL;
}
static struct nft_elem_priv *
@@ -547,9 +543,9 @@ nft_hash_get(const struct net *net, const struct nft_set *set,
}
INDIRECT_CALLABLE_SCOPE
-bool nft_hash_lookup_fast(const struct net *net,
- const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+const struct nft_set_ext *
+nft_hash_lookup_fast(const struct net *net, const struct nft_set *set,
+ const u32 *key)
{
struct nft_hash *priv = nft_set_priv(set);
u8 genmask = nft_genmask_cur(net);
@@ -562,12 +558,10 @@ bool nft_hash_lookup_fast(const struct net *net,
hlist_for_each_entry_rcu(he, &priv->table[hash], node) {
k2 = *(u32 *)nft_set_ext_key(&he->ext)->data;
if (k1 == k2 &&
- nft_set_elem_active(&he->ext, genmask)) {
- *ext = &he->ext;
- return true;
- }
+ nft_set_elem_active(&he->ext, genmask))
+ return &he->ext;
}
- return false;
+ return NULL;
}
static u32 nft_jhash(const struct nft_set *set, const struct nft_hash *priv,
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index 9e4e25f2458f..793790d79d13 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -397,37 +397,38 @@ int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules,
}
/**
- * nft_pipapo_lookup() - Lookup function
- * @net: Network namespace
- * @set: nftables API set representation
- * @key: nftables API element representation containing key data
- * @ext: nftables API extension pointer, filled with matching reference
+ * pipapo_get() - Get matching element reference given key data
+ * @m: storage containing the set elements
+ * @data: Key data to be matched against existing elements
+ * @genmask: If set, check that element is active in given genmask
+ * @tstamp: timestamp to check for expired elements
*
* For more details, see DOC: Theory of Operation.
*
- * Return: true on match, false otherwise.
+ * This is the main lookup function. It matches key data against either
+ * the working match set or the uncommitted copy, depending on what the
+ * caller passed to us.
+ * nft_pipapo_get (lookup from userspace/control plane) and nft_pipapo_lookup
+ * (datapath lookup) pass the active copy.
+ * The insertion path will pass the uncommitted working copy.
+ *
+ * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise.
*/
-bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m,
+ const u8 *data, u8 genmask,
+ u64 tstamp)
{
- struct nft_pipapo *priv = nft_set_priv(set);
struct nft_pipapo_scratch *scratch;
unsigned long *res_map, *fill_map;
- u8 genmask = nft_genmask_cur(net);
- const struct nft_pipapo_match *m;
const struct nft_pipapo_field *f;
- const u8 *rp = (const u8 *)key;
bool map_index;
int i;
local_bh_disable();
- m = rcu_dereference(priv->match);
-
- if (unlikely(!m || !*raw_cpu_ptr(m->scratch)))
- goto out;
-
scratch = *raw_cpu_ptr(m->scratch);
+ if (unlikely(!scratch))
+ goto out;
map_index = scratch->map_index;
@@ -444,12 +445,12 @@ bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
* packet bytes value, then AND bucket value
*/
if (likely(f->bb == 8))
- pipapo_and_field_buckets_8bit(f, res_map, rp);
+ pipapo_and_field_buckets_8bit(f, res_map, data);
else
- pipapo_and_field_buckets_4bit(f, res_map, rp);
+ pipapo_and_field_buckets_4bit(f, res_map, data);
NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
- rp += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f);
+ data += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f);
/* Now populate the bitmap for the next field, unless this is
* the last field, in which case return the matched 'ext'
@@ -465,13 +466,15 @@ next_match:
scratch->map_index = map_index;
local_bh_enable();
- return false;
+ return NULL;
}
if (last) {
- *ext = &f->mt[b].e->ext;
- if (unlikely(nft_set_elem_expired(*ext) ||
- !nft_set_elem_active(*ext, genmask)))
+ struct nft_pipapo_elem *e;
+
+ e = f->mt[b].e;
+ if (unlikely(__nft_set_elem_expired(&e->ext, tstamp) ||
+ !nft_set_elem_active(&e->ext, genmask)))
goto next_match;
/* Last field: we're just returning the key without
@@ -481,8 +484,7 @@ next_match:
*/
scratch->map_index = map_index;
local_bh_enable();
-
- return true;
+ return e;
}
/* Swap bitmap indices: res_map is the initial bitmap for the
@@ -492,112 +494,54 @@ next_match:
map_index = !map_index;
swap(res_map, fill_map);
- rp += NFT_PIPAPO_GROUPS_PADDING(f);
+ data += NFT_PIPAPO_GROUPS_PADDING(f);
}
out:
local_bh_enable();
- return false;
+ return NULL;
}
/**
- * pipapo_get() - Get matching element reference given key data
+ * nft_pipapo_lookup() - Dataplane fronted for main lookup function
* @net: Network namespace
* @set: nftables API set representation
- * @m: storage containing active/existing elements
- * @data: Key data to be matched against existing elements
- * @genmask: If set, check that element is active in given genmask
- * @tstamp: timestamp to check for expired elements
- * @gfp: the type of memory to allocate (see kmalloc).
+ * @key: pointer to nft registers containing key data
+ *
+ * This function is called from the data path. It will search for
+ * an element matching the given key in the current active copy.
+ * Unlike other set types, this uses NFT_GENMASK_ANY instead of
+ * nft_genmask_cur().
*
- * This is essentially the same as the lookup function, except that it matches
- * key data against the uncommitted copy and doesn't use preallocated maps for
- * bitmap results.
+ * This is because new (future) elements are not reachable from
+ * priv->match, they get added to priv->clone instead.
+ * When the commit phase flips the generation bitmask, the
+ * 'now old' entries are skipped but without the 'now current'
+ * elements becoming visible. Using nft_genmask_cur() thus creates
+ * inconsistent state: matching old entries get skipped but thew
+ * newly matching entries are unreachable.
*
- * Return: pointer to &struct nft_pipapo_elem on match, error pointer otherwise.
+ * GENMASK will still find the 'now old' entries which ensures consistent
+ * priv->match view.
+ *
+ * nft_pipapo_commit swaps ->clone and ->match shortly after the
+ * genbit flip. As ->clone doesn't contain the old entries in the first
+ * place, lookup will only find the now-current ones.
+ *
+ * Return: ntables API extension pointer or NULL if no match.
*/
-static struct nft_pipapo_elem *pipapo_get(const struct net *net,
- const struct nft_set *set,
- const struct nft_pipapo_match *m,
- const u8 *data, u8 genmask,
- u64 tstamp, gfp_t gfp)
+const struct nft_set_ext *
+nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
{
- struct nft_pipapo_elem *ret = ERR_PTR(-ENOENT);
- unsigned long *res_map, *fill_map = NULL;
- const struct nft_pipapo_field *f;
- int i;
-
- if (m->bsize_max == 0)
- return ret;
-
- res_map = kmalloc_array(m->bsize_max, sizeof(*res_map), gfp);
- if (!res_map) {
- ret = ERR_PTR(-ENOMEM);
- goto out;
- }
-
- fill_map = kcalloc(m->bsize_max, sizeof(*res_map), gfp);
- if (!fill_map) {
- ret = ERR_PTR(-ENOMEM);
- goto out;
- }
-
- pipapo_resmap_init(m, res_map);
-
- nft_pipapo_for_each_field(f, i, m) {
- bool last = i == m->field_count - 1;
- int b;
-
- /* For each bit group: select lookup table bucket depending on
- * packet bytes value, then AND bucket value
- */
- if (f->bb == 8)
- pipapo_and_field_buckets_8bit(f, res_map, data);
- else if (f->bb == 4)
- pipapo_and_field_buckets_4bit(f, res_map, data);
- else
- BUG();
-
- data += f->groups / NFT_PIPAPO_GROUPS_PER_BYTE(f);
-
- /* Now populate the bitmap for the next field, unless this is
- * the last field, in which case return the matched 'ext'
- * pointer if any.
- *
- * Now res_map contains the matching bitmap, and fill_map is the
- * bitmap for the next field.
- */
-next_match:
- b = pipapo_refill(res_map, f->bsize, f->rules, fill_map, f->mt,
- last);
- if (b < 0)
- goto out;
-
- if (last) {
- if (__nft_set_elem_expired(&f->mt[b].e->ext, tstamp))
- goto next_match;
- if ((genmask &&
- !nft_set_elem_active(&f->mt[b].e->ext, genmask)))
- goto next_match;
-
- ret = f->mt[b].e;
- goto out;
- }
-
- data += NFT_PIPAPO_GROUPS_PADDING(f);
+ struct nft_pipapo *priv = nft_set_priv(set);
+ const struct nft_pipapo_match *m;
+ const struct nft_pipapo_elem *e;
- /* Swap bitmap indices: fill_map will be the initial bitmap for
- * the next field (i.e. the new res_map), and res_map is
- * guaranteed to be all-zeroes at this point, ready to be filled
- * according to the next mapping table.
- */
- swap(res_map, fill_map);
- }
+ m = rcu_dereference(priv->match);
+ e = pipapo_get(m, (const u8 *)key, NFT_GENMASK_ANY, get_jiffies_64());
-out:
- kfree(fill_map);
- kfree(res_map);
- return ret;
+ return e ? &e->ext : NULL;
}
/**
@@ -606,6 +550,11 @@ out:
* @set: nftables API set representation
* @elem: nftables API element representation containing key data
* @flags: Unused
+ *
+ * This function is called from the control plane path under
+ * RCU read lock.
+ *
+ * Return: set element private pointer or ERR_PTR(-ENOENT).
*/
static struct nft_elem_priv *
nft_pipapo_get(const struct net *net, const struct nft_set *set,
@@ -615,11 +564,10 @@ nft_pipapo_get(const struct net *net, const struct nft_set *set,
struct nft_pipapo_match *m = rcu_dereference(priv->match);
struct nft_pipapo_elem *e;
- e = pipapo_get(net, set, m, (const u8 *)elem->key.val.data,
- nft_genmask_cur(net), get_jiffies_64(),
- GFP_ATOMIC);
- if (IS_ERR(e))
- return ERR_CAST(e);
+ e = pipapo_get(m, (const u8 *)elem->key.val.data,
+ nft_genmask_cur(net), get_jiffies_64());
+ if (!e)
+ return ERR_PTR(-ENOENT);
return &e->priv;
}
@@ -1344,8 +1292,8 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
else
end = start;
- dup = pipapo_get(net, set, m, start, genmask, tstamp, GFP_KERNEL);
- if (!IS_ERR(dup)) {
+ dup = pipapo_get(m, start, genmask, tstamp);
+ if (dup) {
/* Check if we already have the same exact entry */
const struct nft_data *dup_key, *dup_end;
@@ -1364,15 +1312,9 @@ static int nft_pipapo_insert(const struct net *net, const struct nft_set *set,
return -ENOTEMPTY;
}
- if (PTR_ERR(dup) == -ENOENT) {
- /* Look for partially overlapping entries */
- dup = pipapo_get(net, set, m, end, nft_genmask_next(net), tstamp,
- GFP_KERNEL);
- }
-
- if (PTR_ERR(dup) != -ENOENT) {
- if (IS_ERR(dup))
- return PTR_ERR(dup);
+ /* Look for partially overlapping entries */
+ dup = pipapo_get(m, end, nft_genmask_next(net), tstamp);
+ if (dup) {
*elem_priv = &dup->priv;
return -ENOTEMPTY;
}
@@ -1913,9 +1855,9 @@ nft_pipapo_deactivate(const struct net *net, const struct nft_set *set,
if (!m)
return NULL;
- e = pipapo_get(net, set, m, (const u8 *)elem->key.val.data,
- nft_genmask_next(net), nft_net_tstamp(net), GFP_KERNEL);
- if (IS_ERR(e))
+ e = pipapo_get(m, (const u8 *)elem->key.val.data,
+ nft_genmask_next(net), nft_net_tstamp(net));
+ if (!e)
return NULL;
nft_set_elem_change_active(net, set, &e->ext);
diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c
index be7c16c79f71..39e356c9687a 100644
--- a/net/netfilter/nft_set_pipapo_avx2.c
+++ b/net/netfilter/nft_set_pipapo_avx2.c
@@ -1146,26 +1146,27 @@ static inline void pipapo_resmap_init_avx2(const struct nft_pipapo_match *m, uns
*
* Return: true on match, false otherwise.
*/
-bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+const struct nft_set_ext *
+nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
{
struct nft_pipapo *priv = nft_set_priv(set);
+ const struct nft_set_ext *ext = NULL;
struct nft_pipapo_scratch *scratch;
- u8 genmask = nft_genmask_cur(net);
const struct nft_pipapo_match *m;
const struct nft_pipapo_field *f;
const u8 *rp = (const u8 *)key;
unsigned long *res, *fill;
bool map_index;
- int i, ret = 0;
+ int i;
local_bh_disable();
if (unlikely(!irq_fpu_usable())) {
- bool fallback_res = nft_pipapo_lookup(net, set, key, ext);
+ ext = nft_pipapo_lookup(net, set, key);
local_bh_enable();
- return fallback_res;
+ return ext;
}
m = rcu_dereference(priv->match);
@@ -1182,7 +1183,7 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
if (unlikely(!scratch)) {
kernel_fpu_end();
local_bh_enable();
- return false;
+ return NULL;
}
map_index = scratch->map_index;
@@ -1197,6 +1198,7 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
next_match:
nft_pipapo_for_each_field(f, i, m) {
bool last = i == m->field_count - 1, first = !i;
+ int ret = 0;
#define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n) \
(ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f, \
@@ -1244,13 +1246,12 @@ next_match:
goto out;
if (last) {
- *ext = &f->mt[ret].e->ext;
- if (unlikely(nft_set_elem_expired(*ext) ||
- !nft_set_elem_active(*ext, genmask))) {
- ret = 0;
+ const struct nft_set_ext *e = &f->mt[ret].e->ext;
+
+ if (unlikely(nft_set_elem_expired(e)))
goto next_match;
- }
+ ext = e;
goto out;
}
@@ -1264,5 +1265,5 @@ out:
kernel_fpu_end();
local_bh_enable();
- return ret >= 0;
+ return ext;
}
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 2e8ef16ff191..b1f04168ec93 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -52,9 +52,9 @@ static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe)
return nft_set_elem_expired(&rbe->ext);
}
-static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext,
- unsigned int seq)
+static const struct nft_set_ext *
+__nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key, unsigned int seq)
{
struct nft_rbtree *priv = nft_set_priv(set);
const struct nft_rbtree_elem *rbe, *interval = NULL;
@@ -65,7 +65,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
parent = rcu_dereference_raw(priv->root.rb_node);
while (parent != NULL) {
if (read_seqcount_retry(&priv->count, seq))
- return false;
+ return NULL;
rbe = rb_entry(parent, struct nft_rbtree_elem, node);
@@ -77,7 +77,9 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
nft_rbtree_interval_end(rbe) &&
nft_rbtree_interval_start(interval))
continue;
- interval = rbe;
+ if (nft_set_elem_active(&rbe->ext, genmask) &&
+ !nft_rbtree_elem_expired(rbe))
+ interval = rbe;
} else if (d > 0)
parent = rcu_dereference_raw(parent->rb_right);
else {
@@ -87,50 +89,46 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
}
if (nft_rbtree_elem_expired(rbe))
- return false;
+ return NULL;
if (nft_rbtree_interval_end(rbe)) {
if (nft_set_is_anonymous(set))
- return false;
+ return NULL;
parent = rcu_dereference_raw(parent->rb_left);
interval = NULL;
continue;
}
- *ext = &rbe->ext;
- return true;
+ return &rbe->ext;
}
}
if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
- nft_set_elem_active(&interval->ext, genmask) &&
- !nft_rbtree_elem_expired(interval) &&
- nft_rbtree_interval_start(interval)) {
- *ext = &interval->ext;
- return true;
- }
+ nft_rbtree_interval_start(interval))
+ return &interval->ext;
- return false;
+ return NULL;
}
INDIRECT_CALLABLE_SCOPE
-bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, const struct nft_set_ext **ext)
+const struct nft_set_ext *
+nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
{
struct nft_rbtree *priv = nft_set_priv(set);
unsigned int seq = read_seqcount_begin(&priv->count);
- bool ret;
+ const struct nft_set_ext *ext;
- ret = __nft_rbtree_lookup(net, set, key, ext, seq);
- if (ret || !read_seqcount_retry(&priv->count, seq))
- return ret;
+ ext = __nft_rbtree_lookup(net, set, key, seq);
+ if (ext || !read_seqcount_retry(&priv->count, seq))
+ return ext;
read_lock_bh(&priv->lock);
seq = read_seqcount_begin(&priv->count);
- ret = __nft_rbtree_lookup(net, set, key, ext, seq);
+ ext = __nft_rbtree_lookup(net, set, key, seq);
read_unlock_bh(&priv->lock);
- return ret;
+ return ext;
}
static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index 104732d34543..978c129c6095 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -1836,6 +1836,9 @@ static int genl_bind(struct net *net, int group)
!ns_capable(net->user_ns, CAP_SYS_ADMIN))
ret = -EPERM;
+ if (ret)
+ break;
+
if (family->bind)
family->bind(i);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 73bc39281ef5..9b45fbdc90ca 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -276,8 +276,6 @@ EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue);
static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode)
{
- if (unlikely(current->flags & PF_EXITING))
- return -EINTR;
schedule();
if (signal_pending_state(mode, current))
return -ERESTARTSYS;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index c5f7bbf5775f..3aa987e7f072 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -407,9 +407,9 @@ xs_sock_recv_cmsg(struct socket *sock, unsigned int *msg_flags, int flags)
iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec, 1,
alert_kvec.iov_len);
ret = sock_recvmsg(sock, &msg, flags);
- if (ret > 0 &&
- tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) {
- iov_iter_revert(&msg.msg_iter, ret);
+ if (ret > 0) {
+ if (tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT)
+ iov_iter_revert(&msg.msg_iter, ret);
ret = xs_sock_process_cmsg(sock, &msg, msg_flags, &u.cmsg,
-EAGAIN);
}
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 72c000c0ae5f..de331541fdb3 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -36,6 +36,20 @@
#define TX_BATCH_SIZE 32
#define MAX_PER_SOCKET_BUDGET (TX_BATCH_SIZE)
+struct xsk_addr_node {
+ u64 addr;
+ struct list_head addr_node;
+};
+
+struct xsk_addr_head {
+ u32 num_descs;
+ struct list_head addrs_list;
+};
+
+static struct kmem_cache *xsk_tx_generic_cache;
+
+#define XSKCB(skb) ((struct xsk_addr_head *)((skb)->cb))
+
void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
{
if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
@@ -528,24 +542,43 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags)
return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags);
}
-static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr)
+static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
{
unsigned long flags;
int ret;
spin_lock_irqsave(&pool->cq_lock, flags);
- ret = xskq_prod_reserve_addr(pool->cq, addr);
+ ret = xskq_prod_reserve(pool->cq);
spin_unlock_irqrestore(&pool->cq_lock, flags);
return ret;
}
-static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n)
+static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
+ struct sk_buff *skb)
{
+ struct xsk_addr_node *pos, *tmp;
+ u32 descs_processed = 0;
unsigned long flags;
+ u32 idx;
spin_lock_irqsave(&pool->cq_lock, flags);
- xskq_prod_submit_n(pool->cq, n);
+ idx = xskq_get_prod(pool->cq);
+
+ xskq_prod_write_addr(pool->cq, idx,
+ (u64)(uintptr_t)skb_shinfo(skb)->destructor_arg);
+ descs_processed++;
+
+ if (unlikely(XSKCB(skb)->num_descs > 1)) {
+ list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
+ xskq_prod_write_addr(pool->cq, idx + descs_processed,
+ pos->addr);
+ descs_processed++;
+ list_del(&pos->addr_node);
+ kmem_cache_free(xsk_tx_generic_cache, pos);
+ }
+ }
+ xskq_prod_submit_n(pool->cq, descs_processed);
spin_unlock_irqrestore(&pool->cq_lock, flags);
}
@@ -558,9 +591,14 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
spin_unlock_irqrestore(&pool->cq_lock, flags);
}
+static void xsk_inc_num_desc(struct sk_buff *skb)
+{
+ XSKCB(skb)->num_descs++;
+}
+
static u32 xsk_get_num_desc(struct sk_buff *skb)
{
- return skb ? (long)skb_shinfo(skb)->destructor_arg : 0;
+ return XSKCB(skb)->num_descs;
}
static void xsk_destruct_skb(struct sk_buff *skb)
@@ -572,23 +610,33 @@ static void xsk_destruct_skb(struct sk_buff *skb)
*compl->tx_timestamp = ktime_get_tai_fast_ns();
}
- xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb));
+ xsk_cq_submit_addr_locked(xdp_sk(skb->sk)->pool, skb);
sock_wfree(skb);
}
-static void xsk_set_destructor_arg(struct sk_buff *skb)
+static void xsk_set_destructor_arg(struct sk_buff *skb, u64 addr)
{
- long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1;
-
- skb_shinfo(skb)->destructor_arg = (void *)num;
+ BUILD_BUG_ON(sizeof(struct xsk_addr_head) > sizeof(skb->cb));
+ INIT_LIST_HEAD(&XSKCB(skb)->addrs_list);
+ XSKCB(skb)->num_descs = 0;
+ skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)addr;
}
static void xsk_consume_skb(struct sk_buff *skb)
{
struct xdp_sock *xs = xdp_sk(skb->sk);
+ u32 num_descs = xsk_get_num_desc(skb);
+ struct xsk_addr_node *pos, *tmp;
+
+ if (unlikely(num_descs > 1)) {
+ list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
+ list_del(&pos->addr_node);
+ kmem_cache_free(xsk_tx_generic_cache, pos);
+ }
+ }
skb->destructor = sock_wfree;
- xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb));
+ xsk_cq_cancel_locked(xs->pool, num_descs);
/* Free skb without triggering the perf drop trace */
consume_skb(skb);
xs->skb = NULL;
@@ -605,6 +653,7 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
{
struct xsk_buff_pool *pool = xs->pool;
u32 hr, len, ts, offset, copy, copied;
+ struct xsk_addr_node *xsk_addr;
struct sk_buff *skb = xs->skb;
struct page *page;
void *buffer;
@@ -619,6 +668,19 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
return ERR_PTR(err);
skb_reserve(skb, hr);
+
+ xsk_set_destructor_arg(skb, desc->addr);
+ } else {
+ xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
+ if (!xsk_addr)
+ return ERR_PTR(-ENOMEM);
+
+ /* in case of -EOVERFLOW that could happen below,
+ * xsk_consume_skb() will release this node as whole skb
+ * would be dropped, which implies freeing all list elements
+ */
+ xsk_addr->addr = desc->addr;
+ list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
}
addr = desc->addr;
@@ -690,8 +752,11 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
err = skb_store_bits(skb, 0, buffer, len);
if (unlikely(err))
goto free_err;
+
+ xsk_set_destructor_arg(skb, desc->addr);
} else {
int nr_frags = skb_shinfo(skb)->nr_frags;
+ struct xsk_addr_node *xsk_addr;
struct page *page;
u8 *vaddr;
@@ -706,12 +771,22 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
goto free_err;
}
+ xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
+ if (!xsk_addr) {
+ __free_page(page);
+ err = -ENOMEM;
+ goto free_err;
+ }
+
vaddr = kmap_local_page(page);
memcpy(vaddr, buffer, len);
kunmap_local(vaddr);
skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);
refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
+
+ xsk_addr->addr = desc->addr;
+ list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
}
if (first_frag && desc->options & XDP_TX_METADATA) {
@@ -755,7 +830,7 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
skb->mark = READ_ONCE(xs->sk.sk_mark);
skb->destructor = xsk_destruct_skb;
xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
- xsk_set_destructor_arg(skb);
+ xsk_inc_num_desc(skb);
return skb;
@@ -765,7 +840,7 @@ free_err:
if (err == -EOVERFLOW) {
/* Drop the packet */
- xsk_set_destructor_arg(xs->skb);
+ xsk_inc_num_desc(xs->skb);
xsk_drop_skb(xs->skb);
xskq_cons_release(xs->tx);
} else {
@@ -807,7 +882,7 @@ static int __xsk_generic_xmit(struct sock *sk)
* if there is space in it. This avoids having to implement
* any buffering in the Tx path.
*/
- err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr);
+ err = xsk_cq_reserve_locked(xs->pool);
if (err) {
err = -EAGAIN;
goto out;
@@ -1795,8 +1870,18 @@ static int __init xsk_init(void)
if (err)
goto out_pernet;
+ xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache",
+ sizeof(struct xsk_addr_node),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (!xsk_tx_generic_cache) {
+ err = -ENOMEM;
+ goto out_unreg_notif;
+ }
+
return 0;
+out_unreg_notif:
+ unregister_netdevice_notifier(&xsk_netdev_notifier);
out_pernet:
unregister_pernet_subsys(&xsk_net_ops);
out_sk:
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 46d87e961ad6..f16f390370dc 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -344,6 +344,11 @@ static inline u32 xskq_cons_present_entries(struct xsk_queue *q)
/* Functions for producers */
+static inline u32 xskq_get_prod(struct xsk_queue *q)
+{
+ return READ_ONCE(q->ring->producer);
+}
+
static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max)
{
u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons);
@@ -390,6 +395,13 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr)
return 0;
}
+static inline void xskq_prod_write_addr(struct xsk_queue *q, u32 idx, u64 addr)
+{
+ struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
+
+ ring->desc[idx & q->ring_mask] = addr;
+}
+
static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs,
u32 nb_entries)
{