summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaolo Abeni <pabeni@redhat.com>2025-01-09 15:33:10 +0100
committerPaolo Abeni <pabeni@redhat.com>2025-01-09 15:33:11 +0100
commit11c668db098507207d55c5d6e04cc34a55288636 (patch)
tree9eb6e497555e24d5ae38f2a4b0c4871e9384b900
parenta3b3d2dc389568a77d0e25da17203e3616218e93 (diff)
parenteb721f117e7d43b561e81dd878c4acfa2de13ee2 (diff)
Merge branch 'net-make-sure-we-retain-napi-ordering-on-netdev-napi_list'
Jakub Kicinski says: ==================== net: make sure we retain NAPI ordering on netdev->napi_list I promised Eric to remove the rtnl protection of the NAPI list, when I sat down to implement it over the break I realized that the recently added NAPI ID retention will break the list ordering assumption we have in netlink dump. The ordering used to happen "naturally", because we'd always add NAPIs that the head of the list, and assign a new monotonically increasing ID. Before the first patch of this series we'd still only add at the head of the list but now the newly added NAPI may inherit from its config an ID lower than something else already on the list. The fix is in the first patch, the rest is netdevsim churn to test it. I'm posting this for net-next, because AFAICT the problem can't be triggered in net, given the very limited queue API adoption. v2: - [patch 2] allocate the array with kcalloc() instead of kvcalloc() - [patch 2] set GFP_KERNEL_ACCOUNT when allocating queues - [patch 6] don't null-check page pool before page_pool_destroy() - [patch 6] controled -> controlled - [patch 7] change mode to 0200 - [patch 7] reorder removal to be inverse of add - [patch 7] fix the spaces vs tabs v1: https://lore.kernel.org/20250103185954.1236510-1-kuba@kernel.org ==================== Link: https://patch.msgid.link/20250107160846.2223263-1-kuba@kernel.org Signed-off-by: Paolo Abeni <pabeni@redhat.com>
-rw-r--r--Documentation/networking/netdevices.rst10
-rw-r--r--drivers/net/netdevsim/netdev.c259
-rw-r--r--drivers/net/netdevsim/netdevsim.h5
-rw-r--r--net/core/dev.c42
-rw-r--r--net/core/netdev_rx_queue.c1
-rwxr-xr-xtools/testing/selftests/net/nl_netdev.py19
6 files changed, 294 insertions, 42 deletions
diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst
index 857c9784f87e..1d37038e9fbe 100644
--- a/Documentation/networking/netdevices.rst
+++ b/Documentation/networking/netdevices.rst
@@ -297,3 +297,13 @@ napi->poll:
Context:
softirq
will be called with interrupts disabled by netconsole.
+
+NETDEV_INTERNAL symbol namespace
+================================
+
+Symbols exported as NETDEV_INTERNAL can only be used in networking
+core and drivers which exclusively flow via the main networking list and trees.
+Note that the inverse is not true, most symbols outside of NETDEV_INTERNAL
+are not expected to be used by random code outside netdev either.
+Symbols may lack the designation because they predate the namespaces,
+or simply due to an oversight.
diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c
index e068a9761c09..d013b6498539 100644
--- a/drivers/net/netdevsim/netdev.c
+++ b/drivers/net/netdevsim/netdev.c
@@ -20,6 +20,7 @@
#include <linux/netdevice.h>
#include <linux/slab.h>
#include <net/netdev_queues.h>
+#include <net/netdev_rx_queue.h>
#include <net/page_pool/helpers.h>
#include <net/netlink.h>
#include <net/net_shaper.h>
@@ -29,6 +30,8 @@
#include "netdevsim.h"
+MODULE_IMPORT_NS("NETDEV_INTERNAL");
+
#define NSIM_RING_SIZE 256
static int nsim_napi_rx(struct nsim_rq *rq, struct sk_buff *skb)
@@ -69,7 +72,7 @@ static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev)
rxq = skb_get_queue_mapping(skb);
if (rxq >= peer_dev->num_rx_queues)
rxq = rxq % peer_dev->num_rx_queues;
- rq = &peer_ns->rq[rxq];
+ rq = peer_ns->rq[rxq];
skb_tx_timestamp(skb);
if (unlikely(nsim_forward_skb(peer_dev, skb, rq) == NET_RX_DROP))
@@ -359,25 +362,24 @@ static int nsim_poll(struct napi_struct *napi, int budget)
return done;
}
-static int nsim_create_page_pool(struct nsim_rq *rq)
+static int nsim_create_page_pool(struct page_pool **p, struct napi_struct *napi)
{
- struct page_pool_params p = {
+ struct page_pool_params params = {
.order = 0,
.pool_size = NSIM_RING_SIZE,
.nid = NUMA_NO_NODE,
- .dev = &rq->napi.dev->dev,
- .napi = &rq->napi,
+ .dev = &napi->dev->dev,
+ .napi = napi,
.dma_dir = DMA_BIDIRECTIONAL,
- .netdev = rq->napi.dev,
+ .netdev = napi->dev,
};
+ struct page_pool *pool;
- rq->page_pool = page_pool_create(&p);
- if (IS_ERR(rq->page_pool)) {
- int err = PTR_ERR(rq->page_pool);
+ pool = page_pool_create(&params);
+ if (IS_ERR(pool))
+ return PTR_ERR(pool);
- rq->page_pool = NULL;
- return err;
- }
+ *p = pool;
return 0;
}
@@ -388,15 +390,15 @@ static int nsim_init_napi(struct netdevsim *ns)
int err, i;
for (i = 0; i < dev->num_rx_queues; i++) {
- rq = &ns->rq[i];
+ rq = ns->rq[i];
- netif_napi_add(dev, &rq->napi, nsim_poll);
+ netif_napi_add_config(dev, &rq->napi, nsim_poll, i);
}
for (i = 0; i < dev->num_rx_queues; i++) {
- rq = &ns->rq[i];
+ rq = ns->rq[i];
- err = nsim_create_page_pool(rq);
+ err = nsim_create_page_pool(&rq->page_pool, &rq->napi);
if (err)
goto err_pp_destroy;
}
@@ -405,12 +407,12 @@ static int nsim_init_napi(struct netdevsim *ns)
err_pp_destroy:
while (i--) {
- page_pool_destroy(ns->rq[i].page_pool);
- ns->rq[i].page_pool = NULL;
+ page_pool_destroy(ns->rq[i]->page_pool);
+ ns->rq[i]->page_pool = NULL;
}
for (i = 0; i < dev->num_rx_queues; i++)
- __netif_napi_del(&ns->rq[i].napi);
+ __netif_napi_del(&ns->rq[i]->napi);
return err;
}
@@ -421,7 +423,7 @@ static void nsim_enable_napi(struct netdevsim *ns)
int i;
for (i = 0; i < dev->num_rx_queues; i++) {
- struct nsim_rq *rq = &ns->rq[i];
+ struct nsim_rq *rq = ns->rq[i];
netif_queue_set_napi(dev, i, NETDEV_QUEUE_TYPE_RX, &rq->napi);
napi_enable(&rq->napi);
@@ -448,7 +450,7 @@ static void nsim_del_napi(struct netdevsim *ns)
int i;
for (i = 0; i < dev->num_rx_queues; i++) {
- struct nsim_rq *rq = &ns->rq[i];
+ struct nsim_rq *rq = ns->rq[i];
napi_disable(&rq->napi);
__netif_napi_del(&rq->napi);
@@ -456,8 +458,8 @@ static void nsim_del_napi(struct netdevsim *ns)
synchronize_net();
for (i = 0; i < dev->num_rx_queues; i++) {
- page_pool_destroy(ns->rq[i].page_pool);
- ns->rq[i].page_pool = NULL;
+ page_pool_destroy(ns->rq[i]->page_pool);
+ ns->rq[i]->page_pool = NULL;
}
}
@@ -595,6 +597,182 @@ static const struct netdev_stat_ops nsim_stat_ops = {
.get_base_stats = nsim_get_base_stats,
};
+static struct nsim_rq *nsim_queue_alloc(void)
+{
+ struct nsim_rq *rq;
+
+ rq = kzalloc(sizeof(*rq), GFP_KERNEL_ACCOUNT);
+ if (!rq)
+ return NULL;
+
+ skb_queue_head_init(&rq->skb_queue);
+ return rq;
+}
+
+static void nsim_queue_free(struct nsim_rq *rq)
+{
+ skb_queue_purge_reason(&rq->skb_queue, SKB_DROP_REASON_QUEUE_PURGE);
+ kfree(rq);
+}
+
+/* Queue reset mode is controlled by ns->rq_reset_mode.
+ * - normal - new NAPI new pool (old NAPI enabled when new added)
+ * - mode 1 - allocate new pool (NAPI is only disabled / enabled)
+ * - mode 2 - new NAPI new pool (old NAPI removed before new added)
+ * - mode 3 - new NAPI new pool (old NAPI disabled when new added)
+ */
+struct nsim_queue_mem {
+ struct nsim_rq *rq;
+ struct page_pool *pp;
+};
+
+static int
+nsim_queue_mem_alloc(struct net_device *dev, void *per_queue_mem, int idx)
+{
+ struct nsim_queue_mem *qmem = per_queue_mem;
+ struct netdevsim *ns = netdev_priv(dev);
+ int err;
+
+ if (ns->rq_reset_mode > 3)
+ return -EINVAL;
+
+ if (ns->rq_reset_mode == 1)
+ return nsim_create_page_pool(&qmem->pp, &ns->rq[idx]->napi);
+
+ qmem->rq = nsim_queue_alloc();
+ if (!qmem->rq)
+ return -ENOMEM;
+
+ err = nsim_create_page_pool(&qmem->rq->page_pool, &qmem->rq->napi);
+ if (err)
+ goto err_free;
+
+ if (!ns->rq_reset_mode)
+ netif_napi_add_config(dev, &qmem->rq->napi, nsim_poll, idx);
+
+ return 0;
+
+err_free:
+ nsim_queue_free(qmem->rq);
+ return err;
+}
+
+static void nsim_queue_mem_free(struct net_device *dev, void *per_queue_mem)
+{
+ struct nsim_queue_mem *qmem = per_queue_mem;
+ struct netdevsim *ns = netdev_priv(dev);
+
+ page_pool_destroy(qmem->pp);
+ if (qmem->rq) {
+ if (!ns->rq_reset_mode)
+ netif_napi_del(&qmem->rq->napi);
+ page_pool_destroy(qmem->rq->page_pool);
+ nsim_queue_free(qmem->rq);
+ }
+}
+
+static int
+nsim_queue_start(struct net_device *dev, void *per_queue_mem, int idx)
+{
+ struct nsim_queue_mem *qmem = per_queue_mem;
+ struct netdevsim *ns = netdev_priv(dev);
+
+ if (ns->rq_reset_mode == 1) {
+ ns->rq[idx]->page_pool = qmem->pp;
+ napi_enable(&ns->rq[idx]->napi);
+ return 0;
+ }
+
+ /* netif_napi_add()/_del() should normally be called from alloc/free,
+ * here we want to test various call orders.
+ */
+ if (ns->rq_reset_mode == 2) {
+ netif_napi_del(&ns->rq[idx]->napi);
+ netif_napi_add_config(dev, &qmem->rq->napi, nsim_poll, idx);
+ } else if (ns->rq_reset_mode == 3) {
+ netif_napi_add_config(dev, &qmem->rq->napi, nsim_poll, idx);
+ netif_napi_del(&ns->rq[idx]->napi);
+ }
+
+ ns->rq[idx] = qmem->rq;
+ napi_enable(&ns->rq[idx]->napi);
+
+ return 0;
+}
+
+static int nsim_queue_stop(struct net_device *dev, void *per_queue_mem, int idx)
+{
+ struct nsim_queue_mem *qmem = per_queue_mem;
+ struct netdevsim *ns = netdev_priv(dev);
+
+ napi_disable(&ns->rq[idx]->napi);
+
+ if (ns->rq_reset_mode == 1) {
+ qmem->pp = ns->rq[idx]->page_pool;
+ page_pool_disable_direct_recycling(qmem->pp);
+ } else {
+ qmem->rq = ns->rq[idx];
+ }
+
+ return 0;
+}
+
+static const struct netdev_queue_mgmt_ops nsim_queue_mgmt_ops = {
+ .ndo_queue_mem_size = sizeof(struct nsim_queue_mem),
+ .ndo_queue_mem_alloc = nsim_queue_mem_alloc,
+ .ndo_queue_mem_free = nsim_queue_mem_free,
+ .ndo_queue_start = nsim_queue_start,
+ .ndo_queue_stop = nsim_queue_stop,
+};
+
+static ssize_t
+nsim_qreset_write(struct file *file, const char __user *data,
+ size_t count, loff_t *ppos)
+{
+ struct netdevsim *ns = file->private_data;
+ unsigned int queue, mode;
+ char buf[32];
+ ssize_t ret;
+
+ if (count >= sizeof(buf))
+ return -EINVAL;
+ if (copy_from_user(buf, data, count))
+ return -EFAULT;
+ buf[count] = '\0';
+
+ ret = sscanf(buf, "%u %u", &queue, &mode);
+ if (ret != 2)
+ return -EINVAL;
+
+ rtnl_lock();
+ if (!netif_running(ns->netdev)) {
+ ret = -ENETDOWN;
+ goto exit_unlock;
+ }
+
+ if (queue >= ns->netdev->real_num_rx_queues) {
+ ret = -EINVAL;
+ goto exit_unlock;
+ }
+
+ ns->rq_reset_mode = mode;
+ ret = netdev_rx_queue_restart(ns->netdev, queue);
+ ns->rq_reset_mode = 0;
+ if (ret)
+ goto exit_unlock;
+
+ ret = count;
+exit_unlock:
+ rtnl_unlock();
+ return ret;
+}
+
+static const struct file_operations nsim_qreset_fops = {
+ .open = simple_open,
+ .write = nsim_qreset_write,
+ .owner = THIS_MODULE,
+};
+
static ssize_t
nsim_pp_hold_read(struct file *file, char __user *data,
size_t count, loff_t *ppos)
@@ -628,7 +806,7 @@ nsim_pp_hold_write(struct file *file, const char __user *data,
if (!netif_running(ns->netdev) && val) {
ret = -ENETDOWN;
} else if (val) {
- ns->page = page_pool_dev_alloc_pages(ns->rq[0].page_pool);
+ ns->page = page_pool_dev_alloc_pages(ns->rq[0]->page_pool);
if (!ns->page)
ret = -ENOMEM;
} else {
@@ -677,27 +855,35 @@ static int nsim_queue_init(struct netdevsim *ns)
struct net_device *dev = ns->netdev;
int i;
- ns->rq = kvcalloc(dev->num_rx_queues, sizeof(*ns->rq),
- GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
+ ns->rq = kcalloc(dev->num_rx_queues, sizeof(*ns->rq),
+ GFP_KERNEL_ACCOUNT);
if (!ns->rq)
return -ENOMEM;
- for (i = 0; i < dev->num_rx_queues; i++)
- skb_queue_head_init(&ns->rq[i].skb_queue);
+ for (i = 0; i < dev->num_rx_queues; i++) {
+ ns->rq[i] = nsim_queue_alloc();
+ if (!ns->rq[i])
+ goto err_free_prev;
+ }
return 0;
+
+err_free_prev:
+ while (i--)
+ kfree(ns->rq[i]);
+ kfree(ns->rq);
+ return -ENOMEM;
}
-static void nsim_queue_free(struct netdevsim *ns)
+static void nsim_queue_uninit(struct netdevsim *ns)
{
struct net_device *dev = ns->netdev;
int i;
for (i = 0; i < dev->num_rx_queues; i++)
- skb_queue_purge_reason(&ns->rq[i].skb_queue,
- SKB_DROP_REASON_QUEUE_PURGE);
+ nsim_queue_free(ns->rq[i]);
- kvfree(ns->rq);
+ kfree(ns->rq);
ns->rq = NULL;
}
@@ -713,6 +899,7 @@ static int nsim_init_netdevsim(struct netdevsim *ns)
ns->phc = phc;
ns->netdev->netdev_ops = &nsim_netdev_ops;
ns->netdev->stat_ops = &nsim_stat_ops;
+ ns->netdev->queue_mgmt_ops = &nsim_queue_mgmt_ops;
err = nsim_udp_tunnels_info_create(ns->nsim_dev, ns->netdev);
if (err)
@@ -741,7 +928,7 @@ err_ipsec_teardown:
nsim_macsec_teardown(ns);
nsim_bpf_uninit(ns);
err_rq_destroy:
- nsim_queue_free(ns);
+ nsim_queue_uninit(ns);
err_utn_destroy:
rtnl_unlock();
nsim_udp_tunnels_info_destroy(ns->netdev);
@@ -798,6 +985,9 @@ nsim_create(struct nsim_dev *nsim_dev, struct nsim_dev_port *nsim_dev_port)
ns->pp_dfs = debugfs_create_file("pp_hold", 0600, nsim_dev_port->ddir,
ns, &nsim_pp_hold_fops);
+ ns->qr_dfs = debugfs_create_file("queue_reset", 0200,
+ nsim_dev_port->ddir, ns,
+ &nsim_qreset_fops);
return ns;
@@ -811,6 +1001,7 @@ void nsim_destroy(struct netdevsim *ns)
struct net_device *dev = ns->netdev;
struct netdevsim *peer;
+ debugfs_remove(ns->qr_dfs);
debugfs_remove(ns->pp_dfs);
rtnl_lock();
@@ -823,7 +1014,7 @@ void nsim_destroy(struct netdevsim *ns)
nsim_macsec_teardown(ns);
nsim_ipsec_teardown(ns);
nsim_bpf_uninit(ns);
- nsim_queue_free(ns);
+ nsim_queue_uninit(ns);
}
rtnl_unlock();
if (nsim_dev_port_is_pf(ns->nsim_dev_port))
diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h
index bf02efa10956..a70f62af4c88 100644
--- a/drivers/net/netdevsim/netdevsim.h
+++ b/drivers/net/netdevsim/netdevsim.h
@@ -101,7 +101,9 @@ struct netdevsim {
struct nsim_dev *nsim_dev;
struct nsim_dev_port *nsim_dev_port;
struct mock_phc *phc;
- struct nsim_rq *rq;
+ struct nsim_rq **rq;
+
+ int rq_reset_mode;
u64 tx_packets;
u64 tx_bytes;
@@ -134,6 +136,7 @@ struct netdevsim {
struct page *page;
struct dentry *pp_dfs;
+ struct dentry *qr_dfs;
struct nsim_ethtool ethtool;
struct netdevsim __rcu *peer;
diff --git a/net/core/dev.c b/net/core/dev.c
index 983c24927316..26f0c2fbb8aa 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6736,13 +6736,14 @@ static void napi_restore_config(struct napi_struct *n)
n->gro_flush_timeout = n->config->gro_flush_timeout;
n->irq_suspend_timeout = n->config->irq_suspend_timeout;
/* a NAPI ID might be stored in the config, if so use it. if not, use
- * napi_hash_add to generate one for us. It will be saved to the config
- * in napi_disable.
+ * napi_hash_add to generate one for us.
*/
- if (n->config->napi_id)
+ if (n->config->napi_id) {
napi_hash_add_with_id(n, n->config->napi_id);
- else
+ } else {
napi_hash_add(n);
+ n->config->napi_id = n->napi_id;
+ }
}
static void napi_save_config(struct napi_struct *n)
@@ -6750,10 +6751,39 @@ static void napi_save_config(struct napi_struct *n)
n->config->defer_hard_irqs = n->defer_hard_irqs;
n->config->gro_flush_timeout = n->gro_flush_timeout;
n->config->irq_suspend_timeout = n->irq_suspend_timeout;
- n->config->napi_id = n->napi_id;
napi_hash_del(n);
}
+/* Netlink wants the NAPI list to be sorted by ID, if adding a NAPI which will
+ * inherit an existing ID try to insert it at the right position.
+ */
+static void
+netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi)
+{
+ unsigned int new_id, pos_id;
+ struct list_head *higher;
+ struct napi_struct *pos;
+
+ new_id = UINT_MAX;
+ if (napi->config && napi->config->napi_id)
+ new_id = napi->config->napi_id;
+
+ higher = &dev->napi_list;
+ list_for_each_entry(pos, &dev->napi_list, dev_list) {
+ if (pos->napi_id >= MIN_NAPI_ID)
+ pos_id = pos->napi_id;
+ else if (pos->config)
+ pos_id = pos->config->napi_id;
+ else
+ pos_id = UINT_MAX;
+
+ if (pos_id <= new_id)
+ break;
+ higher = &pos->dev_list;
+ }
+ list_add_rcu(&napi->dev_list, higher); /* adds after higher */
+}
+
void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
int (*poll)(struct napi_struct *, int), int weight)
{
@@ -6780,7 +6810,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
napi->list_owner = -1;
set_bit(NAPI_STATE_SCHED, &napi->state);
set_bit(NAPI_STATE_NPSVC, &napi->state);
- list_add_rcu(&napi->dev_list, &dev->napi_list);
+ netif_napi_dev_list_add(dev, napi);
/* default settings from sysfs are applied to all NAPIs. any per-NAPI
* configuration will be loaded in napi_enable
diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c
index e217a5838c87..db82786fa0c4 100644
--- a/net/core/netdev_rx_queue.c
+++ b/net/core/netdev_rx_queue.c
@@ -79,3 +79,4 @@ err_free_new_mem:
return err;
}
+EXPORT_SYMBOL_NS_GPL(netdev_rx_queue_restart, "NETDEV_INTERNAL");
diff --git a/tools/testing/selftests/net/nl_netdev.py b/tools/testing/selftests/net/nl_netdev.py
index 93d9d914529b..93e8cb671c3d 100755
--- a/tools/testing/selftests/net/nl_netdev.py
+++ b/tools/testing/selftests/net/nl_netdev.py
@@ -18,6 +18,23 @@ def lo_check(nf) -> None:
ksft_eq(len(lo_info['xdp-rx-metadata-features']), 0)
+def napi_list_check(nf) -> None:
+ with NetdevSimDev(queue_count=100) as nsimdev:
+ nsim = nsimdev.nsims[0]
+
+ ip(f"link set dev {nsim.ifname} up")
+
+ napis = nf.napi_get({'ifindex': nsim.ifindex}, dump=True)
+ ksft_eq(len(napis), 100)
+
+ for q in [50, 0, 99]:
+ for i in range(4):
+ nsim.dfs_write("queue_reset", f"{q} {i}")
+ napis = nf.napi_get({'ifindex': nsim.ifindex}, dump=True)
+ ksft_eq(len(napis), 100,
+ comment=f"queue count after reset queue {q} mode {i}")
+
+
def page_pool_check(nf) -> None:
with NetdevSimDev() as nsimdev:
nsim = nsimdev.nsims[0]
@@ -89,7 +106,7 @@ def page_pool_check(nf) -> None:
def main() -> None:
nf = NetdevFamily()
- ksft_run([empty_check, lo_check, page_pool_check],
+ ksft_run([empty_check, lo_check, page_pool_check, napi_list_check],
args=(nf, ))
ksft_exit()