author     Linus Torvalds <torvalds@linux-foundation.org>  2024-11-21 08:28:08 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>  2024-11-21 08:28:08 -0800
commit     fcc79e1714e8c2b8e216dc3149812edd37884eef (patch)
tree       17a51d29db810b81412be040aaf380936b3261b4 /drivers/net/virtio_net.c
parent     6e95ef0258ff4ee23ae3b06bf6b00b33dbbd5ef7 (diff)
parent     dd7207838d38780b51e4690ee508ab2d5057e099 (diff)
Merge tag 'net-next-6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Paolo Abeni:
 "The most significant set of changes is the per netns RTNL. The new
  behavior is disabled by default; regression risk should be contained.

  Notably the new config knob PTP_1588_CLOCK_VMCLOCK will inherit its
  default value from PTP_1588_CLOCK_KVM, as the former is intended to be
  a more reliable replacement for the latter.

  Core:

   - Started a very large, in-progress effort to make the RTNL lock
     scope per network-namespace, thus significantly reducing the lock
     contention in the containerized use-case, comprising:
       - RCU-ified some relevant slices of the FIB control path
       - introduce basic per netns locking helpers
       - namespacified the IPv4 address hash table
       - remove rtnl_register{,_module}() in favour of rtnl_register_many()
       - refactor rtnl_{new,del,set}link() moving as much validation as
         possible out of the RTNL lock
       - convert all phonet doit() and dumpit() handlers to RCU
       - convert IPv4 addresses manipulation to per-netns RTNL
       - convert virtual interface creation to per-netns RTNL
     the per-netns lock infrastructure is guarded by the
     CONFIG_DEBUG_NET_SMALL_RTNL knob, disabled by default ad interim.

   - Introduce NAPI suspension, to efficiently switch between busy
     polling (NAPI processing suspended) and normal processing.

   - Migrate the IPv4 routing input, output and control path from direct
     ToS usage to DSCP macros. This is a work in progress to make ECN
     handling consistent and reliable.

   - Add drop reason support to the IPv4 route input path, allowing
     better introspection in case of packet drops.

   - Make FIB seqnum lockless, dropping RTNL protection for read access.

   - Make inet{,v6} address hashing less predictable.

   - Allow providing timestamp OPT_ID via cmsg, to correlate TX packets
     and timestamps.

  Things we sprinkled into general kernel code:

   - Add small file operations for debugfs, to reduce the struct ops
     size.

   - Refactoring and optimization for the implementation of the
     page_frag API. This is preparatory work to consolidate the
     page_frag implementation.

  Netfilter:

   - Optimize set element transactions to reduce memory consumption.

   - Extended netlink error reporting for attribute parser failures.

   - Make legacy xtables configs user-selectable, giving users the
     option to configure iptables without enabling any other config.

   - Address a lot of false-positive RCU issues, pointed out by recent
     CI improvements.

  BPF:

   - Put xsk sockets on a struct diet and add various cleanups. Overall,
     this helps to bump performance by 12% for some workloads.

   - Extend BPF selftests to increase coverage of XDP features in
     combination with BPF cpumap.

   - Optimize and homogenize the bpf_csum_diff helper for all archs and
     also add a batch of new BPF selftests for it.

   - Extend netkit with an option to delegate skb->{mark,priority}
     scrubbing to its BPF program.

   - Make the bpf_get_netns_cookie() helper available also to tc(x) BPF
     programs.

  Protocols:

   - Introduce a 4-tuple hash for connected UDP sockets, significantly
     speeding up connected socket lookup.

   - Add a fastpath for some TCP timers that usually expire after close,
     reducing the socket lock contention.

   - Add inbound and outbound xfrm state caches to speed up state
     lookups.

   - Avoid sending MPTCP advertisements on stale subflows, reducing the
     risk of losing them.

   - Make neighbour table flushing more scalable by maintaining
     per-device neigh lists.

  Driver API:

   - Introduce a unified interface to configure transmission H/W
     shaping, and expose it to user-space via generic netlink.

   - Add support for per-NAPI config via netlink. This makes NAPI
     configuration persistent across queue removal and re-creation.
     Requires driver updates; currently supported drivers are
     nVidia/Mellanox mlx4 and mlx5, Broadcom brcm and Intel ice.

   - Add ethtool support for writing SFP / PHY firmware blocks.

   - Track RSS context allocation from the ethtool core.

   - Implement support for mirroring to the DSA CPU port, via TC mirror
     offload.

   - Consolidate FDB update notifications, to avoid duplicates on
     device-specific entries.

   - Expose the DPLL clock quality level to user-space.

   - Support master-slave PHY config via device tree.

  Tests and tooling:

   - forwarding: introduce deferred commands, to simplify the cleanup
     phase.

  Drivers:

   - Updated several drivers - Amazon vNic, Google vNic, Microsoft vNic,
     Intel e1000e and Broadcom Tigon3 - to use netdev-genl to link the
     IRQs and queues to NAPI IDs, allowing busy polling and better
     introspection.

   - Ethernet high-speed NICs:
      - nVidia/Mellanox:
         - mlx5:
            - a large refactor to implement support for cross E-Switch
              scheduling
            - refactor H/W counter management to let it scale better
            - H/W GRO cleanups
      - Intel (100G, ice):
         - add support for ethtool reset
         - implement support for per TX queue H/W shaping
      - AMD/Solarflare:
         - implement per device queue stats support
      - Broadcom (bnxt):
         - improve wildcard l4proto on IPv4/IPv6 ntuple rules
      - Marvell Octeon:
         - add representor support for each Resource Virtualization Unit
           (RVU) device
      - Hisilicon:
         - add support for the BMC Gigabit Ethernet
      - IBM (EMAC):
         - driver cleanup and modernization
      - Cisco (VIC):
         - raise the queue number limit to 256

   - Ethernet virtual:
      - Google vNIC:
         - implement page pool support
      - macsec:
         - inherit lower device's features and TSO limits when
           offloading
      - virtio_net:
         - enable premapped mode by default
         - support for XDP socket (AF_XDP) zerocopy TX
      - wireguard:
         - set the TSO max size to be GSO_MAX_SIZE, to aggregate larger
           packets

   - Ethernet NICs embedded and virtual:
      - Broadcom ASP:
         - enable software timestamping
      - Freescale:
         - add enetc4 PF driver
      - MediaTek Airoha SoC:
         - implement BQL support
      - RealTek r8169:
         - enable TSO by default on r8168/r8125
         - implement extended ethtool stats
      - Renesas AVB:
         - enable TX checksum offload
      - Synopsys (stmmac):
         - support header splitting for VLAN-tagged packets
         - move common code for DWMAC4 and DWXGMAC into a separate FPE
           module
         - add dwmac driver support for the T-HEAD TH1520 SoC
      - Synopsys (xpcs):
         - driver refactor and cleanup
      - TI:
         - icssg_prueth: add VLAN offload support
      - Xilinx emaclite:
         - add clock support

   - Ethernet switches:
      - Microchip:
         - implement support for the lan969x Ethernet switch family
         - add LAN9646 switch support to the KSZ DSA driver

   - Ethernet PHYs:
      - Marvell: 88q2x: enable auto-negotiation
      - Microchip: add support for LAN865X Rev B1 and LAN867X Rev C1/C2

   - PTP:
      - add support for the Amazon virtual clock device
      - add PTP driver for s390 clocks

   - WiFi:
      - mac80211:
         - EHT 1024 aggregation size for transmissions
         - new operation to indicate that a new interface is to be added
         - support radio separation of multi-band devices
         - move wireless extension spy implementation to libiw
      - Broadcom:
         - brcmfmac: optional LPO clock support
      - Microchip:
         - add support for Atmel WILC3000
      - Qualcomm (ath12k):
         - firmware coredump collection support
         - add debugfs support for a multitude of statistics
      - Qualcomm (ath5k):
         - Arcadyan ARV45XX AR2417 & Gigaset SX76[23] AR241[34]A support
      - Realtek:
         - rtw88: 8821au and 8812au USB adapters support
         - rtw89: add thermal protection
         - rtw89: fine-tune BT-coexistence to improve user experience
         - rtw89: firmware secure boot for WiFi 6 chip

   - Bluetooth:
      - add Qualcomm WCN785x support for ids Foxconn 0xe0fc/0xe0f3 and
        0x13d3:0x3623
      - add Realtek RTL8852BE support for id Foxconn 0xe123
      - add MediaTek MT7920 support for wireless module ids
      - btintel_pcie: add handshake between driver and firmware
      - btintel_pcie: add recovery mechanism
      - btnxpuart: add GPIO support to power save feature"

* tag 'net-next-6.13' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1475 commits)
  mm: page_frag: fix a compile error when kernel is not compiled
  Documentation: tipc: fix formatting issue in tipc.rst
  selftests: nic_performance: Add selftest for performance of NIC driver
  selftests: nic_link_layer: Add selftest case for speed and duplex states
  selftests: nic_link_layer: Add link layer selftest for NIC driver
  bnxt_en: Add FW trace coredump segments to the coredump
  bnxt_en: Add a new ethtool -W dump flag
  bnxt_en: Add 2 parameters to bnxt_fill_coredump_seg_hdr()
  bnxt_en: Add functions to copy host context memory
  bnxt_en: Do not free FW log context memory
  bnxt_en: Manage the FW trace context memory
  bnxt_en: Allocate backing store memory for FW trace logs
  bnxt_en: Add a 'force' parameter to bnxt_free_ctx_mem()
  bnxt_en: Refactor bnxt_free_ctx_mem()
  bnxt_en: Add mem_valid bit to struct bnxt_ctx_mem_type
  bnxt_en: Update firmware interface spec to 1.10.3.85
  selftests/bpf: Add some tests with sockmap SK_PASS
  bpf: fix recursive lock when verdict program return SK_PASS
  wireguard: device: support big tcp GSO
  wireguard: selftests: load nf_conntrack if not present
  ...
Diffstat (limited to 'drivers/net/virtio_net.c')
-rw-r--r--  drivers/net/virtio_net.c   458
1 file changed, 351 insertions, 107 deletions
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 53a038fcbe991..64c87bb48a41c 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -45,9 +45,6 @@ module_param(napi_tx, bool, 0644);
#define VIRTIO_XDP_TX BIT(0)
#define VIRTIO_XDP_REDIR BIT(1)
-#define VIRTIO_XDP_FLAG BIT(0)
-#define VIRTIO_ORPHAN_FLAG BIT(1)
-
/* RX packet size EWMA. The average packet size is used to determine the packet
* buffer size when refilling RX rings. As the entire RX ring may be refilled
* at once, the weight is chosen so that the EWMA will be insensitive to short-
@@ -86,6 +83,7 @@ struct virtnet_sq_free_stats {
u64 bytes;
u64 napi_packets;
u64 napi_bytes;
+ u64 xsk;
};
struct virtnet_sq_stats {
@@ -298,6 +296,10 @@ struct send_queue {
/* Record whether sq is in reset state. */
bool reset;
+
+ struct xsk_buff_pool *xsk_pool;
+
+ dma_addr_t xsk_hdr_dma_addr;
};
/* Internal representation of a receive virtqueue */
@@ -356,9 +358,6 @@ struct receive_queue {
struct xdp_rxq_info xsk_rxq_info;
struct xdp_buff **xsk_buffs;
-
- /* Do dma by self */
- bool do_dma;
};
/* This structure can contain rss message with maximum settings for indirection table and keysize
@@ -501,6 +500,8 @@ struct virtio_net_common_hdr {
};
};
+static struct virtio_net_common_hdr xsk_hdr;
+
static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf);
static int virtnet_xdp_handler(struct bpf_prog *xdp_prog, struct xdp_buff *xdp,
struct net_device *dev,
@@ -512,6 +513,14 @@ static struct sk_buff *virtnet_skb_append_frag(struct sk_buff *head_skb,
struct sk_buff *curr_skb,
struct page *page, void *buf,
int len, int truesize);
+static void virtnet_xsk_completed(struct send_queue *sq, int num);
+
+enum virtnet_xmit_type {
+ VIRTNET_XMIT_TYPE_SKB,
+ VIRTNET_XMIT_TYPE_SKB_ORPHAN,
+ VIRTNET_XMIT_TYPE_XDP,
+ VIRTNET_XMIT_TYPE_XSK,
+};
static int rss_indirection_table_alloc(struct virtio_net_ctrl_rss *rss, u16 indir_table_size)
{
@@ -532,67 +541,99 @@ static void rss_indirection_table_free(struct virtio_net_ctrl_rss *rss)
kfree(rss->indirection_table);
}
-static bool is_xdp_frame(void *ptr)
-{
- return (unsigned long)ptr & VIRTIO_XDP_FLAG;
-}
+/* We use the last two bits of the pointer to distinguish the xmit type. */
+#define VIRTNET_XMIT_TYPE_MASK (BIT(0) | BIT(1))
+
+#define VIRTIO_XSK_FLAG_OFFSET 2
-static void *xdp_to_ptr(struct xdp_frame *ptr)
+static enum virtnet_xmit_type virtnet_xmit_ptr_unpack(void **ptr)
{
- return (void *)((unsigned long)ptr | VIRTIO_XDP_FLAG);
+ unsigned long p = (unsigned long)*ptr;
+
+ *ptr = (void *)(p & ~VIRTNET_XMIT_TYPE_MASK);
+
+ return p & VIRTNET_XMIT_TYPE_MASK;
}
-static struct xdp_frame *ptr_to_xdp(void *ptr)
+static void *virtnet_xmit_ptr_pack(void *ptr, enum virtnet_xmit_type type)
{
- return (struct xdp_frame *)((unsigned long)ptr & ~VIRTIO_XDP_FLAG);
+ return (void *)((unsigned long)ptr | type);
}
-static bool is_orphan_skb(void *ptr)
+static int virtnet_add_outbuf(struct send_queue *sq, int num, void *data,
+ enum virtnet_xmit_type type)
{
- return (unsigned long)ptr & VIRTIO_ORPHAN_FLAG;
+ return virtqueue_add_outbuf(sq->vq, sq->sg, num,
+ virtnet_xmit_ptr_pack(data, type),
+ GFP_ATOMIC);
}
-static void *skb_to_ptr(struct sk_buff *skb, bool orphan)
+static u32 virtnet_ptr_to_xsk_buff_len(void *ptr)
{
- return (void *)((unsigned long)skb | (orphan ? VIRTIO_ORPHAN_FLAG : 0));
+ return ((unsigned long)ptr) >> VIRTIO_XSK_FLAG_OFFSET;
}
-static struct sk_buff *ptr_to_skb(void *ptr)
+static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len)
{
- return (struct sk_buff *)((unsigned long)ptr & ~VIRTIO_ORPHAN_FLAG);
+ sg_dma_address(sg) = addr;
+ sg_dma_len(sg) = len;
}
static void __free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
bool in_napi, struct virtnet_sq_free_stats *stats)
{
+ struct xdp_frame *frame;
+ struct sk_buff *skb;
unsigned int len;
void *ptr;
while ((ptr = virtqueue_get_buf(sq->vq, &len)) != NULL) {
- if (!is_xdp_frame(ptr)) {
- struct sk_buff *skb = ptr_to_skb(ptr);
+ switch (virtnet_xmit_ptr_unpack(&ptr)) {
+ case VIRTNET_XMIT_TYPE_SKB:
+ skb = ptr;
pr_debug("Sent skb %p\n", skb);
+ stats->napi_packets++;
+ stats->napi_bytes += skb->len;
+ napi_consume_skb(skb, in_napi);
+ break;
- if (is_orphan_skb(ptr)) {
- stats->packets++;
- stats->bytes += skb->len;
- } else {
- stats->napi_packets++;
- stats->napi_bytes += skb->len;
- }
+ case VIRTNET_XMIT_TYPE_SKB_ORPHAN:
+ skb = ptr;
+
+ stats->packets++;
+ stats->bytes += skb->len;
napi_consume_skb(skb, in_napi);
- } else {
- struct xdp_frame *frame = ptr_to_xdp(ptr);
+ break;
+
+ case VIRTNET_XMIT_TYPE_XDP:
+ frame = ptr;
stats->packets++;
stats->bytes += xdp_get_frame_len(frame);
xdp_return_frame(frame);
+ break;
+
+ case VIRTNET_XMIT_TYPE_XSK:
+ stats->bytes += virtnet_ptr_to_xsk_buff_len(ptr);
+ stats->xsk++;
+ break;
}
}
netdev_tx_completed_queue(txq, stats->napi_packets, stats->napi_bytes);
}
+static void virtnet_free_old_xmit(struct send_queue *sq,
+ struct netdev_queue *txq,
+ bool in_napi,
+ struct virtnet_sq_free_stats *stats)
+{
+ __free_old_xmit(sq, txq, in_napi, stats);
+
+ if (stats->xsk)
+ virtnet_xsk_completed(sq, stats->xsk);
+}
+
/* Converting between virtqueue no. and kernel tx/rx queue no.
* 0:rx0 1:tx0 2:rx1 3:tx1 ... 2N:rxN 2N+1:txN 2N+2:cvq
*/
@@ -876,11 +917,14 @@ ok:
static void virtnet_rq_unmap(struct receive_queue *rq, void *buf, u32 len)
{
+ struct virtnet_info *vi = rq->vq->vdev->priv;
struct page *page = virt_to_head_page(buf);
struct virtnet_rq_dma *dma;
void *head;
int offset;
+ BUG_ON(vi->big_packets && !vi->mergeable_rx_bufs);
+
head = page_address(page);
dma = head;
@@ -905,10 +949,13 @@ static void virtnet_rq_unmap(struct receive_queue *rq, void *buf, u32 len)
static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx)
{
+ struct virtnet_info *vi = rq->vq->vdev->priv;
void *buf;
+ BUG_ON(vi->big_packets && !vi->mergeable_rx_bufs);
+
buf = virtqueue_get_buf_ctx(rq->vq, len, ctx);
- if (buf && rq->do_dma)
+ if (buf)
virtnet_rq_unmap(rq, buf, *len);
return buf;
@@ -916,15 +963,13 @@ static void *virtnet_rq_get_buf(struct receive_queue *rq, u32 *len, void **ctx)
static void virtnet_rq_init_one_sg(struct receive_queue *rq, void *buf, u32 len)
{
+ struct virtnet_info *vi = rq->vq->vdev->priv;
struct virtnet_rq_dma *dma;
dma_addr_t addr;
u32 offset;
void *head;
- if (!rq->do_dma) {
- sg_init_one(rq->sg, buf, len);
- return;
- }
+ BUG_ON(vi->big_packets && !vi->mergeable_rx_bufs);
head = page_address(rq->alloc_frag.page);
@@ -935,60 +980,57 @@ static void virtnet_rq_init_one_sg(struct receive_queue *rq, void *buf, u32 len)
addr = dma->addr - sizeof(*dma) + offset;
sg_init_table(rq->sg, 1);
- rq->sg[0].dma_address = addr;
- rq->sg[0].length = len;
+ sg_fill_dma(rq->sg, addr, len);
}
static void *virtnet_rq_alloc(struct receive_queue *rq, u32 size, gfp_t gfp)
{
struct page_frag *alloc_frag = &rq->alloc_frag;
+ struct virtnet_info *vi = rq->vq->vdev->priv;
struct virtnet_rq_dma *dma;
void *buf, *head;
dma_addr_t addr;
- if (unlikely(!skb_page_frag_refill(size, alloc_frag, gfp)))
- return NULL;
+ BUG_ON(vi->big_packets && !vi->mergeable_rx_bufs);
head = page_address(alloc_frag->page);
- if (rq->do_dma) {
- dma = head;
-
- /* new pages */
- if (!alloc_frag->offset) {
- if (rq->last_dma) {
- /* Now, the new page is allocated, the last dma
- * will not be used. So the dma can be unmapped
- * if the ref is 0.
- */
- virtnet_rq_unmap(rq, rq->last_dma, 0);
- rq->last_dma = NULL;
- }
+ dma = head;
- dma->len = alloc_frag->size - sizeof(*dma);
+ /* new pages */
+ if (!alloc_frag->offset) {
+ if (rq->last_dma) {
+ /* Now, the new page is allocated, the last dma
+ * will not be used. So the dma can be unmapped
+ * if the ref is 0.
+ */
+ virtnet_rq_unmap(rq, rq->last_dma, 0);
+ rq->last_dma = NULL;
+ }
- addr = virtqueue_dma_map_single_attrs(rq->vq, dma + 1,
- dma->len, DMA_FROM_DEVICE, 0);
- if (virtqueue_dma_mapping_error(rq->vq, addr))
- return NULL;
+ dma->len = alloc_frag->size - sizeof(*dma);
- dma->addr = addr;
- dma->need_sync = virtqueue_dma_need_sync(rq->vq, addr);
+ addr = virtqueue_dma_map_single_attrs(rq->vq, dma + 1,
+ dma->len, DMA_FROM_DEVICE, 0);
+ if (virtqueue_dma_mapping_error(rq->vq, addr))
+ return NULL;
- /* Add a reference to dma to prevent the entire dma from
- * being released during error handling. This reference
- * will be freed after the pages are no longer used.
- */
- get_page(alloc_frag->page);
- dma->ref = 1;
- alloc_frag->offset = sizeof(*dma);
+ dma->addr = addr;
+ dma->need_sync = virtqueue_dma_need_sync(rq->vq, addr);
- rq->last_dma = dma;
- }
+ /* Add a reference to dma to prevent the entire dma from
+ * being released during error handling. This reference
+ * will be freed after the pages are no longer used.
+ */
+ get_page(alloc_frag->page);
+ dma->ref = 1;
+ alloc_frag->offset = sizeof(*dma);
- ++dma->ref;
+ rq->last_dma = dma;
}
+ ++dma->ref;
+
buf = head + alloc_frag->offset;
get_page(alloc_frag->page);
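
For orientation on the restructured allocation path above: the driver keeps a small struct virtnet_rq_dma at the head of every page used by the rx page_frag (premapped DMA address, mapped length, need_sync flag and a reference count). Each buffer carved out of the page starts at head + alloc_frag->offset, and the page's mapping is released only once every carved buffer has been returned. A toy user-space sketch of that "metadata header plus refcount at the front of a shared buffer" layout; the DMA mapping is faked and all names are invented for illustration:

#include <stdint.h>
#include <stdio.h>

#define FRAG_SZ 4096

/* Stand-in for struct virtnet_rq_dma: bookkeeping kept at the page head. */
struct frag_meta {
	uint64_t dma_addr;	/* pretend result of a DMA mapping call */
	uint32_t len;		/* mapped length: page size minus this header */
	uint32_t ref;		/* buffers carved from this page still in flight */
};

struct frag_page {
	unsigned char data[FRAG_SZ];
	size_t offset;		/* next free byte, starts right after the header */
};

static void frag_page_init(struct frag_page *p)
{
	struct frag_meta *m = (struct frag_meta *)p->data;

	m->dma_addr = 0xdead0000ULL;	/* fake device address */
	m->len = FRAG_SZ - sizeof(*m);
	m->ref = 0;
	p->offset = sizeof(*m);
}

/* Carve a buffer from the page; each successful call is paired with frag_put(). */
static void *frag_alloc(struct frag_page *p, size_t size)
{
	struct frag_meta *m = (struct frag_meta *)p->data;

	if (p->offset + size > FRAG_SZ)
		return NULL;
	m->ref++;
	p->offset += size;
	return p->data + p->offset - size;
}

static void frag_put(struct frag_page *p)
{
	struct frag_meta *m = (struct frag_meta *)p->data;

	if (--m->ref == 0)
		printf("last buffer returned: unmap 0x%llx, len %u\n",
		       (unsigned long long)m->dma_addr, m->len);
}

int main(void)
{
	struct frag_page page;

	frag_page_init(&page);
	frag_alloc(&page, 1536);
	frag_alloc(&page, 1536);
	frag_put(&page);
	frag_put(&page);	/* ref drops to zero here, so the "mapping" is released */
	return 0;
}
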
@@ -1010,7 +1052,7 @@ static void virtnet_rq_unmap_free_buf(struct virtqueue *vq, void *buf)
return;
}
- if (rq->do_dma)
+ if (!vi->big_packets || vi->mergeable_rx_bufs)
virtnet_rq_unmap(rq, buf, 0);
virtnet_rq_free_buf(vi, rq, buf);
@@ -1021,7 +1063,7 @@ static void free_old_xmit(struct send_queue *sq, struct netdev_queue *txq,
{
struct virtnet_sq_free_stats stats = {0};
- __free_old_xmit(sq, txq, in_napi, &stats);
+ virtnet_free_old_xmit(sq, txq, in_napi, &stats);
/* Avoid overhead when no packets have been processed
* happens when called speculatively from start_xmit.
@@ -1088,12 +1130,6 @@ static void check_sq_full_and_disable(struct virtnet_info *vi,
}
}
-static void sg_fill_dma(struct scatterlist *sg, dma_addr_t addr, u32 len)
-{
- sg->dma_address = addr;
- sg->length = len;
-}
-
static struct xdp_buff *buf_to_xdp(struct virtnet_info *vi,
struct receive_queue *rq, void *buf, u32 len)
{
@@ -1374,7 +1410,8 @@ static int virtnet_add_recvbuf_xsk(struct virtnet_info *vi, struct receive_queue
sg_init_table(rq->sg, 1);
sg_fill_dma(rq->sg, addr, len);
- err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, xsk_buffs[i], gfp);
+ err = virtqueue_add_inbuf_premapped(rq->vq, rq->sg, 1,
+ xsk_buffs[i], NULL, gfp);
if (err)
goto err;
}
@@ -1388,6 +1425,120 @@ err:
return err;
}
+static void *virtnet_xsk_to_ptr(u32 len)
+{
+ unsigned long p;
+
+ p = len << VIRTIO_XSK_FLAG_OFFSET;
+
+ return virtnet_xmit_ptr_pack((void *)p, VIRTNET_XMIT_TYPE_XSK);
+}
+
+static int virtnet_xsk_xmit_one(struct send_queue *sq,
+ struct xsk_buff_pool *pool,
+ struct xdp_desc *desc)
+{
+ struct virtnet_info *vi;
+ dma_addr_t addr;
+
+ vi = sq->vq->vdev->priv;
+
+ addr = xsk_buff_raw_get_dma(pool, desc->addr);
+ xsk_buff_raw_dma_sync_for_device(pool, addr, desc->len);
+
+ sg_init_table(sq->sg, 2);
+ sg_fill_dma(sq->sg, sq->xsk_hdr_dma_addr, vi->hdr_len);
+ sg_fill_dma(sq->sg + 1, addr, desc->len);
+
+ return virtqueue_add_outbuf_premapped(sq->vq, sq->sg, 2,
+ virtnet_xsk_to_ptr(desc->len),
+ GFP_ATOMIC);
+}
+
+static int virtnet_xsk_xmit_batch(struct send_queue *sq,
+ struct xsk_buff_pool *pool,
+ unsigned int budget,
+ u64 *kicks)
+{
+ struct xdp_desc *descs = pool->tx_descs;
+ bool kick = false;
+ u32 nb_pkts, i;
+ int err;
+
+ budget = min_t(u32, budget, sq->vq->num_free);
+
+ nb_pkts = xsk_tx_peek_release_desc_batch(pool, budget);
+ if (!nb_pkts)
+ return 0;
+
+ for (i = 0; i < nb_pkts; i++) {
+ err = virtnet_xsk_xmit_one(sq, pool, &descs[i]);
+ if (unlikely(err)) {
+ xsk_tx_completed(sq->xsk_pool, nb_pkts - i);
+ break;
+ }
+
+ kick = true;
+ }
+
+ if (kick && virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq))
+ (*kicks)++;
+
+ return i;
+}
+
+static bool virtnet_xsk_xmit(struct send_queue *sq, struct xsk_buff_pool *pool,
+ int budget)
+{
+ struct virtnet_info *vi = sq->vq->vdev->priv;
+ struct virtnet_sq_free_stats stats = {};
+ struct net_device *dev = vi->dev;
+ u64 kicks = 0;
+ int sent;
+
+ /* Avoid waking up napi needlessly, so call __free_old_xmit() instead of
+ * free_old_xmit().
+ */
+ __free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq), true, &stats);
+
+ if (stats.xsk)
+ xsk_tx_completed(sq->xsk_pool, stats.xsk);
+
+ sent = virtnet_xsk_xmit_batch(sq, pool, budget, &kicks);
+
+ if (!is_xdp_raw_buffer_queue(vi, sq - vi->sq))
+ check_sq_full_and_disable(vi, vi->dev, sq);
+
+ if (sent) {
+ struct netdev_queue *txq;
+
+ txq = netdev_get_tx_queue(vi->dev, sq - vi->sq);
+ txq_trans_cond_update(txq);
+ }
+
+ u64_stats_update_begin(&sq->stats.syncp);
+ u64_stats_add(&sq->stats.packets, stats.packets);
+ u64_stats_add(&sq->stats.bytes, stats.bytes);
+ u64_stats_add(&sq->stats.kicks, kicks);
+ u64_stats_add(&sq->stats.xdp_tx, sent);
+ u64_stats_update_end(&sq->stats.syncp);
+
+ if (xsk_uses_need_wakeup(pool))
+ xsk_set_tx_need_wakeup(pool);
+
+ return sent;
+}
+
+static void xsk_wakeup(struct send_queue *sq)
+{
+ if (napi_if_scheduled_mark_missed(&sq->napi))
+ return;
+
+ local_bh_disable();
+ virtqueue_napi_schedule(&sq->napi, sq->vq);
+ local_bh_enable();
+}
+
static int virtnet_xsk_wakeup(struct net_device *dev, u32 qid, u32 flag)
{
struct virtnet_info *vi = netdev_priv(dev);
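
One detail worth calling out from the hunk above: an XSK tx descriptor has no kernel object for the completion token to point at, so the token carries the payload length instead. virtnet_xsk_to_ptr() shifts the length left by VIRTIO_XSK_FLAG_OFFSET (2) and tags the low bits as VIRTNET_XMIT_TYPE_XSK, and virtnet_ptr_to_xsk_buff_len() shifts it back when the buffer completes, feeding the byte count into the tx stats. A stand-alone illustration of that round trip, with names made up for the example:

#include <stdint.h>
#include <stdio.h>

#define TYPE_MASK	0x3UL	/* low two bits: xmit type */
#define TYPE_XSK	0x3UL	/* tag used for XSK completions in this sketch */
#define LEN_SHIFT	2	/* the length lives above the type bits */

/* Encode: no pointer exists for an XSK tx buffer, so store the length itself. */
static void *xsk_len_to_token(uint32_t len)
{
	return (void *)(((uintptr_t)len << LEN_SHIFT) | TYPE_XSK);
}

/* Decode on tx completion: recover the byte count for the stats. */
static uint32_t token_to_xsk_len(void *token)
{
	return (uint32_t)((uintptr_t)token >> LEN_SHIFT);
}

int main(void)
{
	void *token = xsk_len_to_token(1500);

	if (((uintptr_t)token & TYPE_MASK) == TYPE_XSK)
		printf("xsk completion, len=%u\n", token_to_xsk_len(token));
	return 0;
}
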
@@ -1401,14 +1552,19 @@ static int virtnet_xsk_wakeup(struct net_device *dev, u32 qid, u32 flag)
sq = &vi->sq[qid];
- if (napi_if_scheduled_mark_missed(&sq->napi))
- return 0;
+ xsk_wakeup(sq);
+ return 0;
+}
- local_bh_disable();
- virtqueue_napi_schedule(&sq->napi, sq->vq);
- local_bh_enable();
+static void virtnet_xsk_completed(struct send_queue *sq, int num)
+{
+ xsk_tx_completed(sq->xsk_pool, num);
- return 0;
+ /* If this is called from rx poll, start_xmit or xdp xmit, we should
+ * wake up the tx napi to consume the xsk tx queue, because the tx
+ * interrupt may not be triggered.
+ */
+ xsk_wakeup(sq);
}
static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
@@ -1451,8 +1607,7 @@ static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
skb_frag_size(frag), skb_frag_off(frag));
}
- err = virtqueue_add_outbuf(sq->vq, sq->sg, nr_frags + 1,
- xdp_to_ptr(xdpf), GFP_ATOMIC);
+ err = virtnet_add_outbuf(sq, nr_frags + 1, xdpf, VIRTNET_XMIT_TYPE_XDP);
if (unlikely(err))
return -ENOSPC; /* Caller handle free/refcnt */
@@ -1525,8 +1680,8 @@ static int virtnet_xdp_xmit(struct net_device *dev,
}
/* Free up any pending old buffers before queueing new ones. */
- __free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq),
- false, &stats);
+ virtnet_free_old_xmit(sq, netdev_get_tx_queue(dev, sq - vi->sq),
+ false, &stats);
for (i = 0; i < n; i++) {
struct xdp_frame *xdpf = frames[i];
@@ -2443,6 +2598,9 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
len = SKB_DATA_ALIGN(len) +
SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ if (unlikely(!skb_page_frag_refill(len, &rq->alloc_frag, gfp)))
+ return -ENOMEM;
+
buf = virtnet_rq_alloc(rq, len, gfp);
if (unlikely(!buf))
return -ENOMEM;
@@ -2451,10 +2609,9 @@ static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
virtnet_rq_init_one_sg(rq, buf, vi->hdr_len + GOOD_PACKET_LEN);
- err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
+ err = virtqueue_add_inbuf_premapped(rq->vq, rq->sg, 1, buf, ctx, gfp);
if (err < 0) {
- if (rq->do_dma)
- virtnet_rq_unmap(rq, buf, 0);
+ virtnet_rq_unmap(rq, buf, 0);
put_page(virt_to_head_page(buf));
}
@@ -2545,6 +2702,12 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
*/
len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
+ if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
+ return -ENOMEM;
+
+ if (!alloc_frag->offset && len + room + sizeof(struct virtnet_rq_dma) > alloc_frag->size)
+ len -= sizeof(struct virtnet_rq_dma);
+
buf = virtnet_rq_alloc(rq, len + room, gfp);
if (unlikely(!buf))
return -ENOMEM;
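
The size adjustment added above keeps the receive buffer and the struct virtnet_rq_dma bookkeeping header within one page: when a fresh frag page is about to be used (offset 0) and len + room plus the header would overflow the frag, len is trimmed by the header size. A small worked example of that check under assumed sizes (4096-byte frag, 16-byte header, room = 0; both figures are assumptions for illustration only):

#include <stdio.h>

int main(void)
{
	unsigned int frag_size = 4096;	/* assumed alloc_frag->size */
	unsigned int hdr = 16;		/* assumed sizeof(struct virtnet_rq_dma) */
	unsigned int room = 0;
	unsigned int len = 4096;	/* assumed worst case from get_mergeable_buf_len() */

	/* Fresh page (offset == 0): the buffer and the metadata header must both fit. */
	if (len + room + hdr > frag_size)
		len -= hdr;		/* 4096 -> 4080, leaving space for the header */

	printf("len=%u hdr=%u total=%u fits in %u\n",
	       len, hdr, len + room + hdr, frag_size);
	return 0;
}
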
@@ -2566,10 +2729,9 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
virtnet_rq_init_one_sg(rq, buf, len);
ctx = mergeable_len_to_ctx(len + room, headroom);
- err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
+ err = virtqueue_add_inbuf_premapped(rq->vq, rq->sg, 1, buf, ctx, gfp);
if (err < 0) {
- if (rq->do_dma)
- virtnet_rq_unmap(rq, buf, 0);
+ virtnet_rq_unmap(rq, buf, 0);
put_page(virt_to_head_page(buf));
}
@@ -2726,7 +2888,7 @@ static int virtnet_receive_packets(struct virtnet_info *vi,
}
} else {
while (packets < budget &&
- (buf = virtnet_rq_get_buf(rq, &len, NULL)) != NULL) {
+ (buf = virtqueue_get_buf(rq->vq, &len)) != NULL) {
receive_buf(vi, rq, buf, len, NULL, xdp_xmit, stats);
packets++;
}
@@ -2824,7 +2986,7 @@ static void virtnet_rx_dim_update(struct virtnet_info *vi, struct receive_queue
u64_stats_read(&rq->stats.bytes),
&cur_sample);
- net_dim(&rq->dim, cur_sample);
+ net_dim(&rq->dim, &cur_sample);
rq->packets_in_napi = 0;
}
@@ -2976,7 +3138,7 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
struct virtnet_info *vi = sq->vq->vdev->priv;
unsigned int index = vq2txq(sq->vq);
struct netdev_queue *txq;
- int opaque;
+ int opaque, xsk_done = 0;
bool done;
if (unlikely(is_xdp_raw_buffer_queue(vi, index))) {
@@ -2988,7 +3150,11 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
txq = netdev_get_tx_queue(vi->dev, index);
__netif_tx_lock(txq, raw_smp_processor_id());
virtqueue_disable_cb(sq->vq);
- free_old_xmit(sq, txq, !!budget);
+
+ if (sq->xsk_pool)
+ xsk_done = virtnet_xsk_xmit(sq, sq->xsk_pool, budget);
+ else
+ free_old_xmit(sq, txq, !!budget);
if (sq->vq->num_free >= 2 + MAX_SKB_FRAGS) {
if (netif_tx_queue_stopped(txq)) {
@@ -2999,6 +3165,11 @@ static int virtnet_poll_tx(struct napi_struct *napi, int budget)
netif_tx_wake_queue(txq);
}
+ if (xsk_done >= budget) {
+ __netif_tx_unlock(txq);
+ return budget;
+ }
+
opaque = virtqueue_enable_cb_prepare(sq->vq);
done = napi_complete_done(napi, 0);
@@ -3066,8 +3237,9 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb, bool orphan)
return num_sg;
num_sg++;
}
- return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
- skb_to_ptr(skb, orphan), GFP_ATOMIC);
+
+ return virtnet_add_outbuf(sq, num_sg, skb,
+ orphan ? VIRTNET_XMIT_TYPE_SKB_ORPHAN : VIRTNET_XMIT_TYPE_SKB);
}
static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
@@ -5072,7 +5244,7 @@ static int virtnet_set_coalesce(struct net_device *dev,
struct netlink_ext_ack *extack)
{
struct virtnet_info *vi = netdev_priv(dev);
- int ret, queue_number, napi_weight;
+ int ret, queue_number, napi_weight, i;
bool update_napi = false;
/* Can't change NAPI weight if the link is up */
@@ -5101,6 +5273,14 @@ static int virtnet_set_coalesce(struct net_device *dev,
return ret;
if (update_napi) {
+ /* xsk xmit depends on the tx napi, so if xsk is active,
+ * prevent modifications to the tx napi.
+ */
+ for (i = queue_number; i < vi->max_queue_pairs; i++) {
+ if (vi->sq[i].xsk_pool)
+ return -EBUSY;
+ }
+
for (; queue_number < vi->max_queue_pairs; queue_number++)
vi->sq[queue_number].napi.weight = napi_weight;
}
@@ -5549,6 +5729,29 @@ unreg:
return err;
}
+static int virtnet_sq_bind_xsk_pool(struct virtnet_info *vi,
+ struct send_queue *sq,
+ struct xsk_buff_pool *pool)
+{
+ int err, qindex;
+
+ qindex = sq - vi->sq;
+
+ virtnet_tx_pause(vi, sq);
+
+ err = virtqueue_reset(sq->vq, virtnet_sq_free_unused_buf);
+ if (err) {
+ netdev_err(vi->dev, "reset tx fail: tx queue index: %d err: %d\n", qindex, err);
+ pool = NULL;
+ }
+
+ sq->xsk_pool = pool;
+
+ virtnet_tx_resume(vi, sq);
+
+ return err;
+}
+
static int virtnet_xsk_pool_enable(struct net_device *dev,
struct xsk_buff_pool *pool,
u16 qid)
@@ -5557,6 +5760,7 @@ static int virtnet_xsk_pool_enable(struct net_device *dev,
struct receive_queue *rq;
struct device *dma_dev;
struct send_queue *sq;
+ dma_addr_t hdr_dma;
int err, size;
if (vi->hdr_len > xsk_pool_get_headroom(pool))
@@ -5594,6 +5798,11 @@ static int virtnet_xsk_pool_enable(struct net_device *dev,
if (!rq->xsk_buffs)
return -ENOMEM;
+ hdr_dma = virtqueue_dma_map_single_attrs(sq->vq, &xsk_hdr, vi->hdr_len,
+ DMA_TO_DEVICE, 0);
+ if (virtqueue_dma_mapping_error(sq->vq, hdr_dma))
+ return -ENOMEM;
+
err = xsk_pool_dma_map(pool, dma_dev, 0);
if (err)
goto err_xsk_map;
@@ -5602,11 +5811,24 @@ static int virtnet_xsk_pool_enable(struct net_device *dev,
if (err)
goto err_rq;
+ err = virtnet_sq_bind_xsk_pool(vi, sq, pool);
+ if (err)
+ goto err_sq;
+
+ /* We do not currently support tx offload (such as tx csum), so the tx
+ * virtnet hdr is always zero and all tx packets can share a single hdr.
+ */
+ sq->xsk_hdr_dma_addr = hdr_dma;
+
return 0;
+err_sq:
+ virtnet_rq_bind_xsk_pool(vi, rq, NULL);
err_rq:
xsk_pool_dma_unmap(pool, 0);
err_xsk_map:
+ virtqueue_dma_unmap_single_attrs(rq->vq, hdr_dma, vi->hdr_len,
+ DMA_TO_DEVICE, 0);
return err;
}
@@ -5615,19 +5837,24 @@ static int virtnet_xsk_pool_disable(struct net_device *dev, u16 qid)
struct virtnet_info *vi = netdev_priv(dev);
struct xsk_buff_pool *pool;
struct receive_queue *rq;
+ struct send_queue *sq;
int err;
if (qid >= vi->curr_queue_pairs)
return -EINVAL;
+ sq = &vi->sq[qid];
rq = &vi->rq[qid];
pool = rq->xsk_pool;
err = virtnet_rq_bind_xsk_pool(vi, rq, NULL);
+ err |= virtnet_sq_bind_xsk_pool(vi, sq, NULL);
xsk_pool_dma_unmap(pool, 0);
+ virtqueue_dma_unmap_single_attrs(sq->vq, sq->xsk_hdr_dma_addr,
+ vi->hdr_len, DMA_TO_DEVICE, 0);
kvfree(rq->xsk_buffs);
return err;
@@ -5977,7 +6204,7 @@ static void free_receive_page_frags(struct virtnet_info *vi)
int i;
for (i = 0; i < vi->max_queue_pairs; i++)
if (vi->rq[i].alloc_frag.page) {
- if (vi->rq[i].do_dma && vi->rq[i].last_dma)
+ if (vi->rq[i].last_dma)
virtnet_rq_unmap(&vi->rq[i], vi->rq[i].last_dma, 0);
put_page(vi->rq[i].alloc_frag.page);
}
@@ -5985,10 +6212,26 @@ static void free_receive_page_frags(struct virtnet_info *vi)
static void virtnet_sq_free_unused_buf(struct virtqueue *vq, void *buf)
{
- if (!is_xdp_frame(buf))
+ struct virtnet_info *vi = vq->vdev->priv;
+ struct send_queue *sq;
+ int i = vq2rxq(vq);
+
+ sq = &vi->sq[i];
+
+ switch (virtnet_xmit_ptr_unpack(&buf)) {
+ case VIRTNET_XMIT_TYPE_SKB:
+ case VIRTNET_XMIT_TYPE_SKB_ORPHAN:
dev_kfree_skb(buf);
- else
- xdp_return_frame(ptr_to_xdp(buf));
+ break;
+
+ case VIRTNET_XMIT_TYPE_XDP:
+ xdp_return_frame(buf);
+ break;
+
+ case VIRTNET_XMIT_TYPE_XSK:
+ xsk_tx_completed(sq->xsk_pool, 1);
+ break;
+ }
}
static void free_unused_bufs(struct virtnet_info *vi)
@@ -6436,7 +6679,8 @@ static int virtnet_probe(struct virtio_device *vdev)
dev->hw_features |= NETIF_F_GRO_HW;
dev->vlan_features = dev->features;
- dev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT;
+ dev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT |
+ NETDEV_XDP_ACT_XSK_ZEROCOPY;
/* MTU range: 68 - 65535 */
dev->min_mtu = MIN_MTU;