diff options
author | Alexander Lobakin <aleksander.lobakin@intel.com> | 2025-06-12 18:02:34 +0200 |
---|---|---|
committer | Tony Nguyen <anthony.l.nguyen@intel.com> | 2025-06-16 11:40:15 -0700 |
commit | 80bae9df2108cb72a060ee5235614d7c072af1de (patch) | |
tree | 57f41ffd77c44638e528d3d37eb9ed6204e1c141 | |
parent | 3ced71a8b39e84f91a4fa9d42e85815515f9b1bc (diff) |
libeth: xdp, xsk: access adjacent u32s as u64 where applicable
On 64-bit systems, writing/reading one u64 is faster than two u32s even
when they're are adjacent in a struct. The compilers won't guarantee
they will combine those; I observed both successful and unsuccessful
attempts with both GCC and Clang, and it's not easy to say what it
depends on.
There's a few places in libeth_xdp winning up to several percent from
combined access (both performance and object code size, especially
when unrolling). Add __LIBETH_WORD_ACCESS and use it there on LE.
Drivers are free to optimize HW-specific callbacks under the same
definition.
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
-rw-r--r-- | include/net/libeth/xdp.h | 29 | ||||
-rw-r--r-- | include/net/libeth/xsk.h | 10 |
2 files changed, 31 insertions, 8 deletions
diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h index dba09a9168f1..6ce6aec6884c 100644 --- a/include/net/libeth/xdp.h +++ b/include/net/libeth/xdp.h @@ -475,6 +475,21 @@ struct libeth_xdp_tx_desc { ((const void *)(uintptr_t)(priv)); \ }) +/* + * On 64-bit systems, assigning one u64 is faster than two u32s. When ::len + * occupies lowest 32 bits (LE), whole ::opts can be assigned directly instead. + */ +#ifdef __LITTLE_ENDIAN +#define __LIBETH_WORD_ACCESS 1 +#endif +#ifdef __LIBETH_WORD_ACCESS +#define __libeth_xdp_tx_len(flen, ...) \ + .opts = ((flen) | FIELD_PREP(GENMASK_ULL(63, 32), (__VA_ARGS__ + 0))) +#else +#define __libeth_xdp_tx_len(flen, ...) \ + .len = (flen), .flags = (__VA_ARGS__ + 0) +#endif + /** * libeth_xdp_tx_xmit_bulk - main XDP Tx function * @bulk: array of frames to send @@ -870,8 +885,7 @@ static inline u32 libeth_xdp_xmit_queue_head(struct libeth_xdp_tx_bulk *bq, bq->bulk[bq->count++] = (typeof(*bq->bulk)){ .xdpf = xdpf, - .len = xdpf->len, - .flags = LIBETH_XDP_TX_FIRST, + __libeth_xdp_tx_len(xdpf->len, LIBETH_XDP_TX_FIRST), }; if (!xdp_frame_has_frags(xdpf)) @@ -902,7 +916,7 @@ static inline bool libeth_xdp_xmit_queue_frag(struct libeth_xdp_tx_bulk *bq, bq->bulk[bq->count++] = (typeof(*bq->bulk)){ .dma = dma, - .len = skb_frag_size(frag), + __libeth_xdp_tx_len(skb_frag_size(frag)), }; return true; @@ -1260,6 +1274,7 @@ bool libeth_xdp_buff_add_frag(struct libeth_xdp_buff *xdp, * Internal, use libeth_xdp_process_buff() instead. Initializes XDP buffer * head with the Rx buffer data: data pointer, length, headroom, and * truesize/tailroom. Zeroes the flags. + * Uses faster single u64 write instead of per-field access. */ static inline void libeth_xdp_prepare_buff(struct libeth_xdp_buff *xdp, const struct libeth_fqe *fqe, @@ -1267,7 +1282,15 @@ static inline void libeth_xdp_prepare_buff(struct libeth_xdp_buff *xdp, { const struct page *page = __netmem_to_page(fqe->netmem); +#ifdef __LIBETH_WORD_ACCESS + static_assert(offsetofend(typeof(xdp->base), flags) - + offsetof(typeof(xdp->base), frame_sz) == + sizeof(u64)); + + *(u64 *)&xdp->base.frame_sz = fqe->truesize; +#else xdp_init_buff(&xdp->base, fqe->truesize, xdp->base.rxq); +#endif xdp_prepare_buff(&xdp->base, page_address(page) + fqe->offset, page->pp->p.offset, len, true); } diff --git a/include/net/libeth/xsk.h b/include/net/libeth/xsk.h index 213778a68476..481a7b28e6f2 100644 --- a/include/net/libeth/xsk.h +++ b/include/net/libeth/xsk.h @@ -26,8 +26,8 @@ static inline bool libeth_xsk_tx_queue_head(struct libeth_xdp_tx_bulk *bq, { bq->bulk[bq->count++] = (typeof(*bq->bulk)){ .xsk = xdp, - .len = xdp->base.data_end - xdp->data, - .flags = LIBETH_XDP_TX_FIRST, + __libeth_xdp_tx_len(xdp->base.data_end - xdp->data, + LIBETH_XDP_TX_FIRST), }; if (likely(!xdp_buff_has_frags(&xdp->base))) @@ -48,7 +48,7 @@ static inline void libeth_xsk_tx_queue_frag(struct libeth_xdp_tx_bulk *bq, { bq->bulk[bq->count++] = (typeof(*bq->bulk)){ .xsk = frag, - .len = frag->base.data_end - frag->data, + __libeth_xdp_tx_len(frag->base.data_end - frag->data), }; } @@ -199,7 +199,7 @@ __libeth_xsk_xmit_fill_buf_md(const struct xdp_desc *xdesc, ctx = xsk_buff_raw_get_ctx(sq->pool, xdesc->addr); desc = (typeof(desc)){ .addr = ctx.dma, - .len = xdesc->len, + __libeth_xdp_tx_len(xdesc->len), }; BUILD_BUG_ON(!__builtin_constant_p(tmo == libeth_xsktmo)); @@ -226,7 +226,7 @@ __libeth_xsk_xmit_fill_buf(const struct xdp_desc *xdesc, { return (struct libeth_xdp_tx_desc){ .addr = xsk_buff_raw_get_dma(sq->pool, xdesc->addr), - .len = xdesc->len, + __libeth_xdp_tx_len(xdesc->len), }; } |