summaryrefslogtreecommitdiff
path: root/net/core/skbuff.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/core/skbuff.c')
-rw-r--r--net/core/skbuff.c228
1 files changed, 153 insertions, 75 deletions
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index a298992060e6..4eaf7ed0d1f4 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -73,7 +73,7 @@
#include <net/mpls.h>
#include <net/mptcp.h>
#include <net/mctp.h>
-#include <net/page_pool.h>
+#include <net/page_pool/helpers.h>
#include <net/dropreason.h>
#include <linux/uaccess.h>
@@ -550,7 +550,7 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
bool *pfmemalloc)
{
bool ret_pfmemalloc = false;
- unsigned int obj_size;
+ size_t obj_size;
void *obj;
obj_size = SKB_HEAD_ALIGN(*size);
@@ -567,7 +567,13 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
obj = kmem_cache_alloc_node(skb_small_head_cache, flags, node);
goto out;
}
- *size = obj_size = kmalloc_size_roundup(obj_size);
+
+ obj_size = kmalloc_size_roundup(obj_size);
+ /* The following cast might truncate high-order bits of obj_size, this
+ * is harmless because kmalloc(obj_size >= 2^32) will fail anyway.
+ */
+ *size = (unsigned int)obj_size;
+
/*
* Try a regular allocation, when that fails and we're not entitled
* to the reserves, fail.
@@ -879,11 +885,56 @@ static void skb_clone_fraglist(struct sk_buff *skb)
skb_get(list);
}
+#if IS_ENABLED(CONFIG_PAGE_POOL)
+bool napi_pp_put_page(struct page *page, bool napi_safe)
+{
+ bool allow_direct = false;
+ struct page_pool *pp;
+
+ page = compound_head(page);
+
+ /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
+ * in order to preserve any existing bits, such as bit 0 for the
+ * head page of compound page and bit 1 for pfmemalloc page, so
+ * mask those bits for freeing side when doing below checking,
+ * and page_is_pfmemalloc() is checked in __page_pool_put_page()
+ * to avoid recycling the pfmemalloc page.
+ */
+ if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE))
+ return false;
+
+ pp = page->pp;
+
+ /* Allow direct recycle if we have reasons to believe that we are
+ * in the same context as the consumer would run, so there's
+ * no possible race.
+ * __page_pool_put_page() makes sure we're not in hardirq context
+ * and interrupts are enabled prior to accessing the cache.
+ */
+ if (napi_safe || in_softirq()) {
+ const struct napi_struct *napi = READ_ONCE(pp->p.napi);
+
+ allow_direct = napi &&
+ READ_ONCE(napi->list_owner) == smp_processor_id();
+ }
+
+ /* Driver set this to memory recycling info. Reset it on recycle.
+ * This will *not* work for NIC using a split-page memory model.
+ * The page will be returned to the pool here regardless of the
+ * 'flipped' fragment being in use or not.
+ */
+ page_pool_put_full_page(pp, page, allow_direct);
+
+ return true;
+}
+EXPORT_SYMBOL(napi_pp_put_page);
+#endif
+
static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
{
if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
return false;
- return page_pool_return_skb_page(virt_to_page(data), napi_safe);
+ return napi_pp_put_page(virt_to_page(data), napi_safe);
}
static void skb_kfree_head(void *head, unsigned int end_offset)
@@ -3656,20 +3707,23 @@ struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list)
EXPORT_SYMBOL(skb_dequeue_tail);
/**
- * skb_queue_purge - empty a list
+ * skb_queue_purge_reason - empty a list
* @list: list to empty
+ * @reason: drop reason
*
* Delete all buffers on an &sk_buff list. Each buffer is removed from
* the list and one reference dropped. This function takes the list
* lock and is atomic with respect to other list locking functions.
*/
-void skb_queue_purge(struct sk_buff_head *list)
+void skb_queue_purge_reason(struct sk_buff_head *list,
+ enum skb_drop_reason reason)
{
struct sk_buff *skb;
+
while ((skb = skb_dequeue(list)) != NULL)
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
}
-EXPORT_SYMBOL(skb_queue_purge);
+EXPORT_SYMBOL(skb_queue_purge_reason);
/**
* skb_rbtree_purge - empty a skb rbtree
@@ -3697,6 +3751,27 @@ unsigned int skb_rbtree_purge(struct rb_root *root)
return sum;
}
+void skb_errqueue_purge(struct sk_buff_head *list)
+{
+ struct sk_buff *skb, *next;
+ struct sk_buff_head kill;
+ unsigned long flags;
+
+ __skb_queue_head_init(&kill);
+
+ spin_lock_irqsave(&list->lock, flags);
+ skb_queue_walk_safe(list, skb, next) {
+ if (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ZEROCOPY ||
+ SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING)
+ continue;
+ __skb_unlink(skb, list);
+ __skb_queue_tail(&kill, skb);
+ }
+ spin_unlock_irqrestore(&list->lock, flags);
+ __skb_queue_purge(&kill);
+}
+EXPORT_SYMBOL(skb_errqueue_purge);
+
/**
* skb_queue_head - queue a buffer at the list head
* @list: list to use
@@ -4354,21 +4429,20 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
struct sk_buff *segs = NULL;
struct sk_buff *tail = NULL;
struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list;
- skb_frag_t *frag = skb_shinfo(head_skb)->frags;
unsigned int mss = skb_shinfo(head_skb)->gso_size;
unsigned int doffset = head_skb->data - skb_mac_header(head_skb);
- struct sk_buff *frag_skb = head_skb;
unsigned int offset = doffset;
unsigned int tnl_hlen = skb_tnl_header_len(head_skb);
unsigned int partial_segs = 0;
unsigned int headroom;
unsigned int len = head_skb->len;
+ struct sk_buff *frag_skb;
+ skb_frag_t *frag;
__be16 proto;
bool csum, sg;
- int nfrags = skb_shinfo(head_skb)->nr_frags;
int err = -ENOMEM;
int i = 0;
- int pos;
+ int nfrags, pos;
if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) &&
mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) {
@@ -4445,6 +4519,13 @@ normal:
headroom = skb_headroom(head_skb);
pos = skb_headlen(head_skb);
+ if (skb_orphan_frags(head_skb, GFP_ATOMIC))
+ return ERR_PTR(-ENOMEM);
+
+ nfrags = skb_shinfo(head_skb)->nr_frags;
+ frag = skb_shinfo(head_skb)->frags;
+ frag_skb = head_skb;
+
do {
struct sk_buff *nskb;
skb_frag_t *nskb_frag;
@@ -4465,6 +4546,10 @@ normal:
(skb_headlen(list_skb) == len || sg)) {
BUG_ON(skb_headlen(list_skb) > len);
+ nskb = skb_clone(list_skb, GFP_ATOMIC);
+ if (unlikely(!nskb))
+ goto err;
+
i = 0;
nfrags = skb_shinfo(list_skb)->nr_frags;
frag = skb_shinfo(list_skb)->frags;
@@ -4483,12 +4568,8 @@ normal:
frag++;
}
- nskb = skb_clone(list_skb, GFP_ATOMIC);
list_skb = list_skb->next;
- if (unlikely(!nskb))
- goto err;
-
if (unlikely(pskb_trim(nskb, len))) {
kfree_skb(nskb);
goto err;
@@ -4564,12 +4645,16 @@ normal:
skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags &
SKBFL_SHARED_FRAG;
- if (skb_orphan_frags(frag_skb, GFP_ATOMIC) ||
- skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
+ if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
goto err;
while (pos < offset + len) {
if (i >= nfrags) {
+ if (skb_orphan_frags(list_skb, GFP_ATOMIC) ||
+ skb_zerocopy_clone(nskb, list_skb,
+ GFP_ATOMIC))
+ goto err;
+
i = 0;
nfrags = skb_shinfo(list_skb)->nr_frags;
frag = skb_shinfo(list_skb)->frags;
@@ -4583,10 +4668,6 @@ normal:
i--;
frag--;
}
- if (skb_orphan_frags(frag_skb, GFP_ATOMIC) ||
- skb_zerocopy_clone(nskb, frag_skb,
- GFP_ATOMIC))
- goto err;
list_skb = list_skb->next;
}
@@ -4716,23 +4797,13 @@ static const u8 skb_ext_type_len[] = {
static __always_inline unsigned int skb_ext_total_length(void)
{
- return SKB_EXT_CHUNKSIZEOF(struct skb_ext) +
-#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- skb_ext_type_len[SKB_EXT_BRIDGE_NF] +
-#endif
-#ifdef CONFIG_XFRM
- skb_ext_type_len[SKB_EXT_SEC_PATH] +
-#endif
-#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT)
- skb_ext_type_len[TC_SKB_EXT] +
-#endif
-#if IS_ENABLED(CONFIG_MPTCP)
- skb_ext_type_len[SKB_EXT_MPTCP] +
-#endif
-#if IS_ENABLED(CONFIG_MCTP_FLOWS)
- skb_ext_type_len[SKB_EXT_MCTP] +
-#endif
- 0;
+ unsigned int l = SKB_EXT_CHUNKSIZEOF(struct skb_ext);
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(skb_ext_type_len); i++)
+ l += skb_ext_type_len[i];
+
+ return l;
}
static void skb_extensions_init(void)
@@ -4750,12 +4821,23 @@ static void skb_extensions_init(void)
static void skb_extensions_init(void) {}
#endif
+/* The SKB kmem_cache slab is critical for network performance. Never
+ * merge/alias the slab with similar sized objects. This avoids fragmentation
+ * that hurts performance of kmem_cache_{alloc,free}_bulk APIs.
+ */
+#ifndef CONFIG_SLUB_TINY
+#define FLAG_SKB_NO_MERGE SLAB_NO_MERGE
+#else /* CONFIG_SLUB_TINY - simple loop in kmem_cache_alloc_bulk */
+#define FLAG_SKB_NO_MERGE 0
+#endif
+
void __init skb_init(void)
{
skbuff_cache = kmem_cache_create_usercopy("skbuff_head_cache",
sizeof(struct sk_buff),
0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC|
+ FLAG_SKB_NO_MERGE,
offsetof(struct sk_buff, cb),
sizeof_field(struct sk_buff, cb),
NULL);
@@ -5137,7 +5219,7 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb,
serr->ee.ee_info = tstype;
serr->opt_stats = opt_stats;
serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0;
- if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
+ if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
serr->ee.ee_data = skb_shinfo(skb)->tskey;
if (sk_is_tcp(sk))
serr->ee.ee_data -= atomic_read(&sk->sk_tskey);
@@ -5193,21 +5275,23 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
{
struct sk_buff *skb;
bool tsonly, opt_stats = false;
+ u32 tsflags;
if (!sk)
return;
- if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) &&
+ tsflags = READ_ONCE(sk->sk_tsflags);
+ if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) &&
skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS)
return;
- tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY;
+ tsonly = tsflags & SOF_TIMESTAMPING_OPT_TSONLY;
if (!skb_may_tx_timestamp(sk, tsonly))
return;
if (tsonly) {
#ifdef CONFIG_INET
- if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
+ if ((tsflags & SOF_TIMESTAMPING_OPT_STATS) &&
sk_is_tcp(sk)) {
skb = tcp_get_timestamping_opt_stats(sk, orig_skb,
ack_skb);
@@ -6204,7 +6288,7 @@ EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl);
*
* @header_len: size of linear part
* @data_len: needed length in frags
- * @max_page_order: max page order desired.
+ * @order: max page order desired.
* @errcode: pointer to error code if any
* @gfp_mask: allocation mask
*
@@ -6212,21 +6296,17 @@ EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl);
*/
struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
unsigned long data_len,
- int max_page_order,
+ int order,
int *errcode,
gfp_t gfp_mask)
{
- int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
unsigned long chunk;
struct sk_buff *skb;
struct page *page;
- int i;
+ int nr_frags = 0;
*errcode = -EMSGSIZE;
- /* Note this test could be relaxed, if we succeed to allocate
- * high order pages...
- */
- if (npages > MAX_SKB_FRAGS)
+ if (unlikely(data_len > MAX_SKB_FRAGS * (PAGE_SIZE << order)))
return NULL;
*errcode = -ENOBUFS;
@@ -6234,34 +6314,32 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
if (!skb)
return NULL;
- skb->truesize += npages << PAGE_SHIFT;
-
- for (i = 0; npages > 0; i++) {
- int order = max_page_order;
-
- while (order) {
- if (npages >= 1 << order) {
- page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) |
- __GFP_COMP |
- __GFP_NOWARN,
- order);
- if (page)
- goto fill_page;
- /* Do not retry other high order allocations */
- order = 1;
- max_page_order = 0;
- }
+ while (data_len) {
+ if (nr_frags == MAX_SKB_FRAGS - 1)
+ goto failure;
+ while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order))
order--;
+
+ if (order) {
+ page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) |
+ __GFP_COMP |
+ __GFP_NOWARN,
+ order);
+ if (!page) {
+ order--;
+ continue;
+ }
+ } else {
+ page = alloc_page(gfp_mask);
+ if (!page)
+ goto failure;
}
- page = alloc_page(gfp_mask);
- if (!page)
- goto failure;
-fill_page:
chunk = min_t(unsigned long, data_len,
PAGE_SIZE << order);
- skb_fill_page_desc(skb, i, page, 0, chunk);
+ skb_fill_page_desc(skb, nr_frags, page, 0, chunk);
+ nr_frags++;
+ skb->truesize += (PAGE_SIZE << order);
data_len -= chunk;
- npages -= 1 << order;
}
return skb;