diff options
Diffstat (limited to 'net/core/skbuff.c')
-rw-r--r-- | net/core/skbuff.c | 228 |
1 files changed, 153 insertions, 75 deletions
diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a298992060e6..4eaf7ed0d1f4 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -73,7 +73,7 @@ #include <net/mpls.h> #include <net/mptcp.h> #include <net/mctp.h> -#include <net/page_pool.h> +#include <net/page_pool/helpers.h> #include <net/dropreason.h> #include <linux/uaccess.h> @@ -550,7 +550,7 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node, bool *pfmemalloc) { bool ret_pfmemalloc = false; - unsigned int obj_size; + size_t obj_size; void *obj; obj_size = SKB_HEAD_ALIGN(*size); @@ -567,7 +567,13 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node, obj = kmem_cache_alloc_node(skb_small_head_cache, flags, node); goto out; } - *size = obj_size = kmalloc_size_roundup(obj_size); + + obj_size = kmalloc_size_roundup(obj_size); + /* The following cast might truncate high-order bits of obj_size, this + * is harmless because kmalloc(obj_size >= 2^32) will fail anyway. + */ + *size = (unsigned int)obj_size; + /* * Try a regular allocation, when that fails and we're not entitled * to the reserves, fail. @@ -879,11 +885,56 @@ static void skb_clone_fraglist(struct sk_buff *skb) skb_get(list); } +#if IS_ENABLED(CONFIG_PAGE_POOL) +bool napi_pp_put_page(struct page *page, bool napi_safe) +{ + bool allow_direct = false; + struct page_pool *pp; + + page = compound_head(page); + + /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation + * in order to preserve any existing bits, such as bit 0 for the + * head page of compound page and bit 1 for pfmemalloc page, so + * mask those bits for freeing side when doing below checking, + * and page_is_pfmemalloc() is checked in __page_pool_put_page() + * to avoid recycling the pfmemalloc page. + */ + if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE)) + return false; + + pp = page->pp; + + /* Allow direct recycle if we have reasons to believe that we are + * in the same context as the consumer would run, so there's + * no possible race. + * __page_pool_put_page() makes sure we're not in hardirq context + * and interrupts are enabled prior to accessing the cache. + */ + if (napi_safe || in_softirq()) { + const struct napi_struct *napi = READ_ONCE(pp->p.napi); + + allow_direct = napi && + READ_ONCE(napi->list_owner) == smp_processor_id(); + } + + /* Driver set this to memory recycling info. Reset it on recycle. + * This will *not* work for NIC using a split-page memory model. + * The page will be returned to the pool here regardless of the + * 'flipped' fragment being in use or not. + */ + page_pool_put_full_page(pp, page, allow_direct); + + return true; +} +EXPORT_SYMBOL(napi_pp_put_page); +#endif + static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe) { if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) return false; - return page_pool_return_skb_page(virt_to_page(data), napi_safe); + return napi_pp_put_page(virt_to_page(data), napi_safe); } static void skb_kfree_head(void *head, unsigned int end_offset) @@ -3656,20 +3707,23 @@ struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) EXPORT_SYMBOL(skb_dequeue_tail); /** - * skb_queue_purge - empty a list + * skb_queue_purge_reason - empty a list * @list: list to empty + * @reason: drop reason * * Delete all buffers on an &sk_buff list. Each buffer is removed from * the list and one reference dropped. This function takes the list * lock and is atomic with respect to other list locking functions. */ -void skb_queue_purge(struct sk_buff_head *list) +void skb_queue_purge_reason(struct sk_buff_head *list, + enum skb_drop_reason reason) { struct sk_buff *skb; + while ((skb = skb_dequeue(list)) != NULL) - kfree_skb(skb); + kfree_skb_reason(skb, reason); } -EXPORT_SYMBOL(skb_queue_purge); +EXPORT_SYMBOL(skb_queue_purge_reason); /** * skb_rbtree_purge - empty a skb rbtree @@ -3697,6 +3751,27 @@ unsigned int skb_rbtree_purge(struct rb_root *root) return sum; } +void skb_errqueue_purge(struct sk_buff_head *list) +{ + struct sk_buff *skb, *next; + struct sk_buff_head kill; + unsigned long flags; + + __skb_queue_head_init(&kill); + + spin_lock_irqsave(&list->lock, flags); + skb_queue_walk_safe(list, skb, next) { + if (SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_ZEROCOPY || + SKB_EXT_ERR(skb)->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) + continue; + __skb_unlink(skb, list); + __skb_queue_tail(&kill, skb); + } + spin_unlock_irqrestore(&list->lock, flags); + __skb_queue_purge(&kill); +} +EXPORT_SYMBOL(skb_errqueue_purge); + /** * skb_queue_head - queue a buffer at the list head * @list: list to use @@ -4354,21 +4429,20 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, struct sk_buff *segs = NULL; struct sk_buff *tail = NULL; struct sk_buff *list_skb = skb_shinfo(head_skb)->frag_list; - skb_frag_t *frag = skb_shinfo(head_skb)->frags; unsigned int mss = skb_shinfo(head_skb)->gso_size; unsigned int doffset = head_skb->data - skb_mac_header(head_skb); - struct sk_buff *frag_skb = head_skb; unsigned int offset = doffset; unsigned int tnl_hlen = skb_tnl_header_len(head_skb); unsigned int partial_segs = 0; unsigned int headroom; unsigned int len = head_skb->len; + struct sk_buff *frag_skb; + skb_frag_t *frag; __be16 proto; bool csum, sg; - int nfrags = skb_shinfo(head_skb)->nr_frags; int err = -ENOMEM; int i = 0; - int pos; + int nfrags, pos; if ((skb_shinfo(head_skb)->gso_type & SKB_GSO_DODGY) && mss != GSO_BY_FRAGS && mss != skb_headlen(head_skb)) { @@ -4445,6 +4519,13 @@ normal: headroom = skb_headroom(head_skb); pos = skb_headlen(head_skb); + if (skb_orphan_frags(head_skb, GFP_ATOMIC)) + return ERR_PTR(-ENOMEM); + + nfrags = skb_shinfo(head_skb)->nr_frags; + frag = skb_shinfo(head_skb)->frags; + frag_skb = head_skb; + do { struct sk_buff *nskb; skb_frag_t *nskb_frag; @@ -4465,6 +4546,10 @@ normal: (skb_headlen(list_skb) == len || sg)) { BUG_ON(skb_headlen(list_skb) > len); + nskb = skb_clone(list_skb, GFP_ATOMIC); + if (unlikely(!nskb)) + goto err; + i = 0; nfrags = skb_shinfo(list_skb)->nr_frags; frag = skb_shinfo(list_skb)->frags; @@ -4483,12 +4568,8 @@ normal: frag++; } - nskb = skb_clone(list_skb, GFP_ATOMIC); list_skb = list_skb->next; - if (unlikely(!nskb)) - goto err; - if (unlikely(pskb_trim(nskb, len))) { kfree_skb(nskb); goto err; @@ -4564,12 +4645,16 @@ normal: skb_shinfo(nskb)->flags |= skb_shinfo(head_skb)->flags & SKBFL_SHARED_FRAG; - if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || - skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) + if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) goto err; while (pos < offset + len) { if (i >= nfrags) { + if (skb_orphan_frags(list_skb, GFP_ATOMIC) || + skb_zerocopy_clone(nskb, list_skb, + GFP_ATOMIC)) + goto err; + i = 0; nfrags = skb_shinfo(list_skb)->nr_frags; frag = skb_shinfo(list_skb)->frags; @@ -4583,10 +4668,6 @@ normal: i--; frag--; } - if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || - skb_zerocopy_clone(nskb, frag_skb, - GFP_ATOMIC)) - goto err; list_skb = list_skb->next; } @@ -4716,23 +4797,13 @@ static const u8 skb_ext_type_len[] = { static __always_inline unsigned int skb_ext_total_length(void) { - return SKB_EXT_CHUNKSIZEOF(struct skb_ext) + -#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) - skb_ext_type_len[SKB_EXT_BRIDGE_NF] + -#endif -#ifdef CONFIG_XFRM - skb_ext_type_len[SKB_EXT_SEC_PATH] + -#endif -#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) - skb_ext_type_len[TC_SKB_EXT] + -#endif -#if IS_ENABLED(CONFIG_MPTCP) - skb_ext_type_len[SKB_EXT_MPTCP] + -#endif -#if IS_ENABLED(CONFIG_MCTP_FLOWS) - skb_ext_type_len[SKB_EXT_MCTP] + -#endif - 0; + unsigned int l = SKB_EXT_CHUNKSIZEOF(struct skb_ext); + int i; + + for (i = 0; i < ARRAY_SIZE(skb_ext_type_len); i++) + l += skb_ext_type_len[i]; + + return l; } static void skb_extensions_init(void) @@ -4750,12 +4821,23 @@ static void skb_extensions_init(void) static void skb_extensions_init(void) {} #endif +/* The SKB kmem_cache slab is critical for network performance. Never + * merge/alias the slab with similar sized objects. This avoids fragmentation + * that hurts performance of kmem_cache_{alloc,free}_bulk APIs. + */ +#ifndef CONFIG_SLUB_TINY +#define FLAG_SKB_NO_MERGE SLAB_NO_MERGE +#else /* CONFIG_SLUB_TINY - simple loop in kmem_cache_alloc_bulk */ +#define FLAG_SKB_NO_MERGE 0 +#endif + void __init skb_init(void) { skbuff_cache = kmem_cache_create_usercopy("skbuff_head_cache", sizeof(struct sk_buff), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, + SLAB_HWCACHE_ALIGN|SLAB_PANIC| + FLAG_SKB_NO_MERGE, offsetof(struct sk_buff, cb), sizeof_field(struct sk_buff, cb), NULL); @@ -5137,7 +5219,7 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb, serr->ee.ee_info = tstype; serr->opt_stats = opt_stats; serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0; - if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { + if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { serr->ee.ee_data = skb_shinfo(skb)->tskey; if (sk_is_tcp(sk)) serr->ee.ee_data -= atomic_read(&sk->sk_tskey); @@ -5193,21 +5275,23 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb, { struct sk_buff *skb; bool tsonly, opt_stats = false; + u32 tsflags; if (!sk) return; - if (!hwtstamps && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && + tsflags = READ_ONCE(sk->sk_tsflags); + if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) && skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS) return; - tsonly = sk->sk_tsflags & SOF_TIMESTAMPING_OPT_TSONLY; + tsonly = tsflags & SOF_TIMESTAMPING_OPT_TSONLY; if (!skb_may_tx_timestamp(sk, tsonly)) return; if (tsonly) { #ifdef CONFIG_INET - if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_STATS) && + if ((tsflags & SOF_TIMESTAMPING_OPT_STATS) && sk_is_tcp(sk)) { skb = tcp_get_timestamping_opt_stats(sk, orig_skb, ack_skb); @@ -6204,7 +6288,7 @@ EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); * * @header_len: size of linear part * @data_len: needed length in frags - * @max_page_order: max page order desired. + * @order: max page order desired. * @errcode: pointer to error code if any * @gfp_mask: allocation mask * @@ -6212,21 +6296,17 @@ EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl); */ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, unsigned long data_len, - int max_page_order, + int order, int *errcode, gfp_t gfp_mask) { - int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; unsigned long chunk; struct sk_buff *skb; struct page *page; - int i; + int nr_frags = 0; *errcode = -EMSGSIZE; - /* Note this test could be relaxed, if we succeed to allocate - * high order pages... - */ - if (npages > MAX_SKB_FRAGS) + if (unlikely(data_len > MAX_SKB_FRAGS * (PAGE_SIZE << order))) return NULL; *errcode = -ENOBUFS; @@ -6234,34 +6314,32 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, if (!skb) return NULL; - skb->truesize += npages << PAGE_SHIFT; - - for (i = 0; npages > 0; i++) { - int order = max_page_order; - - while (order) { - if (npages >= 1 << order) { - page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | - __GFP_COMP | - __GFP_NOWARN, - order); - if (page) - goto fill_page; - /* Do not retry other high order allocations */ - order = 1; - max_page_order = 0; - } + while (data_len) { + if (nr_frags == MAX_SKB_FRAGS - 1) + goto failure; + while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order)) order--; + + if (order) { + page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) | + __GFP_COMP | + __GFP_NOWARN, + order); + if (!page) { + order--; + continue; + } + } else { + page = alloc_page(gfp_mask); + if (!page) + goto failure; } - page = alloc_page(gfp_mask); - if (!page) - goto failure; -fill_page: chunk = min_t(unsigned long, data_len, PAGE_SIZE << order); - skb_fill_page_desc(skb, i, page, 0, chunk); + skb_fill_page_desc(skb, nr_frags, page, 0, chunk); + nr_frags++; + skb->truesize += (PAGE_SIZE << order); data_len -= chunk; - npages -= 1 << order; } return skb; |