Diffstat (limited to 'net/core/skbuff.c')
-rw-r--r--	net/core/skbuff.c	387
1 file changed, 228 insertions, 159 deletions
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 41ec02242ea7..b6a19ca0f99e 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -347,94 +347,18 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)
 }
 EXPORT_SYMBOL(build_skb);
 
-struct netdev_alloc_cache {
-	struct page_frag	frag;
-	/* we maintain a pagecount bias, so that we dont dirty cache line
-	 * containing page->_count every time we allocate a fragment.
-	 */
-	unsigned int		pagecnt_bias;
-};
-static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache);
-static DEFINE_PER_CPU(struct netdev_alloc_cache, napi_alloc_cache);
-
-static struct page *__page_frag_refill(struct netdev_alloc_cache *nc,
-				       gfp_t gfp_mask)
-{
-	const unsigned int order = NETDEV_FRAG_PAGE_MAX_ORDER;
-	struct page *page = NULL;
-	gfp_t gfp = gfp_mask;
-
-	if (order) {
-		gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
-			    __GFP_NOMEMALLOC;
-		page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order);
-		nc->frag.size = PAGE_SIZE << (page ? order : 0);
-	}
-
-	if (unlikely(!page))
-		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
-
-	nc->frag.page = page;
-
-	return page;
-}
-
-static void *__alloc_page_frag(struct netdev_alloc_cache __percpu *cache,
-			       unsigned int fragsz, gfp_t gfp_mask)
-{
-	struct netdev_alloc_cache *nc = this_cpu_ptr(cache);
-	struct page *page = nc->frag.page;
-	unsigned int size;
-	int offset;
-
-	if (unlikely(!page)) {
-refill:
-		page = __page_frag_refill(nc, gfp_mask);
-		if (!page)
-			return NULL;
-
-		/* if size can vary use frag.size else just use PAGE_SIZE */
-		size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE;
-
-		/* Even if we own the page, we do not use atomic_set().
-		 * This would break get_page_unless_zero() users.
-		 */
-		atomic_add(size - 1, &page->_count);
-
-		/* reset page count bias and offset to start of new frag */
-		nc->pagecnt_bias = size;
-		nc->frag.offset = size;
-	}
-
-	offset = nc->frag.offset - fragsz;
-	if (unlikely(offset < 0)) {
-		if (!atomic_sub_and_test(nc->pagecnt_bias, &page->_count))
-			goto refill;
-
-		/* if size can vary use frag.size else just use PAGE_SIZE */
-		size = NETDEV_FRAG_PAGE_MAX_ORDER ? nc->frag.size : PAGE_SIZE;
-
-		/* OK, page count is 0, we can safely set it */
-		atomic_set(&page->_count, size);
-
-		/* reset page count bias and offset to start of new frag */
-		nc->pagecnt_bias = size;
-		offset = size - fragsz;
-	}
-
-	nc->pagecnt_bias--;
-	nc->frag.offset = offset;
-
-	return page_address(page) + offset;
-}
+static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
+static DEFINE_PER_CPU(struct page_frag_cache, napi_alloc_cache);
 
 static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
 {
+	struct page_frag_cache *nc;
 	unsigned long flags;
 	void *data;
 
 	local_irq_save(flags);
-	data = __alloc_page_frag(&netdev_alloc_cache, fragsz, gfp_mask);
+	nc = this_cpu_ptr(&netdev_alloc_cache);
+	data = __alloc_page_frag(nc, fragsz, gfp_mask);
 	local_irq_restore(flags);
 	return data;
 }
@@ -454,7 +378,9 @@ EXPORT_SYMBOL(netdev_alloc_frag);
 
 static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
 {
-	return __alloc_page_frag(&napi_alloc_cache, fragsz, gfp_mask);
+	struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+
+	return __alloc_page_frag(nc, fragsz, gfp_mask);
 }
 
 void *napi_alloc_frag(unsigned int fragsz)
@@ -464,76 +390,70 @@ void *napi_alloc_frag(unsigned int fragsz)
 EXPORT_SYMBOL(napi_alloc_frag);
 
 /**
- *	__alloc_rx_skb - allocate an skbuff for rx
+ *	__netdev_alloc_skb - allocate an skbuff for rx on a specific device
+ *	@dev: network device to receive on
  *	@length: length to allocate
  *	@gfp_mask: get_free_pages mask, passed to alloc_skb
- *	@flags:	If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for
- *		allocations in case we have to fallback to __alloc_skb()
- *		If SKB_ALLOC_NAPI is set, page fragment will be allocated
- *		from napi_cache instead of netdev_cache.
  *
  *	Allocate a new &sk_buff and assign it a usage count of one. The
- *	buffer has unspecified headroom built in. Users should allocate
+ *	buffer has NET_SKB_PAD headroom built in. Users should allocate
  *	the headroom they think they need without accounting for the
  *	built in space. The built in space is used for optimisations.
 *
 *	%NULL is returned if there is no free memory.
  */
-static struct sk_buff *__alloc_rx_skb(unsigned int length, gfp_t gfp_mask,
-				      int flags)
+struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
+				   gfp_t gfp_mask)
 {
-	struct sk_buff *skb = NULL;
-	unsigned int fragsz = SKB_DATA_ALIGN(length) +
-			      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	struct page_frag_cache *nc;
+	unsigned long flags;
+	struct sk_buff *skb;
+	bool pfmemalloc;
+	void *data;
 
-	if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) {
-		void *data;
+	len += NET_SKB_PAD;
 
-		if (sk_memalloc_socks())
-			gfp_mask |= __GFP_MEMALLOC;
+	if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
+	    (gfp_mask & (__GFP_WAIT | GFP_DMA))) {
+		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
+		if (!skb)
+			goto skb_fail;
+		goto skb_success;
+	}
 
-		data = (flags & SKB_ALLOC_NAPI) ?
-			__napi_alloc_frag(fragsz, gfp_mask) :
-			__netdev_alloc_frag(fragsz, gfp_mask);
+	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	len = SKB_DATA_ALIGN(len);
 
-		if (likely(data)) {
-			skb = build_skb(data, fragsz);
-			if (unlikely(!skb))
-				put_page(virt_to_head_page(data));
-		}
-	} else {
-		skb = __alloc_skb(length, gfp_mask,
-				  SKB_ALLOC_RX, NUMA_NO_NODE);
-	}
-	return skb;
-}
+	if (sk_memalloc_socks())
+		gfp_mask |= __GFP_MEMALLOC;
 
-/**
- *	__netdev_alloc_skb - allocate an skbuff for rx on a specific device
- *	@dev: network device to receive on
- *	@length: length to allocate
- *	@gfp_mask: get_free_pages mask, passed to alloc_skb
- *
- *	Allocate a new &sk_buff and assign it a usage count of one. The
- *	buffer has NET_SKB_PAD headroom built in. Users should allocate
- *	the headroom they think they need without accounting for the
- *	built in space. The built in space is used for optimisations.
- *
- *	%NULL is returned if there is no free memory.
- */
-struct sk_buff *__netdev_alloc_skb(struct net_device *dev,
-				   unsigned int length, gfp_t gfp_mask)
-{
-	struct sk_buff *skb;
+	local_irq_save(flags);
+
+	nc = this_cpu_ptr(&netdev_alloc_cache);
+	data = __alloc_page_frag(nc, len, gfp_mask);
+	pfmemalloc = nc->pfmemalloc;
 
-	length += NET_SKB_PAD;
-	skb = __alloc_rx_skb(length, gfp_mask, 0);
+	local_irq_restore(flags);
 
-	if (likely(skb)) {
-		skb_reserve(skb, NET_SKB_PAD);
-		skb->dev = dev;
+	if (unlikely(!data))
+		return NULL;
+
+	skb = __build_skb(data, len);
+	if (unlikely(!skb)) {
+		skb_free_frag(data);
+		return NULL;
 	}
 
+	/* use OR instead of assignment to avoid clearing of bits in mask */
+	if (pfmemalloc)
+		skb->pfmemalloc = 1;
+	skb->head_frag = 1;
+
+skb_success:
+	skb_reserve(skb, NET_SKB_PAD);
+	skb->dev = dev;
+
+skb_fail:
 	return skb;
 }
 EXPORT_SYMBOL(__netdev_alloc_skb);
@@ -551,19 +471,49 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
 *
 *	%NULL is returned if there is no free memory.
 */
-struct sk_buff *__napi_alloc_skb(struct napi_struct *napi,
-				 unsigned int length, gfp_t gfp_mask)
+struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
+				 gfp_t gfp_mask)
 {
+	struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);
 	struct sk_buff *skb;
+	void *data;
 
-	length += NET_SKB_PAD + NET_IP_ALIGN;
-	skb = __alloc_rx_skb(length, gfp_mask, SKB_ALLOC_NAPI);
+	len += NET_SKB_PAD + NET_IP_ALIGN;
 
-	if (likely(skb)) {
-		skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
-		skb->dev = napi->dev;
+	if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
+	    (gfp_mask & (__GFP_WAIT | GFP_DMA))) {
+		skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
+		if (!skb)
+			goto skb_fail;
+		goto skb_success;
 	}
 
+	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	len = SKB_DATA_ALIGN(len);
+
+	if (sk_memalloc_socks())
+		gfp_mask |= __GFP_MEMALLOC;
+
+	data = __alloc_page_frag(nc, len, gfp_mask);
+	if (unlikely(!data))
+		return NULL;
+
+	skb = __build_skb(data, len);
+	if (unlikely(!skb)) {
+		skb_free_frag(data);
+		return NULL;
+	}
+
+	/* use OR instead of assignment to avoid clearing of bits in mask */
+	if (nc->pfmemalloc)
+		skb->pfmemalloc = 1;
+	skb->head_frag = 1;
+
+skb_success:
+	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
+	skb->dev = napi->dev;
+
+skb_fail:
 	return skb;
 }
 EXPORT_SYMBOL(__napi_alloc_skb);
@@ -611,10 +561,12 @@ static void skb_clone_fraglist(struct sk_buff *skb)
 
 static void skb_free_head(struct sk_buff *skb)
 {
+	unsigned char *head = skb->head;
+
 	if (skb->head_frag)
-		put_page(virt_to_head_page(skb->head));
+		skb_free_frag(head);
 	else
-		kfree(skb->head);
+		kfree(head);
 }
 
 static void skb_release_data(struct sk_buff *skb)
@@ -1918,15 +1870,39 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
 	return false;
 }
 
+ssize_t skb_socket_splice(struct sock *sk,
+			  struct pipe_inode_info *pipe,
+			  struct splice_pipe_desc *spd)
+{
+	int ret;
+
+	/* Drop the socket lock, otherwise we have reverse
+	 * locking dependencies between sk_lock and i_mutex
+	 * here as compared to sendfile(). We enter here
+	 * with the socket lock held, and splice_to_pipe() will
+	 * grab the pipe inode lock. For sendfile() emulation,
+	 * we call into ->sendpage() with the i_mutex lock held
+	 * and networking will grab the socket lock.
+	 */
+	release_sock(sk);
+	ret = splice_to_pipe(pipe, spd);
+	lock_sock(sk);
+
+	return ret;
+}
+
 /*
  * Map data from the skb to a pipe. Should handle both the linear part,
  * the fragments, and the frag list. It does NOT handle frag lists within
  * the frag list, if such a thing exists. We'd probably need to recurse to
  * handle that cleanly.
  */
-int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
+int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
 		    struct pipe_inode_info *pipe, unsigned int tlen,
-		    unsigned int flags)
+		    unsigned int flags,
+		    ssize_t (*splice_cb)(struct sock *,
+					 struct pipe_inode_info *,
+					 struct splice_pipe_desc *))
 {
 	struct partial_page partial[MAX_SKB_FRAGS];
 	struct page *pages[MAX_SKB_FRAGS];
@@ -1939,7 +1915,6 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
 		.spd_release = sock_spd_release,
 	};
 	struct sk_buff *frag_iter;
-	struct sock *sk = skb->sk;
 	int ret = 0;
 
 	/*
@@ -1962,23 +1937,12 @@ int skb_splice_bits(struct sk_buff *skb, unsigned int offset,
 	}
 
 done:
-	if (spd.nr_pages) {
-		/*
-		 * Drop the socket lock, otherwise we have reverse
-		 * locking dependencies between sk_lock and i_mutex
-		 * here as compared to sendfile(). We enter here
-		 * with the socket lock held, and splice_to_pipe() will
-		 * grab the pipe inode lock. For sendfile() emulation,
-		 * we call into ->sendpage() with the i_mutex lock held
-		 * and networking will grab the socket lock.
-		 */
-		release_sock(sk);
-		ret = splice_to_pipe(pipe, &spd);
-		lock_sock(sk);
-	}
+	if (spd.nr_pages)
+		ret = splice_cb(sk, pipe, &spd);
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(skb_splice_bits);
 
 /**
  *	skb_store_bits - store bits from kernel buffer to skb
@@ -2963,6 +2927,24 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
 }
 EXPORT_SYMBOL(skb_append_datato_frags);
 
+int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
+			 int offset, size_t size)
+{
+	int i = skb_shinfo(skb)->nr_frags;
+
+	if (skb_can_coalesce(skb, i, page, offset)) {
+		skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
+	} else if (i < MAX_SKB_FRAGS) {
+		get_page(page);
+		skb_fill_page_desc(skb, i, page, offset, size);
+	} else {
+		return -EMSGSIZE;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(skb_append_pagefrags);
+
 /**
  *	skb_pull_rcsum - pull skb and update receive checksum
  *	@skb: buffer to update
@@ -4030,6 +4012,93 @@ int skb_checksum_setup(struct sk_buff *skb, bool recalculate)
 }
 EXPORT_SYMBOL(skb_checksum_setup);
 
+/**
+ * skb_checksum_maybe_trim - maybe trims the given skb
+ * @skb: the skb to check
+ * @transport_len: the data length beyond the network header
+ *
+ * Checks whether the given skb has data beyond the given transport length.
+ * If so, returns a cloned skb trimmed to this transport length.
+ * Otherwise returns the provided skb. Returns NULL in error cases
+ * (e.g. transport_len exceeds skb length or out-of-memory).
+ *
+ * Caller needs to set the skb transport header and release the returned skb.
+ * Provided skb is consumed.
+ */
+static struct sk_buff *skb_checksum_maybe_trim(struct sk_buff *skb,
+					       unsigned int transport_len)
+{
+	struct sk_buff *skb_chk;
+	unsigned int len = skb_transport_offset(skb) + transport_len;
+	int ret;
+
+	if (skb->len < len) {
+		kfree_skb(skb);
+		return NULL;
+	} else if (skb->len == len) {
+		return skb;
+	}
+
+	skb_chk = skb_clone(skb, GFP_ATOMIC);
+	kfree_skb(skb);
+
+	if (!skb_chk)
+		return NULL;
+
+	ret = pskb_trim_rcsum(skb_chk, len);
+	if (ret) {
+		kfree_skb(skb_chk);
+		return NULL;
+	}
+
+	return skb_chk;
+}
+
+/**
+ * skb_checksum_trimmed - validate checksum of an skb
+ * @skb: the skb to check
+ * @transport_len: the data length beyond the network header
+ * @skb_chkf: checksum function to use
+ *
+ * Applies the given checksum function skb_chkf to the provided skb.
+ * Returns a checked and maybe trimmed skb. Returns NULL on error.
+ *
+ * If the skb has data beyond the given transport length, then a
+ * trimmed & cloned skb is checked and returned.
+ *
+ * Caller needs to set the skb transport header and release the returned skb.
+ * Provided skb is consumed.
+ */
+struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
+				     unsigned int transport_len,
+				     __sum16(*skb_chkf)(struct sk_buff *skb))
+{
+	struct sk_buff *skb_chk;
+	unsigned int offset = skb_transport_offset(skb);
+	__sum16 ret;
+
+	skb_chk = skb_checksum_maybe_trim(skb, transport_len);
+	if (!skb_chk)
+		return NULL;
+
+	if (!pskb_may_pull(skb_chk, offset)) {
+		kfree_skb(skb_chk);
+		return NULL;
+	}
+
+	__skb_pull(skb_chk, offset);
+	ret = skb_chkf(skb_chk);
+	__skb_push(skb_chk, offset);
+
+	if (ret) {
+		kfree_skb(skb_chk);
+		return NULL;
+	}
+
+	return skb_chk;
+}
+EXPORT_SYMBOL(skb_checksum_trimmed);
+
 void __skb_warn_lro_forwarding(const struct sk_buff *skb)
 {
 	net_warn_ratelimited("%s: received packets cannot be forwarded while LRO is enabled\n",
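
Note: the first hunks replace the open-coded netdev_alloc_cache with the generic per-CPU struct page_frag_cache plus __alloc_page_frag(), and skb_free_frag() becomes the counterpart for releasing such fragments. A minimal sketch of how a caller pairs the exported helpers touched here (rx_buf_to_skb() and its length handling are illustrative, not part of this diff):

#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Sketch: wrap a fragment from the per-CPU netdev cache in an skb.
 * The fragment size passed to build_skb() must already include
 * SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) tail room, exactly
 * as __netdev_alloc_skb() does in the diff above.
 */
static struct sk_buff *rx_buf_to_skb(unsigned int frag_len)
{
	struct sk_buff *skb;
	void *data;

	data = netdev_alloc_frag(frag_len);
	if (unlikely(!data))
		return NULL;

	skb = build_skb(data, frag_len);
	if (unlikely(!skb)) {
		/* undoes netdev_alloc_frag()/napi_alloc_frag() */
		skb_free_frag(data);
		return NULL;
	}

	return skb;
}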
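Note: __netdev_alloc_skb() and __napi_alloc_skb() now build the skb directly from the per-CPU fragment caches, fall back to __alloc_skb() only for large or atypical (GFP_DMA/__GFP_WAIT) requests, and propagate nc->pfmemalloc into skb->pfmemalloc. The driver-facing calling convention is unchanged; a hedged sketch of a typical NAPI receive path (my_rx_one(), pkt and pkt_len are hypothetical):

#include <linux/etherdevice.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Sketch: copy one small received frame into an skb taken from the
 * per-CPU napi_alloc_cache. NET_SKB_PAD + NET_IP_ALIGN headroom is
 * already reserved by __napi_alloc_skb() shown in the diff.
 */
static void my_rx_one(struct napi_struct *napi, struct net_device *dev,
		      const void *pkt, unsigned int pkt_len)
{
	struct sk_buff *skb;

	skb = napi_alloc_skb(napi, pkt_len);
	if (unlikely(!skb)) {
		dev->stats.rx_dropped++;
		return;
	}

	memcpy(skb_put(skb, pkt_len), pkt, pkt_len);
	skb->protocol = eth_type_trans(skb, dev);
	napi_gro_receive(napi, skb);
}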
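Note: skb_splice_bits() no longer derives the socket from skb->sk and no longer hard-codes the lock-dropping dance; the caller now passes the socket and a splice callback, with skb_socket_splice() providing the old behaviour for plain sockets. A sketch of a tcp_read_sock()-style recv_actor using the new signature (my_splice_state and my_splice_data_recv() are illustrative assumptions, not code from this diff):

#include <linux/fs.h>
#include <linux/skbuff.h>
#include <linux/splice.h>
#include <net/sock.h>

struct my_splice_state {
	struct pipe_inode_info *pipe;
	size_t len;
	unsigned int flags;
};

/* Sketch: recv_actor handed to tcp_read_sock(); the last argument of
 * skb_splice_bits() selects the splice strategy, here the
 * socket-lock-dropping helper added by this patch.
 */
static int my_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
			       unsigned int offset, size_t len)
{
	struct my_splice_state *mss = rd_desc->arg.data;
	int ret;

	ret = skb_splice_bits(skb, skb->sk, offset, mss->pipe,
			      min_t(unsigned int, rd_desc->count, len),
			      mss->flags, skb_socket_splice);
	if (ret > 0)
		rd_desc->count -= ret;

	return ret;
}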
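Note: skb_append_pagefrags() factors out the "coalesce with the last fragment or claim a new frag slot" logic so zero-copy send paths can share it; length and truesize accounting stays with the caller. A sketch of the expected usage (my_append_page() is a hypothetical wrapper):

#include <linux/skbuff.h>

static int my_append_page(struct sk_buff *skb, struct page *page,
			  int offset, size_t size)
{
	int err;

	/* coalesce with the previous fragment or take a new slot; the
	 * helper grabs its own page reference when a new slot is used
	 */
	err = skb_append_pagefrags(skb, page, offset, size);
	if (err)
		return err;	/* e.g. -EMSGSIZE: no frag slot left */

	/* caller remains responsible for the length accounting */
	skb->len      += size;
	skb->data_len += size;
	skb->truesize += size;

	return 0;
}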
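Note: skb_checksum_trimmed() lets callers validate a transport checksum on a possibly over-long skb and get back a trimmed clone when needed; per the kernel-doc above, the passed-in skb is consumed and the caller releases whichever skb is returned. A hedged sketch, with my_csum() standing in for a protocol-specific checksum callback:

#include <linux/skbuff.h>
#include <net/checksum.h>

/* Stand-in checksum callback: fold a one's-complement sum over the
 * transport header and payload. Real users plug in their protocol's
 * own validation rule here.
 */
static __sum16 my_csum(struct sk_buff *skb)
{
	return csum_fold(skb_checksum(skb, 0, skb->len, 0));
}

/* Sketch: validate (and possibly trim) an skb whose transport payload
 * should be transport_len bytes. On failure the original skb has
 * already been freed by skb_checksum_trimmed(); on success the caller
 * owns the returned skb and kfree_skb()s it when done.
 */
static struct sk_buff *my_validate(struct sk_buff *skb, unsigned int offset,
				   unsigned int transport_len)
{
	skb_set_transport_header(skb, offset);

	return skb_checksum_trimmed(skb, transport_len, my_csum);
}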
