Diffstat (limited to 'net/core/page_pool.c')
-rw-r--r--	net/core/page_pool.c	277
1 file changed, 206 insertions, 71 deletions
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 3272dc7a8c81..9b7cbe35df37 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -18,6 +18,9 @@
 
 #include <trace/events/page_pool.h>
 
+#define DEFER_TIME (msecs_to_jiffies(1000))
+#define DEFER_WARN_INTERVAL (60 * HZ)
+
 static int page_pool_init(struct page_pool *pool,
 			  const struct page_pool_params *params)
 {
@@ -44,6 +47,21 @@ static int page_pool_init(struct page_pool *pool,
 	    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
 		return -EINVAL;
 
+	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) {
+		/* In order to request DMA-sync-for-device the page
+		 * needs to be mapped
+		 */
+		if (!(pool->p.flags & PP_FLAG_DMA_MAP))
+			return -EINVAL;
+
+		if (!pool->p.max_len)
+			return -EINVAL;
+
+		/* pool->p.offset has to be set according to the address
+		 * offset used by the DMA engine to start copying rx data
+		 */
+	}
+
 	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
 		return -ENOMEM;
 
@@ -61,7 +79,7 @@ static int page_pool_init(struct page_pool *pool,
 struct page_pool *page_pool_create(const struct page_pool_params *params)
 {
 	struct page_pool *pool;
-	int err = 0;
+	int err;
 
 	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
 	if (!pool)
@@ -78,16 +96,68 @@ struct page_pool *page_pool_create(const struct page_pool_params *params)
 }
 EXPORT_SYMBOL(page_pool_create);
 
-/* fast path */
-static struct page *__page_pool_get_cached(struct page_pool *pool)
+static void __page_pool_return_page(struct page_pool *pool, struct page *page);
+
+noinline
+static struct page *page_pool_refill_alloc_cache(struct page_pool *pool,
+						 bool refill)
 {
 	struct ptr_ring *r = &pool->ring;
 	struct page *page;
+	int pref_nid; /* preferred NUMA node */
 
 	/* Quicker fallback, avoid locks when ring is empty */
 	if (__ptr_ring_empty(r))
 		return NULL;
 
+	/* Softirq guarantee CPU and thus NUMA node is stable. This,
+	 * assumes CPU refilling driver RX-ring will also run RX-NAPI.
+	 */
+#ifdef CONFIG_NUMA
+	pref_nid = (pool->p.nid == NUMA_NO_NODE) ? numa_mem_id() : pool->p.nid;
+#else
+	/* Ignore pool->p.nid setting if !CONFIG_NUMA, helps compiler */
+	pref_nid = numa_mem_id(); /* will be zero like page_to_nid() */
+#endif
+
+	/* Slower-path: Get pages from locked ring queue */
+	spin_lock(&r->consumer_lock);
+
+	/* Refill alloc array, but only if NUMA match */
+	do {
+		page = __ptr_ring_consume(r);
+		if (unlikely(!page))
+			break;
+
+		if (likely(page_to_nid(page) == pref_nid)) {
+			pool->alloc.cache[pool->alloc.count++] = page;
+		} else {
+			/* NUMA mismatch;
+			 * (1) release 1 page to page-allocator and
+			 * (2) break out to fallthrough to alloc_pages_node.
+			 * This limit stress on page buddy alloactor.
+			 */
+			__page_pool_return_page(pool, page);
+			page = NULL;
+			break;
+		}
+	} while (pool->alloc.count < PP_ALLOC_CACHE_REFILL &&
+		 refill);
+
+	/* Return last page */
+	if (likely(pool->alloc.count > 0))
+		page = pool->alloc.cache[--pool->alloc.count];
+
+	spin_unlock(&r->consumer_lock);
+	return page;
+}
+
+/* fast path */
+static struct page *__page_pool_get_cached(struct page_pool *pool)
+{
+	bool refill = false;
+	struct page *page;
+
 	/* Test for safe-context, caller should provide this guarantee */
 	if (likely(in_serving_softirq())) {
 		if (likely(pool->alloc.count)) {
@@ -95,30 +165,23 @@ static struct page *__page_pool_get_cached(struct page_pool *pool)
 			page = pool->alloc.cache[--pool->alloc.count];
 			return page;
 		}
-		/* Slower-path: Alloc array empty, time to refill
-		 *
-		 * Open-coded bulk ptr_ring consumer.
-		 *
-		 * Discussion: the ring consumer lock is not really
-		 * needed due to the softirq/NAPI protection, but
-		 * later need the ability to reclaim pages on the
-		 * ring. Thus, keeping the locks.
-		 */
-		spin_lock(&r->consumer_lock);
-		while ((page = __ptr_ring_consume(r))) {
-			if (pool->alloc.count == PP_ALLOC_CACHE_REFILL)
-				break;
-			pool->alloc.cache[pool->alloc.count++] = page;
-		}
-		spin_unlock(&r->consumer_lock);
-		return page;
+		refill = true;
 	}
 
-	/* Slow-path: Get page from locked ring queue */
-	page = ptr_ring_consume(&pool->ring);
+	page = page_pool_refill_alloc_cache(pool, refill);
 	return page;
 }
 
+static void page_pool_dma_sync_for_device(struct page_pool *pool,
+					  struct page *page,
+					  unsigned int dma_sync_size)
+{
+	dma_sync_size = min(dma_sync_size, pool->p.max_len);
+	dma_sync_single_range_for_device(pool->p.dev, page->dma_addr,
+					 pool->p.offset, dma_sync_size,
+					 pool->p.dma_dir);
+}
+
 /* slow path */
 noinline
 static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
@@ -142,7 +205,11 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
 	 */
 
 	/* Cache was empty, do real allocation */
+#ifdef CONFIG_NUMA
 	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
+#else
+	page = alloc_pages(gfp, pool->p.order);
+#endif
 	if (!page)
 		return NULL;
 
@@ -163,6 +230,9 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
 	}
 	page->dma_addr = dma;
 
+	if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
+		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
+
 skip_dma_map:
 	/* Track how many pages are held 'in-flight' */
 	pool->pages_state_hold_cnt++;
@@ -200,22 +270,14 @@ static s32 page_pool_inflight(struct page_pool *pool)
 {
 	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
 	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
-	s32 distance;
-
-	distance = _distance(hold_cnt, release_cnt);
+	s32 inflight;
 
-	trace_page_pool_inflight(pool, distance, hold_cnt, release_cnt);
-	return distance;
-}
-
-static bool __page_pool_safe_to_destroy(struct page_pool *pool)
-{
-	s32 inflight = page_pool_inflight(pool);
+	inflight = _distance(hold_cnt, release_cnt);
 
-	/* The distance should not be able to become negative */
+	trace_page_pool_release(pool, inflight, hold_cnt, release_cnt);
 	WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);
 
-	return (inflight == 0);
+	return inflight;
 }
 
 /* Cleanup page_pool state from page */
@@ -223,6 +285,7 @@ static void __page_pool_clean_page(struct page_pool *pool,
 				   struct page *page)
 {
 	dma_addr_t dma;
+	int count;
 
 	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
 		goto skip_dma_unmap;
@@ -234,9 +297,11 @@ static void __page_pool_clean_page(struct page_pool *pool,
 			     DMA_ATTR_SKIP_CPU_SYNC);
 	page->dma_addr = 0;
 skip_dma_unmap:
-	atomic_inc(&pool->pages_state_release_cnt);
-	trace_page_pool_state_release(pool, page,
-			      atomic_read(&pool->pages_state_release_cnt));
+	/* This may be the last page returned, releasing the pool, so
+	 * it is not safe to reference pool afterwards.
+	 */
+	count = atomic_inc_return(&pool->pages_state_release_cnt);
+	trace_page_pool_state_release(pool, page, count);
 }
 
 /* unmap the page and clean our state */
@@ -290,8 +355,16 @@ static bool __page_pool_recycle_direct(struct page *page,
 	return true;
 }
 
-void __page_pool_put_page(struct page_pool *pool,
-			  struct page *page, bool allow_direct)
+/* page is NOT reusable when:
+ * 1) allocated when system is under some pressure. (page_is_pfmemalloc)
+ */
+static bool pool_page_reusable(struct page_pool *pool, struct page *page)
+{
+	return !page_is_pfmemalloc(page);
+}
+
+void __page_pool_put_page(struct page_pool *pool, struct page *page,
+			  unsigned int dma_sync_size, bool allow_direct)
 {
 	/* This allocator is optimized for the XDP mode that uses
 	 * one-frame-per-page, but have fallbacks that act like the
@@ -299,9 +372,14 @@ void __page_pool_put_page(struct page_pool *pool,
 	 *
 	 * refcnt == 1 means page_pool owns page, and can recycle it.
 	 */
-	if (likely(page_ref_count(page) == 1)) {
+	if (likely(page_ref_count(page) == 1 &&
+		   pool_page_reusable(pool, page))) {
 		/* Read barrier done in page_ref_count / READ_ONCE */
+		if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV)
+			page_pool_dma_sync_for_device(pool, page,
+						      dma_sync_size);
+
 		if (allow_direct && in_serving_softirq())
 			if (__page_pool_recycle_direct(page, pool))
 				return;
 
@@ -345,31 +423,10 @@ static void __page_pool_empty_ring(struct page_pool *pool)
 	}
 }
 
-static void __warn_in_flight(struct page_pool *pool)
+static void page_pool_free(struct page_pool *pool)
 {
-	u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
-	u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
-	s32 distance;
-
-	distance = _distance(hold_cnt, release_cnt);
-
-	/* Drivers should fix this, but only problematic when DMA is used */
-	WARN(1, "Still in-flight pages:%d hold:%u released:%u",
-	     distance, hold_cnt, release_cnt);
-}
-
-void __page_pool_free(struct page_pool *pool)
-{
-	/* Only last user actually free/release resources */
-	if (!page_pool_put(pool))
-		return;
-
-	WARN(pool->alloc.count, "API usage violation");
-	WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
-
-	/* Can happen due to forced shutdown */
-	if (!__page_pool_safe_to_destroy(pool))
-		__warn_in_flight(pool);
+	if (pool->disconnect)
+		pool->disconnect(pool);
 
 	ptr_ring_cleanup(&pool->ring, NULL);
 
@@ -378,15 +435,14 @@ void __page_pool_free(struct page_pool *pool)
 
 	kfree(pool);
 }
-EXPORT_SYMBOL(__page_pool_free);
 
-/* Request to shutdown: release pages cached by page_pool, and check
- * for in-flight pages
- */
-bool __page_pool_request_shutdown(struct page_pool *pool)
+static void page_pool_empty_alloc_cache_once(struct page_pool *pool)
 {
 	struct page *page;
 
+	if (pool->destroy_cnt)
+		return;
+
 	/* Empty alloc cache, assume caller made sure this is
 	 * no-longer in use, and page_pool_alloc_pages() cannot be
 	 * call concurrently.
@@ -395,12 +451,91 @@ bool __page_pool_request_shutdown(struct page_pool *pool)
 		page = pool->alloc.cache[--pool->alloc.count];
 		__page_pool_return_page(pool, page);
 	}
+}
+
+static void page_pool_scrub(struct page_pool *pool)
+{
+	page_pool_empty_alloc_cache_once(pool);
+	pool->destroy_cnt++;
 
 	/* No more consumers should exist, but producers could still
 	 * be in-flight.
 	 */
 	__page_pool_empty_ring(pool);
+}
 
-	return __page_pool_safe_to_destroy(pool);
+static int page_pool_release(struct page_pool *pool)
+{
+	int inflight;
+
+	page_pool_scrub(pool);
+	inflight = page_pool_inflight(pool);
+	if (!inflight)
+		page_pool_free(pool);
+
+	return inflight;
+}
+
+static void page_pool_release_retry(struct work_struct *wq)
+{
+	struct delayed_work *dwq = to_delayed_work(wq);
+	struct page_pool *pool = container_of(dwq, typeof(*pool), release_dw);
+	int inflight;
+
+	inflight = page_pool_release(pool);
+	if (!inflight)
+		return;
+
+	/* Periodic warning */
+	if (time_after_eq(jiffies, pool->defer_warn)) {
+		int sec = (s32)((u32)jiffies - (u32)pool->defer_start) / HZ;
+
+		pr_warn("%s() stalled pool shutdown %d inflight %d sec\n",
+			__func__, inflight, sec);
+		pool->defer_warn = jiffies + DEFER_WARN_INTERVAL;
+	}
+
+	/* Still not ready to be disconnected, retry later */
+	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
+}
+
+void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *))
+{
+	refcount_inc(&pool->user_cnt);
+	pool->disconnect = disconnect;
+}
+
+void page_pool_destroy(struct page_pool *pool)
+{
+	if (!pool)
+		return;
+
+	if (!page_pool_put(pool))
+		return;
+
+	if (!page_pool_release(pool))
+		return;
+
+	pool->defer_start = jiffies;
+	pool->defer_warn  = jiffies + DEFER_WARN_INTERVAL;
+
+	INIT_DELAYED_WORK(&pool->release_dw, page_pool_release_retry);
+	schedule_delayed_work(&pool->release_dw, DEFER_TIME);
+}
+EXPORT_SYMBOL(page_pool_destroy);
+
+/* Caller must provide appropriate safe context, e.g. NAPI. */
+void page_pool_update_nid(struct page_pool *pool, int new_nid)
+{
+	struct page *page;
+
+	trace_page_pool_update_nid(pool, new_nid);
+	pool->p.nid = new_nid;
+
+	/* Flush pool alloc cache, as refill will check NUMA node */
+	while (pool->alloc.count) {
+		page = pool->alloc.cache[--pool->alloc.count];
+		__page_pool_return_page(pool, page);
+	}
 }
-EXPORT_SYMBOL(__page_pool_request_shutdown);
+EXPORT_SYMBOL(page_pool_update_nid);
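
Usage note: with PP_FLAG_DMA_SYNC_DEV, page_pool_init() now rejects pools that do not also request DMA mapping (PP_FLAG_DMA_MAP) or that leave max_len at zero, and pool->p.offset has to match where the device starts writing RX data. A minimal sketch of a driver creating such a pool follows; the function name, ring sizing and headroom choice are illustrative assumptions, not part of this diff.

/* Sketch: a pool whose pages are DMA-synced for device on recycle.
 * Assumes <net/page_pool.h>; XDP_PACKET_HEADROOM comes from linux/bpf.h.
 */
static struct page_pool *rq_create_page_pool(struct device *dev)
{
	struct page_pool_params pp_params = {
		.flags		= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
		.order		= 0,
		.pool_size	= 1024,
		.nid		= NUMA_NO_NODE,	/* refill falls back to numa_mem_id() */
		.dev		= dev,
		.dma_dir	= DMA_FROM_DEVICE,
		/* where the HW starts writing, and the largest area it may touch */
		.offset		= XDP_PACKET_HEADROOM,
		.max_len	= PAGE_SIZE - XDP_PACKET_HEADROOM,
	};

	return page_pool_create(&pp_params);	/* ERR_PTR() on failure */
}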
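
__page_pool_put_page() gains a dma_sync_size argument, so a driver that knows how many bytes the device actually wrote can sync only that much on recycle; anything at or above pool->p.max_len degenerates to a full-length sync because of the min() clamp in page_pool_dma_sync_for_device(). A sketch under the assumption that the completion path knows the received length (the rq_* naming and pkt_len source are hypothetical):

/* Sketch: recycle from an RX completion handler running in NAPI
 * (softirq) context, which is what allow_direct=true expects.
 */
static void rq_recycle_rx_page(struct page_pool *pool, struct page *page,
			       unsigned int pkt_len)
{
	/* Sync only the bytes the device wrote; page_pool clamps this
	 * to pool->p.max_len internally.
	 */
	__page_pool_put_page(pool, page, pkt_len, true);
}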
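
Shutdown is likewise simplified: the __page_pool_request_shutdown()/__page_pool_free() pair is folded into page_pool_destroy(), which scrubs the caches and, while pages are still in flight, re-arms itself via delayed work every DEFER_TIME, warning once per DEFER_WARN_INTERVAL. A driver teardown path can therefore shrink to roughly the sketch below; pairing with xdp_rxq_info_unreg() is shown only as the typical usage, not something mandated by this diff.

/* Sketch: RX queue teardown. The pool may outlive this function;
 * page_pool_release_retry() frees it once hold_cnt == release_cnt.
 */
static void rq_free(struct xdp_rxq_info *xdp_rxq, struct page_pool *pool)
{
	/* Typical ordering: stop the XDP memory model for this ring
	 * before dropping the pool.
	 */
	xdp_rxq_info_unreg(xdp_rxq);

	/* Release the driver's hold; actual freeing is deferred until
	 * all in-flight pages have been returned.
	 */
	page_pool_destroy(pool);
}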
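
Finally, page_pool_update_nid() lets a driver re-home the pool when the CPU servicing RX NAPI migrates to another NUMA node; the refill loop above then releases ring pages whose node no longer matches pref_nid. It must be called from the same safe (softirq/NAPI) context as allocation. An illustrative helper a poll routine might call; only page_pool_update_nid() comes from this diff, the wrapper itself is an assumption.

/* Sketch: keep the pool's preferred node in step with the CPU that
 * actually runs this NAPI instance. Call from the poll routine.
 */
static void rq_track_numa_node(struct page_pool *pool)
{
	int nid = numa_mem_id();	/* memory node of the executing CPU */

	/* Skip the tracepoint and cache flush when nothing changed */
	if (pool->p.nid != nid)
		page_pool_update_nid(pool, nid);
}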
