Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--   mm/vmscan.c | 236
1 file changed, 163 insertions(+), 73 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3f56c8deb3c0..e01ded365440 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -11,6 +11,8 @@
  *  Multiqueue VM started 5.8.00, Rik van Riel.
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/gfp.h>
@@ -43,6 +45,7 @@
 #include <linux/sysctl.h>
 #include <linux/oom.h>
 #include <linux/prefetch.h>
+#include <linux/printk.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -83,6 +86,9 @@ struct scan_control {
 	/* Scan (total_size >> priority) pages at once */
 	int priority;
 
+	/* anon vs. file LRUs scanning "ratio" */
+	int swappiness;
+
 	/*
 	 * The memory cgroup that hit its limit and as a result is the
 	 * primary target of this reclaim invocation.
@@ -324,7 +330,7 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker,
 	else
 		new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
 
-	trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
+	trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
 	return freed;
 }
 
@@ -477,7 +483,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 		if (page_has_private(page)) {
 			if (try_to_free_buffers(page)) {
 				ClearPageDirty(page);
-				printk("%s: orphaned page\n", __func__);
+				pr_info("%s: orphaned page\n", __func__);
 				return PAGE_CLEAN;
 			}
 		}
@@ -1121,7 +1127,7 @@ keep:
 		VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
 	}
 
-	free_hot_cold_page_list(&free_pages, 1);
+	free_hot_cold_page_list(&free_pages, true);
 
 	list_splice(&ret_pages, page_list);
 	count_vm_events(PGACTIVATE, pgactivate);
@@ -1439,6 +1445,19 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
 }
 
 /*
+ * If a kernel thread (such as nfsd for loop-back mounts) services
+ * a backing device by writing to the page cache it sets PF_LESS_THROTTLE.
+ * In that case we should only throttle if the backing device it is
+ * writing to is congested.  In other cases it is safe to throttle.
+ */
+static int current_may_throttle(void)
+{
+	return !(current->flags & PF_LESS_THROTTLE) ||
+		current->backing_dev_info == NULL ||
+		bdi_write_congested(current->backing_dev_info);
+}
+
+/*
  * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
  * of reclaimed pages
  */
@@ -1519,7 +1538,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 
 	spin_unlock_irq(&zone->lru_lock);
 
-	free_hot_cold_page_list(&page_list, 1);
+	free_hot_cold_page_list(&page_list, true);
 
 	/*
	 * If reclaim is isolating dirty pages under writeback, it implies
@@ -1554,19 +1573,18 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 		 * If dirty pages are scanned that are not queued for IO, it
 		 * implies that flushers are not keeping up. In this case, flag
 		 * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing
-		 * pages from reclaim context. It will forcibly stall in the
-		 * next check.
+		 * pages from reclaim context.
 		 */
 		if (nr_unqueued_dirty == nr_taken)
 			zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY);
 
 		/*
-		 * In addition, if kswapd scans pages marked marked for
-		 * immediate reclaim and under writeback (nr_immediate), it
-		 * implies that pages are cycling through the LRU faster than
+		 * If kswapd scans pages marked marked for immediate
+		 * reclaim and under writeback (nr_immediate), it implies
+		 * that pages are cycling through the LRU faster than
 		 * they are written so also forcibly stall.
 		 */
-		if (nr_unqueued_dirty == nr_taken || nr_immediate)
+		if (nr_immediate && current_may_throttle())
 			congestion_wait(BLK_RW_ASYNC, HZ/10);
 	}
 
@@ -1575,7 +1593,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	 * is congested. Allow kswapd to continue until it starts encountering
 	 * unqueued dirty pages or cycling through the LRU too quickly.
 	 */
-	if (!sc->hibernation_mode && !current_is_kswapd())
+	if (!sc->hibernation_mode && !current_is_kswapd() &&
+	    current_may_throttle())
 		wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
 
 	trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
@@ -1740,7 +1759,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
 	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
 	spin_unlock_irq(&zone->lru_lock);
 
-	free_hot_cold_page_list(&l_hold, 1);
+	free_hot_cold_page_list(&l_hold, true);
 }
 
 #ifdef CONFIG_SWAP
@@ -1830,13 +1849,6 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
 	return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
 }
 
-static int vmscan_swappiness(struct scan_control *sc)
-{
-	if (global_reclaim(sc))
-		return vm_swappiness;
-	return mem_cgroup_swappiness(sc->target_mem_cgroup);
-}
-
 enum scan_balance {
 	SCAN_EQUAL,
 	SCAN_FRACT,
@@ -1866,6 +1878,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	bool force_scan = false;
 	unsigned long ap, fp;
 	enum lru_list lru;
+	bool some_scanned;
+	int pass;
 
 	/*
 	 * If the zone or memcg is small, nr[l] can be 0.  This
@@ -1895,7 +1909,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	 * using the memory controller's swap limit feature would be
 	 * too expensive.
 	 */
-	if (!global_reclaim(sc) && !vmscan_swappiness(sc)) {
+	if (!global_reclaim(sc) && !sc->swappiness) {
 		scan_balance = SCAN_FILE;
 		goto out;
 	}
@@ -1905,7 +1919,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	 * system is close to OOM, scan both anon and file equally
 	 * (unless the swappiness setting disagrees with swapping).
 	 */
-	if (!sc->priority && vmscan_swappiness(sc)) {
+	if (!sc->priority && sc->swappiness) {
 		scan_balance = SCAN_EQUAL;
 		goto out;
 	}
@@ -1916,6 +1930,24 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 		get_lru_size(lruvec, LRU_INACTIVE_FILE);
 
 	/*
+	 * Prevent the reclaimer from falling into the cache trap: as
+	 * cache pages start out inactive, every cache fault will tip
+	 * the scan balance towards the file LRU.  And as the file LRU
+	 * shrinks, so does the window for rotation from references.
+	 * This means we have a runaway feedback loop where a tiny
+	 * thrashing file LRU becomes infinitely more attractive than
+	 * anon pages.  Try to detect this based on file LRU size.
+	 */
+	if (global_reclaim(sc)) {
+		unsigned long free = zone_page_state(zone, NR_FREE_PAGES);
+
+		if (unlikely(file + free <= high_wmark_pages(zone))) {
+			scan_balance = SCAN_ANON;
+			goto out;
+		}
+	}
+
+	/*
 	 * There is enough inactive page cache, do not reclaim
 	 * anything from the anonymous working set right now.
 	 */
@@ -1930,7 +1962,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	 * With swappiness at 100, anonymous and file have the same priority.
 	 * This scanning priority is essentially the inverse of IO cost.
 	 */
-	anon_prio = vmscan_swappiness(sc);
+	anon_prio = sc->swappiness;
 	file_prio = 200 - anon_prio;
 
 	/*
@@ -1971,39 +2003,49 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	fraction[1] = fp;
 	denominator = ap + fp + 1;
 out:
-	for_each_evictable_lru(lru) {
-		int file = is_file_lru(lru);
-		unsigned long size;
-		unsigned long scan;
+	some_scanned = false;
+	/* Only use force_scan on second pass. */
+	for (pass = 0; !some_scanned && pass < 2; pass++) {
+		for_each_evictable_lru(lru) {
+			int file = is_file_lru(lru);
+			unsigned long size;
+			unsigned long scan;
 
-		size = get_lru_size(lruvec, lru);
-		scan = size >> sc->priority;
+			size = get_lru_size(lruvec, lru);
+			scan = size >> sc->priority;
 
-		if (!scan && force_scan)
-			scan = min(size, SWAP_CLUSTER_MAX);
+			if (!scan && pass && force_scan)
+				scan = min(size, SWAP_CLUSTER_MAX);
 
-		switch (scan_balance) {
-		case SCAN_EQUAL:
-			/* Scan lists relative to size */
-			break;
-		case SCAN_FRACT:
+			switch (scan_balance) {
+			case SCAN_EQUAL:
+				/* Scan lists relative to size */
+				break;
+			case SCAN_FRACT:
+				/*
+				 * Scan types proportional to swappiness and
+				 * their relative recent reclaim efficiency.
+				 */
+				scan = div64_u64(scan * fraction[file],
+							denominator);
+				break;
+			case SCAN_FILE:
+			case SCAN_ANON:
+				/* Scan one type exclusively */
+				if ((scan_balance == SCAN_FILE) != file)
+					scan = 0;
+				break;
+			default:
+				/* Look ma, no brain */
+				BUG();
+			}
+			nr[lru] = scan;
 			/*
-			 * Scan types proportional to swappiness and
-			 * their relative recent reclaim efficiency.
+			 * Skip the second pass and don't force_scan,
+			 * if we found something to scan.
 			 */
-			scan = div64_u64(scan * fraction[file], denominator);
-			break;
-		case SCAN_FILE:
-		case SCAN_ANON:
-			/* Scan one type exclusively */
-			if ((scan_balance == SCAN_FILE) != file)
-				scan = 0;
-			break;
-		default:
-			/* Look ma, no brain */
-			BUG();
+			some_scanned |= !!scan;
 		}
-		nr[lru] = scan;
 	}
 }
 
@@ -2019,13 +2061,27 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 	unsigned long nr_reclaimed = 0;
 	unsigned long nr_to_reclaim = sc->nr_to_reclaim;
 	struct blk_plug plug;
-	bool scan_adjusted = false;
+	bool scan_adjusted;
 
 	get_scan_count(lruvec, sc, nr);
 
 	/* Record the original scan target for proportional adjustments later */
 	memcpy(targets, nr, sizeof(nr));
 
+	/*
+	 * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
+	 * event that can occur when there is little memory pressure e.g.
+	 * multiple streaming readers/writers. Hence, we do not abort scanning
+	 * when the requested number of pages are reclaimed when scanning at
+	 * DEF_PRIORITY on the assumption that the fact we are direct
+	 * reclaiming implies that kswapd is not keeping up and it is best to
+	 * do a batch of work at once. For memcg reclaim one check is made to
+	 * abort proportional reclaim if either the file or anon lru has already
+	 * dropped to zero at the first pass.
	 */
+	scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
+			 sc->priority == DEF_PRIORITY);
+
 	blk_start_plug(&plug);
 	while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
 					nr[LRU_INACTIVE_FILE]) {
@@ -2046,17 +2102,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 			continue;
 
 		/*
-		 * For global direct reclaim, reclaim only the number of pages
-		 * requested. Less care is taken to scan proportionally as it
-		 * is more important to minimise direct reclaim stall latency
-		 * than it is to properly age the LRU lists.
-		 */
-		if (global_reclaim(sc) && !current_is_kswapd())
-			break;
-
-		/*
 		 * For kswapd and memcg, reclaim at least the number of pages
-		 * requested. Ensure that the anon and file LRUs shrink
+		 * requested. Ensure that the anon and file LRUs are scanned
 		 * proportionally what was requested by get_scan_count(). We
 		 * stop reclaiming one LRU and reduce the amount scanning
 		 * proportional to the original scan target.
@@ -2064,6 +2111,15 @@
 		nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
 		nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
 
+		/*
+		 * It's just vindictive to attack the larger once the smaller
+		 * has gone to zero.  And given the way we stop scanning the
+		 * smaller below, this makes sure that we only make one nudge
+		 * towards proportionality once we've got nr_to_reclaim.
+		 */
+		if (!nr_file || !nr_anon)
+			break;
+
 		if (nr_file > nr_anon) {
 			unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
 						targets[LRU_ACTIVE_ANON] + 1;
@@ -2206,6 +2262,7 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
 
 			lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 
+			sc->swappiness = mem_cgroup_swappiness(memcg);
 			shrink_lruvec(lruvec, sc);
 
 			/*
@@ -2250,9 +2307,8 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
 	 * there is a buffer of free pages available to give compaction
 	 * a reasonable chance of completing and allocating the page
 	 */
-	balance_gap = min(low_wmark_pages(zone),
-		(zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
-			KSWAPD_ZONE_BALANCE_GAP_RATIO);
+	balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
+			zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
 	watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order);
 	watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0);
 
@@ -2507,10 +2563,17 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
 
 	for (i = 0; i <= ZONE_NORMAL; i++) {
 		zone = &pgdat->node_zones[i];
+		if (!populated_zone(zone))
+			continue;
+
 		pfmemalloc_reserve += min_wmark_pages(zone);
 		free_pages += zone_page_state(zone, NR_FREE_PAGES);
 	}
 
+	/* If there are no reserves (unexpected config) then do not throttle */
+	if (!pfmemalloc_reserve)
+		return true;
+
 	wmark_ok = free_pages > pfmemalloc_reserve / 2;
 
 	/* kswapd must be awake if processes are being throttled */
@@ -2535,9 +2598,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat)
 static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
 					nodemask_t *nodemask)
 {
+	struct zoneref *z;
 	struct zone *zone;
-	int high_zoneidx = gfp_zone(gfp_mask);
-	pg_data_t *pgdat;
+	pg_data_t *pgdat = NULL;
 
 	/*
 	 * Kernel threads should not be throttled as they may be indirectly
@@ -2556,10 +2619,34 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
 	if (fatal_signal_pending(current))
 		goto out;
 
-	/* Check if the pfmemalloc reserves are ok */
-	first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone);
-	pgdat = zone->zone_pgdat;
-	if (pfmemalloc_watermark_ok(pgdat))
+	/*
+	 * Check if the pfmemalloc reserves are ok by finding the first node
+	 * with a usable ZONE_NORMAL or lower zone. The expectation is that
+	 * GFP_KERNEL will be required for allocating network buffers when
+	 * swapping over the network so ZONE_HIGHMEM is unusable.
+	 *
+	 * Throttling is based on the first usable node and throttled processes
+	 * wait on a queue until kswapd makes progress and wakes them. There
+	 * is an affinity then between processes waking up and where reclaim
+	 * progress has been made assuming the process wakes on the same node.
+	 * More importantly, processes running on remote nodes will not compete
+	 * for remote pfmemalloc reserves and processes on different nodes
+	 * should make reasonable progress.
+	 */
+	for_each_zone_zonelist_nodemask(zone, z, zonelist,
+					gfp_mask, nodemask) {
+		if (zone_idx(zone) > ZONE_NORMAL)
+			continue;
+
+		/* Throttle based on the first usable node */
+		pgdat = zone->zone_pgdat;
+		if (pfmemalloc_watermark_ok(pgdat))
+			goto out;
+		break;
+	}
+
+	/* If no zone was usable by the allocation flags then do not throttle */
+	if (!pgdat)
 		goto out;
 
 	/* Account for the throttling */
@@ -2642,6 +2729,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
 		.may_swap = !noswap,
 		.order = 0,
 		.priority = 0,
+		.swappiness = mem_cgroup_swappiness(memcg),
 		.target_mem_cgroup = memcg,
 	};
 	struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
@@ -2873,9 +2961,8 @@ static bool kswapd_shrink_zone(struct zone *zone,
 	 * high wmark plus a "gap" where the gap is either the low
 	 * watermark or 1% of the zone, whichever is smaller.
 	 */
-	balance_gap = min(low_wmark_pages(zone),
-		(zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) /
-		KSWAPD_ZONE_BALANCE_GAP_RATIO);
+	balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
+			zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
 
 	/*
 	 * If there is no low memory pressure or the zone is balanced then no
@@ -3284,7 +3371,10 @@ static int kswapd(void *p)
 		}
 	}
 
+	tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
 	current->reclaim_state = NULL;
+	lockdep_clear_current_reclaim_state();
+
 	return 0;
 }
 
@@ -3404,7 +3494,7 @@ int kswapd_run(int nid)
 
 /*
 * Called by memory hotplug when all memory in a node is offlined.  Caller must
- * hold lock_memory_hotplug().
+ * hold mem_hotplug_begin/end().
 */
 void kswapd_stop(int nid)
 {
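The two balance_gap hunks replace an open-coded round-up division with DIV_ROUND_UP(), which computes the same value. Below is a minimal user-space sketch of that equivalence, illustrative only and not kernel code: the macro and the ratio of 100 (the "1% of the zone" mentioned in the comment above) are re-declared here just to keep the example self-contained.

#include <assert.h>
#include <stdio.h>

/* same shape as the kernel's DIV_ROUND_UP() helper */
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* the ratio mm/vmscan.c uses, i.e. roughly 1% of the zone */
#define KSWAPD_ZONE_BALANCE_GAP_RATIO	100

int main(void)
{
	unsigned long managed_pages;

	for (managed_pages = 0; managed_pages < 1000000; managed_pages++) {
		unsigned long old_gap =
			(managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO - 1) /
				KSWAPD_ZONE_BALANCE_GAP_RATIO;
		unsigned long new_gap = DIV_ROUND_UP(managed_pages,
				KSWAPD_ZONE_BALANCE_GAP_RATIO);

		/* both forms round the quotient up, so they always agree */
		assert(old_gap == new_gap);
	}
	printf("open-coded round-up and DIV_ROUND_UP agree\n");
	return 0;
}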
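The get_scan_count() hunk also changes when force_scan takes effect: the natural scan targets are computed first, and only if every evictable LRU rounded down to zero does a second pass re-run with force_scan honoured. A minimal user-space model of just that control flow follows; the array length, list sizes and helper names are illustrative stand-ins, not the kernel's data structures.

#include <stdbool.h>
#include <stdio.h>

#define NR_EVICTABLE_LRUS	4	/* stand-in for the evictable LRU lists */
#define SWAP_CLUSTER_MAX	32UL	/* matches the kernel constant */
#define DEF_PRIORITY		12	/* matches the kernel constant */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

/*
 * Two-pass loop as in the hunk above: pass 0 is the normal calculation,
 * pass 1 only runs (and only honours force_scan) when pass 0 produced
 * no work at all.
 */
static void compute_scan(const unsigned long size[NR_EVICTABLE_LRUS],
			 int priority, bool force_scan,
			 unsigned long nr[NR_EVICTABLE_LRUS])
{
	bool some_scanned = false;
	int pass, lru;

	for (pass = 0; !some_scanned && pass < 2; pass++) {
		for (lru = 0; lru < NR_EVICTABLE_LRUS; lru++) {
			unsigned long scan = size[lru] >> priority;

			if (!scan && pass && force_scan)
				scan = min_ul(size[lru], SWAP_CLUSTER_MAX);

			nr[lru] = scan;
			some_scanned |= !!scan;
		}
	}
}

int main(void)
{
	/* tiny lists: size >> DEF_PRIORITY is zero for all of them ... */
	const unsigned long size[NR_EVICTABLE_LRUS] = { 40, 10, 25, 0 };
	unsigned long nr[NR_EVICTABLE_LRUS];
	int lru;

	compute_scan(size, DEF_PRIORITY, true, nr);

	/* ... so the second pass bumps the non-empty lists to a small batch */
	for (lru = 0; lru < NR_EVICTABLE_LRUS; lru++)
		printf("lru %d: scan %lu\n", lru, nr[lru]);
	return 0;
}

As the hunk shows, the effect of gating force_scan on the second pass is that the minimum SWAP_CLUSTER_MAX bump is only applied when the first pass found nothing to scan at all, instead of whenever an individual list happened to round down to zero.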
