Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--   mm/memcontrol.c   737
1 files changed, 305 insertions, 432 deletions
| diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 29501f040568..a2c7bcb0e6eb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -80,7 +80,7 @@ int do_swap_account __read_mostly;  #ifdef CONFIG_MEMCG_SWAP_ENABLED  static int really_do_swap_account __initdata = 1;  #else -static int really_do_swap_account __initdata = 0; +static int really_do_swap_account __initdata;  #endif  #else @@ -357,10 +357,9 @@ struct mem_cgroup {  	struct cg_proto tcp_mem;  #endif  #if defined(CONFIG_MEMCG_KMEM) -	/* analogous to slab_common's slab_caches list. per-memcg */ +	/* analogous to slab_common's slab_caches list, but per-memcg; +	 * protected by memcg_slab_mutex */  	struct list_head memcg_slab_caches; -	/* Not a spinlock, we can take a lot of time walking the list */ -	struct mutex slab_caches_mutex;          /* Index in the kmem_cache->memcg_params->memcg_caches array */  	int kmemcg_id;  #endif @@ -527,18 +526,14 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)  static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)  { -	/* -	 * The ID of the root cgroup is 0, but memcg treat 0 as an -	 * invalid ID, so we return (cgroup_id + 1). -	 */ -	return memcg->css.cgroup->id + 1; +	return memcg->css.id;  }  static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)  {  	struct cgroup_subsys_state *css; -	css = css_from_id(id - 1, &memory_cgrp_subsys); +	css = css_from_id(id, &memory_cgrp_subsys);  	return mem_cgroup_from_css(css);  } @@ -571,7 +566,8 @@ void sock_update_memcg(struct sock *sk)  		memcg = mem_cgroup_from_task(current);  		cg_proto = sk->sk_prot->proto_cgroup(memcg);  		if (!mem_cgroup_is_root(memcg) && -		    memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) { +		    memcg_proto_active(cg_proto) && +		    css_tryget_online(&memcg->css)) {  			sk->sk_cgrp = cg_proto;  		}  		rcu_read_unlock(); @@ -677,9 +673,11 @@ static void disarm_static_keys(struct mem_cgroup *memcg)  static void drain_all_stock_async(struct mem_cgroup *memcg);  static struct mem_cgroup_per_zone * -mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) +mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)  { -	VM_BUG_ON((unsigned)nid >= nr_node_ids); +	int nid = zone_to_nid(zone); +	int zid = zone_idx(zone); +  	return &memcg->nodeinfo[nid]->zoneinfo[zid];  } @@ -689,12 +687,12 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)  }  static struct mem_cgroup_per_zone * -page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) +mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)  {  	int nid = page_to_nid(page);  	int zid = page_zonenum(page); -	return mem_cgroup_zoneinfo(memcg, nid, zid); +	return &memcg->nodeinfo[nid]->zoneinfo[zid];  }  static struct mem_cgroup_tree_per_zone * @@ -712,11 +710,9 @@ soft_limit_tree_from_page(struct page *page)  	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];  } -static void -__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, -				struct mem_cgroup_per_zone *mz, -				struct mem_cgroup_tree_per_zone *mctz, -				unsigned long long new_usage_in_excess) +static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_zone *mz, +					 struct mem_cgroup_tree_per_zone *mctz, +					 unsigned long long new_usage_in_excess)  {  	struct rb_node **p = &mctz->rb_root.rb_node;  	struct rb_node *parent = NULL; @@ -746,10 +742,8 @@ __mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,  	mz->on_tree = true;  } -static void 
-__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, -				struct mem_cgroup_per_zone *mz, -				struct mem_cgroup_tree_per_zone *mctz) +static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, +					 struct mem_cgroup_tree_per_zone *mctz)  {  	if (!mz->on_tree)  		return; @@ -757,13 +751,11 @@ __mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,  	mz->on_tree = false;  } -static void -mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, -				struct mem_cgroup_per_zone *mz, -				struct mem_cgroup_tree_per_zone *mctz) +static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, +				       struct mem_cgroup_tree_per_zone *mctz)  {  	spin_lock(&mctz->lock); -	__mem_cgroup_remove_exceeded(memcg, mz, mctz); +	__mem_cgroup_remove_exceeded(mz, mctz);  	spin_unlock(&mctz->lock);  } @@ -773,16 +765,14 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)  	unsigned long long excess;  	struct mem_cgroup_per_zone *mz;  	struct mem_cgroup_tree_per_zone *mctz; -	int nid = page_to_nid(page); -	int zid = page_zonenum(page); -	mctz = soft_limit_tree_from_page(page); +	mctz = soft_limit_tree_from_page(page);  	/*  	 * Necessary to update all ancestors when hierarchy is used.  	 * because their event counter is not touched.  	 */  	for (; memcg; memcg = parent_mem_cgroup(memcg)) { -		mz = mem_cgroup_zoneinfo(memcg, nid, zid); +		mz = mem_cgroup_page_zoneinfo(memcg, page);  		excess = res_counter_soft_limit_excess(&memcg->res);  		/*  		 * We have to update the tree if mz is on RB-tree or @@ -792,12 +782,12 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)  			spin_lock(&mctz->lock);  			/* if on-tree, remove it */  			if (mz->on_tree) -				__mem_cgroup_remove_exceeded(memcg, mz, mctz); +				__mem_cgroup_remove_exceeded(mz, mctz);  			/*  			 * Insert again. mz->usage_in_excess will be updated.  			 * If excess is 0, no tree ops.  			 */ -			__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); +			__mem_cgroup_insert_exceeded(mz, mctz, excess);  			spin_unlock(&mctz->lock);  		}  	} @@ -805,15 +795,15 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)  static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)  { -	int node, zone; -	struct mem_cgroup_per_zone *mz;  	struct mem_cgroup_tree_per_zone *mctz; +	struct mem_cgroup_per_zone *mz; +	int nid, zid; -	for_each_node(node) { -		for (zone = 0; zone < MAX_NR_ZONES; zone++) { -			mz = mem_cgroup_zoneinfo(memcg, node, zone); -			mctz = soft_limit_tree_node_zone(node, zone); -			mem_cgroup_remove_exceeded(memcg, mz, mctz); +	for_each_node(nid) { +		for (zid = 0; zid < MAX_NR_ZONES; zid++) { +			mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; +			mctz = soft_limit_tree_node_zone(nid, zid); +			mem_cgroup_remove_exceeded(mz, mctz);  		}  	}  } @@ -836,9 +826,9 @@ retry:  	 * we will to add it back at the end of reclaim to its correct  	 * position in the tree.  	 
*/ -	__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); +	__mem_cgroup_remove_exceeded(mz, mctz);  	if (!res_counter_soft_limit_excess(&mz->memcg->res) || -		!css_tryget(&mz->memcg->css)) +	    !css_tryget_online(&mz->memcg->css))  		goto retry;  done:  	return mz; @@ -947,8 +937,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,  	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);  } -unsigned long -mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) +unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)  {  	struct mem_cgroup_per_zone *mz; @@ -956,46 +945,38 @@ mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)  	return mz->lru_size[lru];  } -static unsigned long -mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, -			unsigned int lru_mask) -{ -	struct mem_cgroup_per_zone *mz; -	enum lru_list lru; -	unsigned long ret = 0; - -	mz = mem_cgroup_zoneinfo(memcg, nid, zid); - -	for_each_lru(lru) { -		if (BIT(lru) & lru_mask) -			ret += mz->lru_size[lru]; -	} -	return ret; -} - -static unsigned long -mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, -			int nid, unsigned int lru_mask) +static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, +						  int nid, +						  unsigned int lru_mask)  { -	u64 total = 0; +	unsigned long nr = 0;  	int zid; -	for (zid = 0; zid < MAX_NR_ZONES; zid++) -		total += mem_cgroup_zone_nr_lru_pages(memcg, -						nid, zid, lru_mask); +	VM_BUG_ON((unsigned)nid >= nr_node_ids); -	return total; +	for (zid = 0; zid < MAX_NR_ZONES; zid++) { +		struct mem_cgroup_per_zone *mz; +		enum lru_list lru; + +		for_each_lru(lru) { +			if (!(BIT(lru) & lru_mask)) +				continue; +			mz = &memcg->nodeinfo[nid]->zoneinfo[zid]; +			nr += mz->lru_size[lru]; +		} +	} +	return nr;  }  static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,  			unsigned int lru_mask)  { +	unsigned long nr = 0;  	int nid; -	u64 total = 0;  	for_each_node_state(nid, N_MEMORY) -		total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); -	return total; +		nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); +	return nr;  }  static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, @@ -1077,10 +1058,19 @@ static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)  	rcu_read_lock();  	do { -		memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); -		if (unlikely(!memcg)) +		/* +		 * Page cache insertions can happen withou an +		 * actual mm context, e.g. during disk probing +		 * on boot, loopback IO, acct() writes etc. +		 */ +		if (unlikely(!mm))  			memcg = root_mem_cgroup; -	} while (!css_tryget(&memcg->css)); +		else { +			memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); +			if (unlikely(!memcg)) +				memcg = root_mem_cgroup; +		} +	} while (!css_tryget_online(&memcg->css));  	rcu_read_unlock();  	return memcg;  } @@ -1117,7 +1107,8 @@ skip_node:  	 */  	if (next_css) {  		if ((next_css == &root->css) || -		    ((next_css->flags & CSS_ONLINE) && css_tryget(next_css))) +		    ((next_css->flags & CSS_ONLINE) && +		     css_tryget_online(next_css)))  			return mem_cgroup_from_css(next_css);  		prev_css = next_css; @@ -1163,7 +1154,7 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,  		 * would be returned all the time.  		 
*/  		if (position && position != root && -				!css_tryget(&position->css)) +		    !css_tryget_online(&position->css))  			position = NULL;  	}  	return position; @@ -1234,11 +1225,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,  		int uninitialized_var(seq);  		if (reclaim) { -			int nid = zone_to_nid(reclaim->zone); -			int zid = zone_idx(reclaim->zone);  			struct mem_cgroup_per_zone *mz; -			mz = mem_cgroup_zoneinfo(root, nid, zid); +			mz = mem_cgroup_zone_zoneinfo(root, reclaim->zone);  			iter = &mz->reclaim_iter[reclaim->priority];  			if (prev && reclaim->generation != iter->generation) {  				iter->last_visited = NULL; @@ -1345,7 +1334,7 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,  		goto out;  	} -	mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone)); +	mz = mem_cgroup_zone_zoneinfo(memcg, zone);  	lruvec = &mz->lruvec;  out:  	/* @@ -1404,7 +1393,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)  	if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)  		pc->mem_cgroup = memcg = root_mem_cgroup; -	mz = page_cgroup_zoneinfo(memcg, page); +	mz = mem_cgroup_page_zoneinfo(memcg, page);  	lruvec = &mz->lruvec;  out:  	/* @@ -1542,7 +1531,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)  int mem_cgroup_swappiness(struct mem_cgroup *memcg)  {  	/* root ? */ -	if (!css_parent(&memcg->css)) +	if (mem_cgroup_disabled() || !memcg->css.parent)  		return vm_swappiness;  	return memcg->swappiness; @@ -1586,23 +1575,12 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg)  }  /* - * 2 routines for checking "mem" is under move_account() or not. + * A routine for checking "mem" is under move_account() or not.   * - * mem_cgroup_stolen() -  checking whether a cgroup is mc.from or not. This - *			  is used for avoiding races in accounting.  If true, - *			  pc->mem_cgroup may be overwritten. - * - * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or - *			  under hierarchy of moving cgroups. This is for - *			  waiting at hith-memory prressure caused by "move". + * Checking a cgroup is mc.from or mc.to or under hierarchy of + * moving cgroups. This is for waiting at high-memory pressure + * caused by "move".   */ - -static bool mem_cgroup_stolen(struct mem_cgroup *memcg) -{ -	VM_BUG_ON(!rcu_read_lock_held()); -	return atomic_read(&memcg->moving_account) > 0; -} -  static bool mem_cgroup_under_move(struct mem_cgroup *memcg)  {  	struct mem_cgroup *from; @@ -1645,7 +1623,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)   * Take this lock when   * - a code tries to modify page's memcg while it's USED.   * - a code tries to modify page state accounting in a memcg. - * see mem_cgroup_stolen(), too.   */  static void move_lock_mem_cgroup(struct mem_cgroup *memcg,  				  unsigned long *flags) @@ -2280,12 +2257,11 @@ cleanup:  }  /* - * Currently used to update mapped file statistics, but the routine can be - * generalized to update other statistics as well. + * Used to update mapped file or writeback or other statistics.   *   * Notes: Race condition   * - * We usually use page_cgroup_lock() for accessing page_cgroup member but + * We usually use lock_page_cgroup() for accessing page_cgroup member but   * it tends to be costly. But considering some conditions, we doesn't need   * to do so _always_.   * @@ -2299,8 +2275,8 @@ cleanup:   * by flags.   *   * Considering "move", this is an only case we see a race. 
To make the race - * small, we check mm->moving_account and detect there are possibility of race - * If there is, we take a lock. + * small, we check memcg->moving_account and detect there are possibility + * of race or not. If there is, we take a lock.   */  void __mem_cgroup_begin_update_page_stat(struct page *page, @@ -2318,9 +2294,10 @@ again:  	 * If this memory cgroup is not under account moving, we don't  	 * need to take move_lock_mem_cgroup(). Because we already hold  	 * rcu_read_lock(), any calls to move_account will be delayed until -	 * rcu_read_unlock() if mem_cgroup_stolen() == true. +	 * rcu_read_unlock().  	 */ -	if (!mem_cgroup_stolen(memcg)) +	VM_BUG_ON(!rcu_read_lock_held()); +	if (atomic_read(&memcg->moving_account) <= 0)  		return;  	move_lock_mem_cgroup(memcg, flags); @@ -2428,7 +2405,7 @@ static void drain_stock(struct memcg_stock_pcp *stock)   */  static void drain_local_stock(struct work_struct *dummy)  { -	struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); +	struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock);  	drain_stock(stock);  	clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);  } @@ -2675,7 +2652,8 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg,  	 * free their memory.  	 */  	if (unlikely(test_thread_flag(TIF_MEMDIE) || -		     fatal_signal_pending(current))) +		     fatal_signal_pending(current) || +		     current->flags & PF_EXITING))  		goto bypass;  	if (unlikely(task_in_memcg_oom(current))) @@ -2789,9 +2767,9 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,  /*   * A helper function to get mem_cgroup from ID. must be called under - * rcu_read_lock().  The caller is responsible for calling css_tryget if - * the mem_cgroup is used for charging. (dropping refcnt from swap can be - * called against removed memcg.) + * rcu_read_lock().  The caller is responsible for calling + * css_tryget_online() if the mem_cgroup is used for charging. (dropping + * refcnt from swap can be called against removed memcg.)   */  static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)  { @@ -2814,14 +2792,14 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)  	lock_page_cgroup(pc);  	if (PageCgroupUsed(pc)) {  		memcg = pc->mem_cgroup; -		if (memcg && !css_tryget(&memcg->css)) +		if (memcg && !css_tryget_online(&memcg->css))  			memcg = NULL;  	} else if (PageSwapCache(page)) {  		ent.val = page_private(page);  		id = lookup_swap_cgroup_id(ent);  		rcu_read_lock();  		memcg = mem_cgroup_lookup(id); -		if (memcg && !css_tryget(&memcg->css)) +		if (memcg && !css_tryget_online(&memcg->css))  			memcg = NULL;  		rcu_read_unlock();  	} @@ -2903,6 +2881,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,  static DEFINE_MUTEX(set_limit_mutex);  #ifdef CONFIG_MEMCG_KMEM +/* + * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or + * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists. 
+ */ +static DEFINE_MUTEX(memcg_slab_mutex); +  static DEFINE_MUTEX(activate_kmem_mutex);  static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) @@ -2935,10 +2919,10 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)  	print_slabinfo_header(m); -	mutex_lock(&memcg->slab_caches_mutex); +	mutex_lock(&memcg_slab_mutex);  	list_for_each_entry(params, &memcg->memcg_slab_caches, list)  		cache_show(memcg_params_to_cache(params), m); -	mutex_unlock(&memcg->slab_caches_mutex); +	mutex_unlock(&memcg_slab_mutex);  	return 0;  } @@ -3040,8 +3024,6 @@ void memcg_update_array_size(int num)  		memcg_limited_groups_array_size = memcg_caches_array_size(num);  } -static void kmem_cache_destroy_work_func(struct work_struct *w); -  int memcg_update_cache_size(struct kmem_cache *s, int num_groups)  {  	struct memcg_cache_params *cur_params = s->memcg_params; @@ -3094,29 +3076,6 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)  	return 0;  } -char *memcg_create_cache_name(struct mem_cgroup *memcg, -			      struct kmem_cache *root_cache) -{ -	static char *buf = NULL; - -	/* -	 * We need a mutex here to protect the shared buffer. Since this is -	 * expected to be called only on cache creation, we can employ the -	 * slab_mutex for that purpose. -	 */ -	lockdep_assert_held(&slab_mutex); - -	if (!buf) { -		buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); -		if (!buf) -			return NULL; -	} - -	cgroup_name(memcg->css.cgroup, buf, NAME_MAX + 1); -	return kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, -			 memcg_cache_id(memcg), buf); -} -  int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,  			     struct kmem_cache *root_cache)  { @@ -3138,8 +3097,6 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,  	if (memcg) {  		s->memcg_params->memcg = memcg;  		s->memcg_params->root_cache = root_cache; -		INIT_WORK(&s->memcg_params->destroy, -				kmem_cache_destroy_work_func);  		css_get(&memcg->css);  	} else  		s->memcg_params->is_root_cache = true; @@ -3156,24 +3113,37 @@ void memcg_free_cache_params(struct kmem_cache *s)  	kfree(s->memcg_params);  } -void memcg_register_cache(struct kmem_cache *s) +static void memcg_register_cache(struct mem_cgroup *memcg, +				 struct kmem_cache *root_cache)  { -	struct kmem_cache *root; -	struct mem_cgroup *memcg; +	static char memcg_name_buf[NAME_MAX + 1]; /* protected by +						     memcg_slab_mutex */ +	struct kmem_cache *cachep;  	int id; -	if (is_root_cache(s)) +	lockdep_assert_held(&memcg_slab_mutex); + +	id = memcg_cache_id(memcg); + +	/* +	 * Since per-memcg caches are created asynchronously on first +	 * allocation (see memcg_kmem_get_cache()), several threads can try to +	 * create the same cache, but only one of them may succeed. +	 */ +	if (cache_from_memcg_idx(root_cache, id))  		return; +	cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); +	cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf);  	/* -	 * Holding the slab_mutex assures nobody will touch the memcg_caches -	 * array while we are modifying it. +	 * If we could not create a memcg cache, do not complain, because +	 * that's not critical at all as we can always proceed with the root +	 * cache.  	 
*/ -	lockdep_assert_held(&slab_mutex); +	if (!cachep) +		return; -	root = s->memcg_params->root_cache; -	memcg = s->memcg_params->memcg; -	id = memcg_cache_id(memcg); +	list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);  	/*  	 * Since readers won't lock (see cache_from_memcg_idx()), we need a @@ -3182,49 +3152,30 @@ void memcg_register_cache(struct kmem_cache *s)  	 */  	smp_wmb(); -	/* -	 * Initialize the pointer to this cache in its parent's memcg_params -	 * before adding it to the memcg_slab_caches list, otherwise we can -	 * fail to convert memcg_params_to_cache() while traversing the list. -	 */ -	VM_BUG_ON(root->memcg_params->memcg_caches[id]); -	root->memcg_params->memcg_caches[id] = s; - -	mutex_lock(&memcg->slab_caches_mutex); -	list_add(&s->memcg_params->list, &memcg->memcg_slab_caches); -	mutex_unlock(&memcg->slab_caches_mutex); +	BUG_ON(root_cache->memcg_params->memcg_caches[id]); +	root_cache->memcg_params->memcg_caches[id] = cachep;  } -void memcg_unregister_cache(struct kmem_cache *s) +static void memcg_unregister_cache(struct kmem_cache *cachep)  { -	struct kmem_cache *root; +	struct kmem_cache *root_cache;  	struct mem_cgroup *memcg;  	int id; -	if (is_root_cache(s)) -		return; +	lockdep_assert_held(&memcg_slab_mutex); -	/* -	 * Holding the slab_mutex assures nobody will touch the memcg_caches -	 * array while we are modifying it. -	 */ -	lockdep_assert_held(&slab_mutex); +	BUG_ON(is_root_cache(cachep)); -	root = s->memcg_params->root_cache; -	memcg = s->memcg_params->memcg; +	root_cache = cachep->memcg_params->root_cache; +	memcg = cachep->memcg_params->memcg;  	id = memcg_cache_id(memcg); -	mutex_lock(&memcg->slab_caches_mutex); -	list_del(&s->memcg_params->list); -	mutex_unlock(&memcg->slab_caches_mutex); +	BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); +	root_cache->memcg_params->memcg_caches[id] = NULL; -	/* -	 * Clear the pointer to this cache in its parent's memcg_params only -	 * after removing it from the memcg_slab_caches list, otherwise we can -	 * fail to convert memcg_params_to_cache() while traversing the list. -	 */ -	VM_BUG_ON(root->memcg_params->memcg_caches[id] != s); -	root->memcg_params->memcg_caches[id] = NULL; +	list_del(&cachep->memcg_params->list); + +	kmem_cache_destroy(cachep);  }  /* @@ -3258,144 +3209,61 @@ static inline void memcg_resume_kmem_account(void)  	current->memcg_kmem_skip_account--;  } -static void kmem_cache_destroy_work_func(struct work_struct *w) -{ -	struct kmem_cache *cachep; -	struct memcg_cache_params *p; - -	p = container_of(w, struct memcg_cache_params, destroy); - -	cachep = memcg_params_to_cache(p); - -	/* -	 * If we get down to 0 after shrink, we could delete right away. -	 * However, memcg_release_pages() already puts us back in the workqueue -	 * in that case. If we proceed deleting, we'll get a dangling -	 * reference, and removing the object from the workqueue in that case -	 * is unnecessary complication. We are not a fast path. -	 * -	 * Note that this case is fundamentally different from racing with -	 * shrink_slab(): if memcg_cgroup_destroy_cache() is called in -	 * kmem_cache_shrink, not only we would be reinserting a dead cache -	 * into the queue, but doing so from inside the worker racing to -	 * destroy it. 
-	 * -	 * So if we aren't down to zero, we'll just schedule a worker and try -	 * again -	 */ -	if (atomic_read(&cachep->memcg_params->nr_pages) != 0) -		kmem_cache_shrink(cachep); -	else -		kmem_cache_destroy(cachep); -} - -void mem_cgroup_destroy_cache(struct kmem_cache *cachep) -{ -	if (!cachep->memcg_params->dead) -		return; - -	/* -	 * There are many ways in which we can get here. -	 * -	 * We can get to a memory-pressure situation while the delayed work is -	 * still pending to run. The vmscan shrinkers can then release all -	 * cache memory and get us to destruction. If this is the case, we'll -	 * be executed twice, which is a bug (the second time will execute over -	 * bogus data). In this case, cancelling the work should be fine. -	 * -	 * But we can also get here from the worker itself, if -	 * kmem_cache_shrink is enough to shake all the remaining objects and -	 * get the page count to 0. In this case, we'll deadlock if we try to -	 * cancel the work (the worker runs with an internal lock held, which -	 * is the same lock we would hold for cancel_work_sync().) -	 * -	 * Since we can't possibly know who got us here, just refrain from -	 * running if there is already work pending -	 */ -	if (work_pending(&cachep->memcg_params->destroy)) -		return; -	/* -	 * We have to defer the actual destroying to a workqueue, because -	 * we might currently be in a context that cannot sleep. -	 */ -	schedule_work(&cachep->memcg_params->destroy); -} - -int __kmem_cache_destroy_memcg_children(struct kmem_cache *s) +int __memcg_cleanup_cache_params(struct kmem_cache *s)  {  	struct kmem_cache *c;  	int i, failed = 0; -	/* -	 * If the cache is being destroyed, we trust that there is no one else -	 * requesting objects from it. Even if there are, the sanity checks in -	 * kmem_cache_destroy should caught this ill-case. -	 * -	 * Still, we don't want anyone else freeing memcg_caches under our -	 * noses, which can happen if a new memcg comes to life. As usual, -	 * we'll take the activate_kmem_mutex to protect ourselves against -	 * this. -	 */ -	mutex_lock(&activate_kmem_mutex); +	mutex_lock(&memcg_slab_mutex);  	for_each_memcg_cache_index(i) {  		c = cache_from_memcg_idx(s, i);  		if (!c)  			continue; -		/* -		 * We will now manually delete the caches, so to avoid races -		 * we need to cancel all pending destruction workers and -		 * proceed with destruction ourselves. -		 * -		 * kmem_cache_destroy() will call kmem_cache_shrink internally, -		 * and that could spawn the workers again: it is likely that -		 * the cache still have active pages until this very moment. -		 * This would lead us back to mem_cgroup_destroy_cache. -		 * -		 * But that will not execute at all if the "dead" flag is not -		 * set, so flip it down to guarantee we are in control. 
-		 */ -		c->memcg_params->dead = false; -		cancel_work_sync(&c->memcg_params->destroy); -		kmem_cache_destroy(c); +		memcg_unregister_cache(c);  		if (cache_from_memcg_idx(s, i))  			failed++;  	} -	mutex_unlock(&activate_kmem_mutex); +	mutex_unlock(&memcg_slab_mutex);  	return failed;  } -static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +static void memcg_unregister_all_caches(struct mem_cgroup *memcg)  {  	struct kmem_cache *cachep; -	struct memcg_cache_params *params; +	struct memcg_cache_params *params, *tmp;  	if (!memcg_kmem_is_active(memcg))  		return; -	mutex_lock(&memcg->slab_caches_mutex); -	list_for_each_entry(params, &memcg->memcg_slab_caches, list) { +	mutex_lock(&memcg_slab_mutex); +	list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {  		cachep = memcg_params_to_cache(params); -		cachep->memcg_params->dead = true; -		schedule_work(&cachep->memcg_params->destroy); +		kmem_cache_shrink(cachep); +		if (atomic_read(&cachep->memcg_params->nr_pages) == 0) +			memcg_unregister_cache(cachep);  	} -	mutex_unlock(&memcg->slab_caches_mutex); +	mutex_unlock(&memcg_slab_mutex);  } -struct create_work { +struct memcg_register_cache_work {  	struct mem_cgroup *memcg;  	struct kmem_cache *cachep;  	struct work_struct work;  }; -static void memcg_create_cache_work_func(struct work_struct *w) +static void memcg_register_cache_func(struct work_struct *w)  { -	struct create_work *cw = container_of(w, struct create_work, work); +	struct memcg_register_cache_work *cw = +		container_of(w, struct memcg_register_cache_work, work);  	struct mem_cgroup *memcg = cw->memcg;  	struct kmem_cache *cachep = cw->cachep; -	kmem_cache_create_memcg(memcg, cachep); +	mutex_lock(&memcg_slab_mutex); +	memcg_register_cache(memcg, cachep); +	mutex_unlock(&memcg_slab_mutex); +  	css_put(&memcg->css);  	kfree(cw);  } @@ -3403,12 +3271,12 @@ static void memcg_create_cache_work_func(struct work_struct *w)  /*   * Enqueue the creation of a per-memcg kmem_cache.   */ -static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, -					 struct kmem_cache *cachep) +static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, +					    struct kmem_cache *cachep)  { -	struct create_work *cw; +	struct memcg_register_cache_work *cw; -	cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); +	cw = kmalloc(sizeof(*cw), GFP_NOWAIT);  	if (cw == NULL) {  		css_put(&memcg->css);  		return; @@ -3417,17 +3285,17 @@ static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,  	cw->memcg = memcg;  	cw->cachep = cachep; -	INIT_WORK(&cw->work, memcg_create_cache_work_func); +	INIT_WORK(&cw->work, memcg_register_cache_func);  	schedule_work(&cw->work);  } -static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, -				       struct kmem_cache *cachep) +static void memcg_schedule_register_cache(struct mem_cgroup *memcg, +					  struct kmem_cache *cachep)  {  	/*  	 * We need to stop accounting when we kmalloc, because if the  	 * corresponding kmalloc cache is not yet created, the first allocation -	 * in __memcg_create_cache_enqueue will recurse. +	 * in __memcg_schedule_register_cache will recurse.  	 *  	 * However, it is better to enclose the whole function. Depending on  	 * the debugging options enabled, INIT_WORK(), for instance, can @@ -3436,9 +3304,27 @@ static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,  	 * the safest choice is to do it like this, wrapping the whole function.  	 
*/  	memcg_stop_kmem_account(); -	__memcg_create_cache_enqueue(memcg, cachep); +	__memcg_schedule_register_cache(memcg, cachep);  	memcg_resume_kmem_account();  } + +int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) +{ +	int res; + +	res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, +				PAGE_SIZE << order); +	if (!res) +		atomic_add(1 << order, &cachep->memcg_params->nr_pages); +	return res; +} + +void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) +{ +	memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order); +	atomic_sub(1 << order, &cachep->memcg_params->nr_pages); +} +  /*   * Return the kmem_cache we're supposed to use for a slab allocation.   * We try to use the current memcg's version of the cache. @@ -3477,7 +3363,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,  	}  	/* The corresponding put will be done in the workqueue. */ -	if (!css_tryget(&memcg->css)) +	if (!css_tryget_online(&memcg->css))  		goto out;  	rcu_read_unlock(); @@ -3489,22 +3375,16 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,  	 *  	 * However, there are some clashes that can arrive from locking.  	 * For instance, because we acquire the slab_mutex while doing -	 * kmem_cache_dup, this means no further allocation could happen -	 * with the slab_mutex held. -	 * -	 * Also, because cache creation issue get_online_cpus(), this -	 * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, -	 * that ends up reversed during cpu hotplug. (cpuset allocates -	 * a bunch of GFP_KERNEL memory during cpuup). Due to all that, -	 * better to defer everything. +	 * memcg_create_kmem_cache, this means no further allocation +	 * could happen with the slab_mutex held. So it's better to +	 * defer everything.  	 */ -	memcg_create_cache_enqueue(memcg, cachep); +	memcg_schedule_register_cache(memcg, cachep);  	return cachep;  out:  	rcu_read_unlock();  	return cachep;  } -EXPORT_SYMBOL(__memcg_kmem_get_cache);  /*   * We need to verify if the allocation against current->mm->owner's memcg is @@ -3531,11 +3411,12 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)  	/*  	 * Disabling accounting is only relevant for some specific memcg  	 * internal allocations. Therefore we would initially not have such -	 * check here, since direct calls to the page allocator that are marked -	 * with GFP_KMEMCG only happen outside memcg core. We are mostly -	 * concerned with cache allocations, and by having this test at -	 * memcg_kmem_get_cache, we are already able to relay the allocation to -	 * the root cache and bypass the memcg cache altogether. +	 * check here, since direct calls to the page allocator that are +	 * accounted to kmemcg (alloc_kmem_pages and friends) only happen +	 * outside memcg core. We are mostly concerned with cache allocations, +	 * and by having this test at memcg_kmem_get_cache, we are already able +	 * to relay the allocation to the root cache and bypass the memcg cache +	 * altogether.  	 
*  	 * There is one exception, though: the SLUB allocator does not create  	 * large order caches, but rather service large kmallocs directly from @@ -3622,7 +3503,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)  	memcg_uncharge_kmem(memcg, PAGE_SIZE << order);  }  #else -static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)  {  }  #endif /* CONFIG_MEMCG_KMEM */ @@ -3958,17 +3839,9 @@ int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm,  		return 0;  	} -	/* -	 * Page cache insertions can happen without an actual mm -	 * context, e.g. during disk probing on boot. -	 */ -	if (unlikely(!mm)) -		memcg = root_mem_cgroup; -	else { -		memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); -		if (!memcg) -			return -ENOMEM; -	} +	memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); +	if (!memcg) +		return -ENOMEM;  	__mem_cgroup_commit_charge(memcg, page, 1, type, false);  	return 0;  } @@ -4250,8 +4123,8 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)  	memcg = mem_cgroup_lookup(id);  	if (memcg) {  		/* -		 * We uncharge this because swap is freed. -		 * This memcg can be obsolete one. We avoid calling css_tryget +		 * We uncharge this because swap is freed.  This memcg can +		 * be obsolete one. We avoid calling css_tryget_online().  		 */  		if (!mem_cgroup_is_root(memcg))  			res_counter_uncharge(&memcg->memsw, PAGE_SIZE); @@ -4705,7 +4578,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,  					break;  			} while (1);  		} -		__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); +		__mem_cgroup_remove_exceeded(mz, mctz);  		excess = res_counter_soft_limit_excess(&mz->memcg->res);  		/*  		 * One school of thought says that we should not add @@ -4716,7 +4589,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,  		 * term TODO.  		 */  		/* If excess == 0, no tree ops */ -		__mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); +		__mem_cgroup_insert_exceeded(mz, mctz, excess);  		spin_unlock(&mctz->lock);  		css_put(&mz->memcg->css);  		loop++; @@ -4783,9 +4656,9 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,  		if (mem_cgroup_move_parent(page, pc, memcg)) {  			/* found lock contention or "pc" is obsolete. */  			busy = page; -			cond_resched();  		} else  			busy = NULL; +		cond_resched();  	} while (!list_empty(list));  } @@ -4836,18 +4709,28 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)  	} while (usage > 0);  } +/* + * Test whether @memcg has children, dead or alive.  Note that this + * function doesn't care whether @memcg has use_hierarchy enabled and + * returns %true if there are child csses according to the cgroup + * hierarchy.  Testing use_hierarchy is the caller's responsiblity. + */  static inline bool memcg_has_children(struct mem_cgroup *memcg)  { -	lockdep_assert_held(&memcg_create_mutex); +	bool ret; +  	/* -	 * The lock does not prevent addition or deletion to the list -	 * of children, but it prevents a new child from being -	 * initialized based on this parent in css_online(), so it's -	 * enough to decide whether hierarchically inherited -	 * attributes can still be changed or not. 
+	 * The lock does not prevent addition or deletion of children, but +	 * it prevents a new child from being initialized based on this +	 * parent in css_online(), so it's enough to decide whether +	 * hierarchically inherited attributes can still be changed or not.  	 */ -	return memcg->use_hierarchy && -		!list_empty(&memcg->css.cgroup->children); +	lockdep_assert_held(&memcg_create_mutex); + +	rcu_read_lock(); +	ret = css_next_child(NULL, &memcg->css); +	rcu_read_unlock(); +	return ret;  }  /* @@ -4859,11 +4742,6 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)  static int mem_cgroup_force_empty(struct mem_cgroup *memcg)  {  	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; -	struct cgroup *cgrp = memcg->css.cgroup; - -	/* returns EBUSY if there is a task or if we come here twice. */ -	if (cgroup_has_tasks(cgrp) || !list_empty(&cgrp->children)) -		return -EBUSY;  	/* we call try-to-free pages for make this cgroup empty */  	lru_add_drain_all(); @@ -4883,20 +4761,19 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)  		}  	} -	lru_add_drain(); -	mem_cgroup_reparent_charges(memcg);  	return 0;  } -static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css, -					unsigned int event) +static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, +					    char *buf, size_t nbytes, +					    loff_t off)  { -	struct mem_cgroup *memcg = mem_cgroup_from_css(css); +	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));  	if (mem_cgroup_is_root(memcg))  		return -EINVAL; -	return mem_cgroup_force_empty(memcg); +	return mem_cgroup_force_empty(memcg) ?: nbytes;  }  static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, @@ -4910,7 +4787,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,  {  	int retval = 0;  	struct mem_cgroup *memcg = mem_cgroup_from_css(css); -	struct mem_cgroup *parent_memcg = mem_cgroup_from_css(css_parent(&memcg->css)); +	struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);  	mutex_lock(&memcg_create_mutex); @@ -4927,7 +4804,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,  	 */  	if ((!parent_memcg || !parent_memcg->use_hierarchy) &&  				(val == 1 || val == 0)) { -		if (list_empty(&memcg->css.cgroup->children)) +		if (!memcg_has_children(memcg))  			memcg->use_hierarchy = val;  		else  			retval = -EBUSY; @@ -5044,7 +4921,8 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg,  	 * of course permitted.  	 */  	mutex_lock(&memcg_create_mutex); -	if (cgroup_has_tasks(memcg->css.cgroup) || memcg_has_children(memcg)) +	if (cgroup_has_tasks(memcg->css.cgroup) || +	    (memcg->use_hierarchy && memcg_has_children(memcg)))  		err = -EBUSY;  	mutex_unlock(&memcg_create_mutex);  	if (err) @@ -5061,13 +4939,14 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg,  	 * Make sure we have enough space for this cgroup in each root cache's  	 * memcg_params.  	 */ +	mutex_lock(&memcg_slab_mutex);  	err = memcg_update_all_caches(memcg_id + 1); +	mutex_unlock(&memcg_slab_mutex);  	if (err)  		goto out_rmid;  	memcg->kmemcg_id = memcg_id;  	INIT_LIST_HEAD(&memcg->memcg_slab_caches); -	mutex_init(&memcg->slab_caches_mutex);  	/*  	 * We couldn't have accounted to this cgroup, because it hasn't got the @@ -5145,17 +5024,18 @@ static int memcg_update_kmem_limit(struct mem_cgroup *memcg,   * The user of this function is...   * RES_LIMIT.   
*/ -static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, -			    char *buffer) +static ssize_t mem_cgroup_write(struct kernfs_open_file *of, +				char *buf, size_t nbytes, loff_t off)  { -	struct mem_cgroup *memcg = mem_cgroup_from_css(css); +	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));  	enum res_type type;  	int name;  	unsigned long long val;  	int ret; -	type = MEMFILE_TYPE(cft->private); -	name = MEMFILE_ATTR(cft->private); +	buf = strstrip(buf); +	type = MEMFILE_TYPE(of_cft(of)->private); +	name = MEMFILE_ATTR(of_cft(of)->private);  	switch (name) {  	case RES_LIMIT: @@ -5164,7 +5044,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,  			break;  		}  		/* This function does all necessary parse...reuse it */ -		ret = res_counter_memparse_write_strategy(buffer, &val); +		ret = res_counter_memparse_write_strategy(buf, &val);  		if (ret)  			break;  		if (type == _MEM) @@ -5177,7 +5057,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,  			return -EINVAL;  		break;  	case RES_SOFT_LIMIT: -		ret = res_counter_memparse_write_strategy(buffer, &val); +		ret = res_counter_memparse_write_strategy(buf, &val);  		if (ret)  			break;  		/* @@ -5194,7 +5074,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,  		ret = -EINVAL; /* should be BUG() ? */  		break;  	} -	return ret; +	return ret ?: nbytes;  }  static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, @@ -5207,8 +5087,8 @@ static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,  	if (!memcg->use_hierarchy)  		goto out; -	while (css_parent(&memcg->css)) { -		memcg = mem_cgroup_from_css(css_parent(&memcg->css)); +	while (memcg->css.parent) { +		memcg = mem_cgroup_from_css(memcg->css.parent);  		if (!memcg->use_hierarchy)  			break;  		tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); @@ -5221,14 +5101,15 @@ out:  	*memsw_limit = min_memsw_limit;  } -static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) +static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf, +				size_t nbytes, loff_t off)  { -	struct mem_cgroup *memcg = mem_cgroup_from_css(css); +	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));  	int name;  	enum res_type type; -	type = MEMFILE_TYPE(event); -	name = MEMFILE_ATTR(event); +	type = MEMFILE_TYPE(of_cft(of)->private); +	name = MEMFILE_ATTR(of_cft(of)->private);  	switch (name) {  	case RES_MAX_USAGE: @@ -5253,7 +5134,7 @@ static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event)  		break;  	} -	return 0; +	return nbytes;  }  static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, @@ -5412,7 +5293,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)  		for_each_online_node(nid)  			for (zid = 0; zid < MAX_NR_ZONES; zid++) { -				mz = mem_cgroup_zoneinfo(memcg, nid, zid); +				mz = &memcg->nodeinfo[nid]->zoneinfo[zid];  				rstat = &mz->lruvec.reclaim_stat;  				recent_rotated[0] += rstat->recent_rotated[0]; @@ -5442,22 +5323,14 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,  				       struct cftype *cft, u64 val)  {  	struct mem_cgroup *memcg = mem_cgroup_from_css(css); -	struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); - -	if (val > 100 || !parent) -		return -EINVAL; - -	mutex_lock(&memcg_create_mutex); -	/* If under hierarchy, only empty-root can set this value */ -	if ((parent->use_hierarchy) || 
memcg_has_children(memcg)) { -		mutex_unlock(&memcg_create_mutex); +	if (val > 100)  		return -EINVAL; -	} - -	memcg->swappiness = val; -	mutex_unlock(&memcg_create_mutex); +	if (css->parent) +		memcg->swappiness = val; +	else +		vm_swappiness = val;  	return 0;  } @@ -5789,22 +5662,15 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,  	struct cftype *cft, u64 val)  {  	struct mem_cgroup *memcg = mem_cgroup_from_css(css); -	struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css));  	/* cannot set to root cgroup and only 0 and 1 are allowed */ -	if (!parent || !((val == 0) || (val == 1))) +	if (!css->parent || !((val == 0) || (val == 1)))  		return -EINVAL; -	mutex_lock(&memcg_create_mutex); -	/* oom-kill-disable is a flag for subhierarchy. */ -	if ((parent->use_hierarchy) || memcg_has_children(memcg)) { -		mutex_unlock(&memcg_create_mutex); -		return -EINVAL; -	}  	memcg->oom_kill_disable = val;  	if (!val)  		memcg_oom_recover(memcg); -	mutex_unlock(&memcg_create_mutex); +  	return 0;  } @@ -5844,10 +5710,10 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)  	 * which is then paired with css_put during uncharge resp. here.  	 *  	 * Although this might sound strange as this path is called from -	 * css_offline() when the referencemight have dropped down to 0 -	 * and shouldn't be incremented anymore (css_tryget would fail) -	 * we do not have other options because of the kmem allocations -	 * lifetime. +	 * css_offline() when the referencemight have dropped down to 0 and +	 * shouldn't be incremented anymore (css_tryget_online() would +	 * fail) we do not have other options because of the kmem +	 * allocations lifetime.  	 */  	css_get(&memcg->css); @@ -5966,9 +5832,10 @@ static void memcg_event_ptable_queue_proc(struct file *file,   * Input must be in format '<event_fd> <control_fd> <args>'.   * Interpretation of args is defined by control file implementation.   */ -static int memcg_write_event_control(struct cgroup_subsys_state *css, -				     struct cftype *cft, char *buffer) +static ssize_t memcg_write_event_control(struct kernfs_open_file *of, +					 char *buf, size_t nbytes, loff_t off)  { +	struct cgroup_subsys_state *css = of_css(of);  	struct mem_cgroup *memcg = mem_cgroup_from_css(css);  	struct mem_cgroup_event *event;  	struct cgroup_subsys_state *cfile_css; @@ -5979,15 +5846,17 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css,  	char *endp;  	int ret; -	efd = simple_strtoul(buffer, &endp, 10); +	buf = strstrip(buf); + +	efd = simple_strtoul(buf, &endp, 10);  	if (*endp != ' ')  		return -EINVAL; -	buffer = endp + 1; +	buf = endp + 1; -	cfd = simple_strtoul(buffer, &endp, 10); +	cfd = simple_strtoul(buf, &endp, 10);  	if ((*endp != ' ') && (*endp != '\0'))  		return -EINVAL; -	buffer = endp + 1; +	buf = endp + 1;  	event = kzalloc(sizeof(*event), GFP_KERNEL);  	if (!event) @@ -6055,8 +5924,8 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css,  	 * automatically removed on cgroup destruction but the removal is  	 * asynchronous, so take an extra ref on @css.  	 
*/ -	cfile_css = css_tryget_from_dir(cfile.file->f_dentry->d_parent, -					&memory_cgrp_subsys); +	cfile_css = css_tryget_online_from_dir(cfile.file->f_dentry->d_parent, +					       &memory_cgrp_subsys);  	ret = -EINVAL;  	if (IS_ERR(cfile_css))  		goto out_put_cfile; @@ -6065,7 +5934,7 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css,  		goto out_put_cfile;  	} -	ret = event->register_event(memcg, event->eventfd, buffer); +	ret = event->register_event(memcg, event->eventfd, buf);  	if (ret)  		goto out_put_css; @@ -6078,7 +5947,7 @@ static int memcg_write_event_control(struct cgroup_subsys_state *css,  	fdput(cfile);  	fdput(efile); -	return 0; +	return nbytes;  out_put_css:  	css_put(css); @@ -6103,25 +5972,25 @@ static struct cftype mem_cgroup_files[] = {  	{  		.name = "max_usage_in_bytes",  		.private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), -		.trigger = mem_cgroup_reset, +		.write = mem_cgroup_reset,  		.read_u64 = mem_cgroup_read_u64,  	},  	{  		.name = "limit_in_bytes",  		.private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), -		.write_string = mem_cgroup_write, +		.write = mem_cgroup_write,  		.read_u64 = mem_cgroup_read_u64,  	},  	{  		.name = "soft_limit_in_bytes",  		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), -		.write_string = mem_cgroup_write, +		.write = mem_cgroup_write,  		.read_u64 = mem_cgroup_read_u64,  	},  	{  		.name = "failcnt",  		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), -		.trigger = mem_cgroup_reset, +		.write = mem_cgroup_reset,  		.read_u64 = mem_cgroup_read_u64,  	},  	{ @@ -6130,7 +5999,7 @@ static struct cftype mem_cgroup_files[] = {  	},  	{  		.name = "force_empty", -		.trigger = mem_cgroup_force_empty_write, +		.write = mem_cgroup_force_empty_write,  	},  	{  		.name = "use_hierarchy", @@ -6140,7 +6009,7 @@ static struct cftype mem_cgroup_files[] = {  	},  	{  		.name = "cgroup.event_control",		/* XXX: for compat */ -		.write_string = memcg_write_event_control, +		.write = memcg_write_event_control,  		.flags = CFTYPE_NO_PREFIX,  		.mode = S_IWUGO,  	}, @@ -6173,7 +6042,7 @@ static struct cftype mem_cgroup_files[] = {  	{  		.name = "kmem.limit_in_bytes",  		.private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), -		.write_string = mem_cgroup_write, +		.write = mem_cgroup_write,  		.read_u64 = mem_cgroup_read_u64,  	},  	{ @@ -6184,13 +6053,13 @@ static struct cftype mem_cgroup_files[] = {  	{  		.name = "kmem.failcnt",  		.private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), -		.trigger = mem_cgroup_reset, +		.write = mem_cgroup_reset,  		.read_u64 = mem_cgroup_read_u64,  	},  	{  		.name = "kmem.max_usage_in_bytes",  		.private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), -		.trigger = mem_cgroup_reset, +		.write = mem_cgroup_reset,  		.read_u64 = mem_cgroup_read_u64,  	},  #ifdef CONFIG_SLABINFO @@ -6213,19 +6082,19 @@ static struct cftype memsw_cgroup_files[] = {  	{  		.name = "memsw.max_usage_in_bytes",  		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), -		.trigger = mem_cgroup_reset, +		.write = mem_cgroup_reset,  		.read_u64 = mem_cgroup_read_u64,  	},  	{  		.name = "memsw.limit_in_bytes",  		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), -		.write_string = mem_cgroup_write, +		.write = mem_cgroup_write,  		.read_u64 = mem_cgroup_read_u64,  	},  	{  		.name = "memsw.failcnt",  		.private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), -		.trigger = mem_cgroup_reset, +		.write = mem_cgroup_reset,  		.read_u64 = mem_cgroup_read_u64,  	},  	{ },	/* terminate */ @@ -6403,9 +6272,9 @@ static int  mem_cgroup_css_online(struct 
cgroup_subsys_state *css)  {  	struct mem_cgroup *memcg = mem_cgroup_from_css(css); -	struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); +	struct mem_cgroup *parent = mem_cgroup_from_css(css->parent); -	if (css->cgroup->id > MEM_CGROUP_ID_MAX) +	if (css->id > MEM_CGROUP_ID_MAX)  		return -ENOSPC;  	if (!parent) @@ -6490,7 +6359,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)  	css_for_each_descendant_post(iter, css)  		mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); -	mem_cgroup_destroy_all_caches(memcg); +	memcg_unregister_all_caches(memcg);  	vmpressure_cleanup(&memcg->vmpressure);  } @@ -6500,7 +6369,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)  	/*  	 * XXX: css_offline() would be where we should reparent all  	 * memory to prepare the cgroup for destruction.  However, -	 * memcg does not do css_tryget() and res_counter charging +	 * memcg does not do css_tryget_online() and res_counter charging  	 * under the same RCU lock region, which means that charging  	 * could race with offlining.  Offlining only happens to  	 * cgroups with no tasks in them but charges can show up @@ -6514,9 +6383,9 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)  	 *                           lookup_swap_cgroup_id()  	 *                           rcu_read_lock()  	 *                           mem_cgroup_lookup() -	 *                           css_tryget() +	 *                           css_tryget_online()  	 *                           rcu_read_unlock() -	 * disable css_tryget() +	 * disable css_tryget_online()  	 * call_rcu()  	 *   offline_css()  	 *     reparent_charges() @@ -6686,16 +6555,20 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,  		pgoff = pte_to_pgoff(ptent);  	/* page is moved even if it's not RSS of this task(page-faulted). */ -	page = find_get_page(mapping, pgoff); -  #ifdef CONFIG_SWAP  	/* shmem/tmpfs may report page out on swap: account for that too. */ -	if (radix_tree_exceptional_entry(page)) { -		swp_entry_t swap = radix_to_swp_entry(page); -		if (do_swap_account) -			*entry = swap; -		page = find_get_page(swap_address_space(swap), swap.val); -	} +	if (shmem_mapping(mapping)) { +		page = find_get_entry(mapping, pgoff); +		if (radix_tree_exceptional_entry(page)) { +			swp_entry_t swp = radix_to_swp_entry(page); +			if (do_swap_account) +				*entry = swp; +			page = find_get_page(swap_address_space(swp), swp.val); +		} +	} else +		page = find_get_page(mapping, pgoff); +#else +	page = find_get_page(mapping, pgoff);  #endif  	return page;  } | 
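
The central change in the kmem hunks above is that the per-memcg slab_caches_mutex and the deferred kmem_cache destruction work item are replaced by a single global memcg_slab_mutex, which serializes both the root cache's memcg_caches[] index and each memcg's memcg_slab_caches list, so caches can now be registered and unregistered synchronously. Below is a minimal userspace sketch of that registration/unregistration pattern, not kernel code: the types and names (struct root_cache, registry_lock, MAX_MEMCGS, and so on) are illustrative stand-ins for kmem_cache, memcg_slab_mutex and the real memcg structures, and the error handling is reduced to the bare minimum.

/*
 * Userspace model of the scheme introduced by this patch: one global
 * mutex (registry_lock, standing in for memcg_slab_mutex) protects both
 * the per-root index array (memcg_caches[id]) and the per-memcg cache
 * list, so no per-memcg mutex or deferred destroy worker is needed.
 * Compile with: cc -pthread sketch.c
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_MEMCGS 16

struct cache {
	char name[64];
	struct cache *next;                      /* links into owner->caches */
};

struct root_cache {
	char name[64];
	struct cache *memcg_caches[MAX_MEMCGS];  /* indexed by memcg id */
};

struct memcg {
	int id;
	struct cache *caches;                    /* analogous to memcg_slab_caches */
};

static pthread_mutex_t registry_lock = PTHREAD_MUTEX_INITIALIZER;

/* Mirrors memcg_register_cache(): bail out if another thread won the race. */
static void register_cache(struct memcg *memcg, struct root_cache *root)
{
	struct cache *c;

	pthread_mutex_lock(&registry_lock);
	if (root->memcg_caches[memcg->id])       /* already created */
		goto out;

	c = calloc(1, sizeof(*c));
	if (!c)                                  /* not critical, root cache still works */
		goto out;
	snprintf(c->name, sizeof(c->name), "%s(%d)", root->name, memcg->id);

	c->next = memcg->caches;                 /* add to the memcg's list ... */
	memcg->caches = c;
	root->memcg_caches[memcg->id] = c;       /* ... then publish in the index */
out:
	pthread_mutex_unlock(&registry_lock);
}

/* Mirrors memcg_unregister_cache(): clear the index, unlink, destroy. */
static void unregister_cache(struct memcg *memcg, struct root_cache *root)
{
	struct cache **pp, *c;

	pthread_mutex_lock(&registry_lock);
	c = root->memcg_caches[memcg->id];
	if (c) {
		root->memcg_caches[memcg->id] = NULL;
		for (pp = &memcg->caches; *pp; pp = &(*pp)->next)
			if (*pp == c) {
				*pp = c->next;
				break;
			}
		free(c);
	}
	pthread_mutex_unlock(&registry_lock);
}

int main(void)
{
	struct root_cache root = { .name = "dentry" };
	struct memcg cg = { .id = 3 };

	register_cache(&cg, &root);
	register_cache(&cg, &root);              /* second call is a no-op */
	if (root.memcg_caches[cg.id])
		printf("cache for id %d: %s\n", cg.id, root.memcg_caches[cg.id]->name);
	unregister_cache(&cg, &root);
	return 0;
}

Note that in the kernel the creation itself is still deferred to a workqueue (memcg_schedule_register_cache) because the allocation path cannot sleep or take the mutex; the global lock only serializes the registry updates once the worker runs.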
