From d9c10ddddc98db0a316243cd266c466875975a94 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 28 Mar 2013 08:48:14 +0100 Subject: memcg: fix memcg_cache_name() to use cgroup_name() As cgroup supports rename, it's unsafe to dereference dentry->d_name without proper vfs locks. Fix this by using cgroup_name() rather than dentry directly. Also open code memcg_cache_name because it is called only from kmem_cache_dup which frees the returned name right after kmem_cache_create_memcg makes a copy of it. Such a short-lived allocation doesn't make too much sense. So replace it by a static buffer as kmem_cache_dup is called with memcg_cache_mutex. Signed-off-by: Li Zefan Signed-off-by: Michal Hocko Acked-by: Glauber Costa Signed-off-by: Tejun Heo --- mm/memcontrol.c | 63 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 31 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 53b8201b31eb..9715c0c491b0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3214,52 +3214,53 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep) schedule_work(&cachep->memcg_params->destroy); } -static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s) -{ - char *name; - struct dentry *dentry; - - rcu_read_lock(); - dentry = rcu_dereference(memcg->css.cgroup->dentry); - rcu_read_unlock(); - - BUG_ON(dentry == NULL); - - name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name, - memcg_cache_id(memcg), dentry->d_name.name); - - return name; -} +/* + * This lock protects updaters, not readers. We want readers to be as fast as + * they can, and they will either see NULL or a valid cache value. Our model + * allow them to see NULL, in which case the root memcg will be selected. + * + * We need this lock because multiple allocations to the same cache from a non + * will span more than one worker. Only one of them can create the cache. + */ +static DEFINE_MUTEX(memcg_cache_mutex); +/* + * Called with memcg_cache_mutex held + */ static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, struct kmem_cache *s) { - char *name; struct kmem_cache *new; + static char *tmp_name = NULL; - name = memcg_cache_name(memcg, s); - if (!name) - return NULL; + lockdep_assert_held(&memcg_cache_mutex); + + /* + * kmem_cache_create_memcg duplicates the given name and + * cgroup_name for this name requires RCU context. + * This static temporary buffer is used to prevent from + * pointless shortliving allocation. + */ + if (!tmp_name) { + tmp_name = kmalloc(PATH_MAX, GFP_KERNEL); + if (!tmp_name) + return NULL; + } + + rcu_read_lock(); + snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name, + memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup)); + rcu_read_unlock(); - new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align, + new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align, (s->flags & ~SLAB_PANIC), s->ctor, s); if (new) new->allocflags |= __GFP_KMEMCG; - kfree(name); return new; } -/* - * This lock protects updaters, not readers. We want readers to be as fast as - * they can, and they will either see NULL or a valid cache value. Our model - * allow them to see NULL, in which case the root memcg will be selected. - * - * We need this lock because multiple allocations to the same cache from a non - * will span more than one worker. Only one of them can create the cache. - */ -static DEFINE_MUTEX(memcg_cache_mutex); static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, struct kmem_cache *cachep) { -- cgit v1.2.3 From f00baae7ad6c5f1503528efa852f0be8e9513f0e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 15 Apr 2013 13:41:15 -0700 Subject: memcg: force use_hierarchy if sane_behavior Turn on use_hierarchy by default if sane_behavior is specified and don't create .use_hierarchy file. It is debatable whether to remove .use_hierarchy file or make it ro as the former could make transition easier in certain cases; however, the behavior changes which will be gated by sane_behavior are intensive including changing basic meaning of certain control knobs in a few controllers and I don't really think keeping this piece would make things easier in any noticeable way, so let's remove it. v2: Explain that mem_cgroup_bind() doesn't have to worry about children as suggested by Michal Hocko. Signed-off-by: Tejun Heo Acked-by: Serge E. Hallyn Acked-by: Li Zefan Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki --- include/linux/cgroup.h | 3 +++ mm/memcontrol.c | 17 +++++++++++++++++ 2 files changed, 20 insertions(+) (limited to 'mm/memcontrol.c') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 64047ae7fde1..cda7eb2239e1 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -268,6 +268,9 @@ enum { * * - Remount is disallowed. * + * - memcg: use_hierarchy is on by default and the cgroup file for + * the flag is not created. + * * The followings are planned changes. * * - release_agent will be disallowed once replacement notification diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9715c0c491b0..df87bdd4d692 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5814,6 +5814,7 @@ static struct cftype mem_cgroup_files[] = { }, { .name = "use_hierarchy", + .flags = CFTYPE_INSANE, .write_u64 = mem_cgroup_hierarchy_write, .read_u64 = mem_cgroup_hierarchy_read, }, @@ -6784,6 +6785,21 @@ static void mem_cgroup_move_task(struct cgroup *cont, } #endif +/* + * Cgroup retains root cgroups across [un]mount cycles making it necessary + * to verify sane_behavior flag on each mount attempt. + */ +static void mem_cgroup_bind(struct cgroup *root) +{ + /* + * use_hierarchy is forced with sane_behavior. cgroup core + * guarantees that @root doesn't have any children, so turning it + * on for the root memcg is enough. + */ + if (cgroup_sane_behavior(root)) + mem_cgroup_from_cont(root)->use_hierarchy = true; +} + struct cgroup_subsys mem_cgroup_subsys = { .name = "memory", .subsys_id = mem_cgroup_subsys_id, @@ -6794,6 +6810,7 @@ struct cgroup_subsys mem_cgroup_subsys = { .can_attach = mem_cgroup_can_attach, .cancel_attach = mem_cgroup_cancel_attach, .attach = mem_cgroup_move_task, + .bind = mem_cgroup_bind, .base_cftypes = mem_cgroup_files, .early_init = 0, .use_id = 1, -- cgit v1.2.3 From c40046f3ad5e877b18cc721aaa7906b98077bc2d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 29 Apr 2013 15:07:14 -0700 Subject: memcg: keep prev's css alive for the whole mem_cgroup_iter The patchset tries to make mem_cgroup_iter saner in the way how it walks hierarchies. css->id based traversal is far from being ideal as it is not deterministic because it depends on the creation ordering. Additional to that css_id is considered a burden for cgroup maintainers because it is quite some code and memcg is the last user of it. After this series only the swap accounting uses css_id but that one will follow up later. Diffstat (if we exclude removed/added comments) looks quite promising. We got rid of some code: $ git diff mmotm... | grep -v "^[+-][[:space:]]*[/ ]\*" | diffstat b/include/linux/cgroup.h | 3 --- kernel/cgroup.c | 33 --------------------------------- mm/memcontrol.c | 4 +++- 3 files changed, 3 insertions(+), 37 deletions(-) The first patch is just preparatory and it changes when we release css of the previously returned memcg. Nothing controlversial. The second patch is the core of the patchset and it replaces css_get_next based on css_id by the generic cgroup pre-order. This brings some chalanges for the last visited group caching during the reclaim (mem_cgroup_per_zone::reclaim_iter). We have to use memcg pointers directly now which means that we have to keep a reference to those groups' css to keep them alive. I also folded iter_lock introduced by https://lkml.org/lkml/2013/1/3/295 in the previous version into this patch. Johannes felt the race I was describing should be mostly harmless and I haven't been able to trigger it so the lock doesn't deserve its own patch. It is still needed temporarily, though, because the reference counting on iter->last_visited depends on it. It will go away with the next patch. The next patch fixups an unbounded cgroup removal holdoff caused by the elevated css refcount. The issue has been observed by Ying Han. Johannes wasn't impressed by the previous version of the fix (https://lkml.org/lkml/2013/2/8/379) which cleaned up pending references during mem_cgroup_css_offline when a group is removed. He has suggested a different way when the iterator checks whether a cached memcg is still valid or no. More on that in the patch but the basic idea is that every memcg tracks the number removed subgroups and iterator records this number when a group is cached. These numbers are checked before iter->last_visited is about to be used and the iteration is restarted if it is invalid. The fourth and fifth patches are an attempt for simplification of the mem_cgroup_iter. css juggling is removed and the iteration logic is moved to a helper so that the reference counting and iteration are separated. The last patch just removes css_get_next as there is no user for it any longer. My testing looked as follows: A (use_hierarchy=1, limit_in_bytes=150M) /|\ 1 2 3 Children groups were created so that the number is never higher than 3 and their limits were random between 50-100M. Each group hosts a kernel build (starting with tar -xf so the tree is not shared and make -jNUM_CPUs/3) and terminated after random time - up to 5 minutes) and then it is removed. This should exercise both leaf and hierarchical reclaim as well as races with cgroup removals and debugging messages I added on top proved that. 100 groups were created during the test. This patch: css reference counting keeps the cgroup alive even though it has been already removed. mem_cgroup_iter relies on this fact and takes a reference to the returned group. The reference is then released on the next iteration or mem_cgroup_iter_break. mem_cgroup_iter currently releases the reference right after it gets the last css_id. This is correct because neither prev's memcg nor cgroup are accessed after then. This will change in the next patch so we need to hold the group alive a bit longer so let's move the css_put at the end of the function. Signed-off-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Johannes Weiner Cc: Li Zefan Cc: Ying Han Cc: Tejun Heo Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2b552224f5cf..661a2c679f64 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1100,12 +1100,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (prev && !reclaim) id = css_id(&prev->css); - if (prev && prev != root) - css_put(&prev->css); - if (!root->use_hierarchy && root != root_mem_cgroup) { if (prev) - return NULL; + goto out_css_put; return root; } @@ -1121,7 +1118,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, mz = mem_cgroup_zoneinfo(root, nid, zid); iter = &mz->reclaim_iter[reclaim->priority]; if (prev && reclaim->generation != iter->generation) - return NULL; + goto out_css_put; id = iter->position; } @@ -1143,8 +1140,12 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, } if (prev && !css) - return NULL; + goto out_css_put; } +out_css_put: + if (prev && prev != root) + css_put(&prev->css); + return memcg; } -- cgit v1.2.3 From 542f85f9ae4acd17dca06f4390f54d411a387efd Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 29 Apr 2013 15:07:15 -0700 Subject: memcg: rework mem_cgroup_iter to use cgroup iterators mem_cgroup_iter curently relies on css->id when walking down a group hierarchy tree. This is really awkward because the tree walk depends on the groups creation ordering. The only guarantee is that a parent node is visited before its children. Example: 1) mkdir -p a a/d a/b/c 2) mkdir -a a/b/c a/d Will create the same trees but the tree walks will be different: 1) a, d, b, c 2) a, b, c, d Commit 574bd9f7c7c1 ("cgroup: implement generic child / descendant walk macros") has introduced generic cgroup tree walkers which provide either pre-order or post-order tree walk. This patch converts css->id based iteration to pre-order tree walk to keep the semantic with the original iterator where parent is always visited before its subtree. cgroup_for_each_descendant_pre suggests using post_create and pre_destroy for proper synchronization with groups addidition resp. removal. This implementation doesn't use those because a new memory cgroup is initialized sufficiently for iteration in mem_cgroup_css_alloc already and css reference counting enforces that the group is alive for both the last seen cgroup and the found one resp. it signals that the group is dead and it should be skipped. If the reclaim cookie is used we need to store the last visited group into the iterator so we have to be careful that it doesn't disappear in the mean time. Elevated reference count on the css keeps it alive even though the group have been removed (parked waiting for the last dput so that it can be freed). Per node-zone-prio iter_lock has been introduced to ensure that css_tryget and iter->last_visited is set atomically. Otherwise two racing walkers could both take a references and only one release it leading to a css leak (which pins cgroup dentry). Signed-off-by: Michal Hocko Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Li Zefan Cc: Ying Han Cc: Tejun Heo Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 86 +++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 68 insertions(+), 18 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 661a2c679f64..26a38b7c7739 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -152,10 +152,12 @@ struct mem_cgroup_stat_cpu { }; struct mem_cgroup_reclaim_iter { - /* css_id of the last scanned hierarchy member */ - int position; + /* last scanned hierarchy member with elevated css ref count */ + struct mem_cgroup *last_visited; /* scan generation, increased every round-trip */ unsigned int generation; + /* lock to protect the position and generation */ + spinlock_t iter_lock; }; /* @@ -1089,7 +1091,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup_reclaim_cookie *reclaim) { struct mem_cgroup *memcg = NULL; - int id = 0; + struct mem_cgroup *last_visited = NULL; if (mem_cgroup_disabled()) return NULL; @@ -1098,7 +1100,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, root = root_mem_cgroup; if (prev && !reclaim) - id = css_id(&prev->css); + last_visited = prev; if (!root->use_hierarchy && root != root_mem_cgroup) { if (prev) @@ -1106,9 +1108,10 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, return root; } + rcu_read_lock(); while (!memcg) { struct mem_cgroup_reclaim_iter *uninitialized_var(iter); - struct cgroup_subsys_state *css; + struct cgroup_subsys_state *css = NULL; if (reclaim) { int nid = zone_to_nid(reclaim->zone); @@ -1117,31 +1120,74 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, mz = mem_cgroup_zoneinfo(root, nid, zid); iter = &mz->reclaim_iter[reclaim->priority]; - if (prev && reclaim->generation != iter->generation) - goto out_css_put; - id = iter->position; + spin_lock(&iter->iter_lock); + last_visited = iter->last_visited; + if (prev && reclaim->generation != iter->generation) { + if (last_visited) { + css_put(&last_visited->css); + iter->last_visited = NULL; + } + spin_unlock(&iter->iter_lock); + goto out_unlock; + } } - rcu_read_lock(); - css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); - if (css) { - if (css == &root->css || css_tryget(css)) - memcg = mem_cgroup_from_css(css); - } else - id = 0; - rcu_read_unlock(); + /* + * Root is not visited by cgroup iterators so it needs an + * explicit visit. + */ + if (!last_visited) { + css = &root->css; + } else { + struct cgroup *prev_cgroup, *next_cgroup; + + prev_cgroup = (last_visited == root) ? NULL + : last_visited->css.cgroup; + next_cgroup = cgroup_next_descendant_pre(prev_cgroup, + root->css.cgroup); + if (next_cgroup) + css = cgroup_subsys_state(next_cgroup, + mem_cgroup_subsys_id); + } + + /* + * Even if we found a group we have to make sure it is alive. + * css && !memcg means that the groups should be skipped and + * we should continue the tree walk. + * last_visited css is safe to use because it is protected by + * css_get and the tree walk is rcu safe. + */ + if (css == &root->css || (css && css_tryget(css))) + memcg = mem_cgroup_from_css(css); if (reclaim) { - iter->position = id; + struct mem_cgroup *curr = memcg; + + if (last_visited) + css_put(&last_visited->css); + + if (css && !memcg) + curr = mem_cgroup_from_css(css); + + /* make sure that the cached memcg is not removed */ + if (curr) + css_get(&curr->css); + iter->last_visited = curr; + if (!css) iter->generation++; else if (!prev && memcg) reclaim->generation = iter->generation; + spin_unlock(&iter->iter_lock); + } else if (css && !memcg) { + last_visited = mem_cgroup_from_css(css); } if (prev && !css) - goto out_css_put; + goto out_unlock; } +out_unlock: + rcu_read_unlock(); out_css_put: if (prev && prev != root) css_put(&prev->css); @@ -5929,8 +5975,12 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) return 1; for (zone = 0; zone < MAX_NR_ZONES; zone++) { + int prio; + mz = &pn->zoneinfo[zone]; lruvec_init(&mz->lruvec); + for (prio = 0; prio < DEF_PRIORITY + 1; prio++) + spin_lock_init(&mz->reclaim_iter[prio].iter_lock); mz->usage_in_excess = 0; mz->on_tree = false; mz->memcg = memcg; -- cgit v1.2.3 From 5f57816197186378449b848e99ca9cf41a0055f1 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 29 Apr 2013 15:07:17 -0700 Subject: memcg: relax memcg iter caching Now that the per-node-zone-priority iterator caches memory cgroups rather than their css ids we have to be careful and remove them from the iterator when they are on the way out otherwise they might live for unbounded amount of time even though their group is already gone (until the global/targeted reclaim triggers the zone under priority to find out the group is dead and let it to find the final rest). We can fix this issue by relaxing rules for the last_visited memcg. Instead of taking a reference to the css before it is stored into iter->last_visited we can just store its pointer and track the number of removed groups from each memcg's subhierarchy. This number would be stored into iterator everytime when a memcg is cached. If the iter count doesn't match the curent walker root's one we will start from the root again. The group counter is incremented upwards the hierarchy every time a group is removed. The iter_lock can be dropped because racing iterators cannot leak the reference anymore as the reference count is not elevated for last_visited when it is cached. Locking rules got a bit complicated by this change though. The iterator primarily relies on rcu read lock which makes sure that once we see a valid last_visited pointer then it will be valid for the whole RCU walk. smp_rmb makes sure that dead_count is read before last_visited and last_dead_count while smp_wmb makes sure that last_visited is updated before last_dead_count so the up-to-date last_dead_count cannot point to an outdated last_visited. css_tryget then makes sure that the last_visited is still alive in case the iteration races with the cached group removal (css is invalidated before mem_cgroup_css_offline increments dead_count). In short: mem_cgroup_iter rcu_read_lock() dead_count = atomic_read(parent->dead_count) smp_rmb() if (dead_count != iter->last_dead_count) last_visited POSSIBLY INVALID -> last_visited = NULL if (!css_tryget(iter->last_visited)) last_visited DEAD -> last_visited = NULL next = find_next(last_visited) css_tryget(next) css_put(last_visited) // css would be invalidated and parent->dead_count // incremented if this was the last reference iter->last_visited = next smp_wmb() iter->last_dead_count = dead_count rcu_read_unlock() cgroup_rmdir cgroup_destroy_locked atomic_add(CSS_DEACT_BIAS, &css->refcnt) // subsequent css_tryget fail mem_cgroup_css_offline mem_cgroup_invalidate_reclaim_iterators while(parent = parent_mem_cgroup) atomic_inc(parent->dead_count) css_put(css) // last reference held by cgroup core Spotted by Ying Han. Original idea from Johannes Weiner. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Michal Hocko Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: Ying Han Cc: Li Zefan Cc: Tejun Heo Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 69 +++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 52 insertions(+), 17 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 26a38b7c7739..408a5c75d77d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -152,12 +152,15 @@ struct mem_cgroup_stat_cpu { }; struct mem_cgroup_reclaim_iter { - /* last scanned hierarchy member with elevated css ref count */ + /* + * last scanned hierarchy member. Valid only if last_dead_count + * matches memcg->dead_count of the hierarchy root group. + */ struct mem_cgroup *last_visited; + unsigned long last_dead_count; + /* scan generation, increased every round-trip */ unsigned int generation; - /* lock to protect the position and generation */ - spinlock_t iter_lock; }; /* @@ -337,6 +340,7 @@ struct mem_cgroup { struct mem_cgroup_stat_cpu nocpu_base; spinlock_t pcp_counter_lock; + atomic_t dead_count; #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) struct tcp_memcontrol tcp_mem; #endif @@ -1092,6 +1096,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, { struct mem_cgroup *memcg = NULL; struct mem_cgroup *last_visited = NULL; + unsigned long uninitialized_var(dead_count); if (mem_cgroup_disabled()) return NULL; @@ -1120,16 +1125,33 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, mz = mem_cgroup_zoneinfo(root, nid, zid); iter = &mz->reclaim_iter[reclaim->priority]; - spin_lock(&iter->iter_lock); last_visited = iter->last_visited; if (prev && reclaim->generation != iter->generation) { - if (last_visited) { - css_put(&last_visited->css); - iter->last_visited = NULL; - } - spin_unlock(&iter->iter_lock); + iter->last_visited = NULL; goto out_unlock; } + + /* + * If the dead_count mismatches, a destruction + * has happened or is happening concurrently. + * If the dead_count matches, a destruction + * might still happen concurrently, but since + * we checked under RCU, that destruction + * won't free the object until we release the + * RCU reader lock. Thus, the dead_count + * check verifies the pointer is still valid, + * css_tryget() verifies the cgroup pointed to + * is alive. + */ + dead_count = atomic_read(&root->dead_count); + smp_rmb(); + last_visited = iter->last_visited; + if (last_visited) { + if ((dead_count != iter->last_dead_count) || + !css_tryget(&last_visited->css)) { + last_visited = NULL; + } + } } /* @@ -1169,16 +1191,14 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (css && !memcg) curr = mem_cgroup_from_css(css); - /* make sure that the cached memcg is not removed */ - if (curr) - css_get(&curr->css); iter->last_visited = curr; + smp_wmb(); + iter->last_dead_count = dead_count; if (!css) iter->generation++; else if (!prev && memcg) reclaim->generation = iter->generation; - spin_unlock(&iter->iter_lock); } else if (css && !memcg) { last_visited = mem_cgroup_from_css(css); } @@ -5975,12 +5995,8 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) return 1; for (zone = 0; zone < MAX_NR_ZONES; zone++) { - int prio; - mz = &pn->zoneinfo[zone]; lruvec_init(&mz->lruvec); - for (prio = 0; prio < DEF_PRIORITY + 1; prio++) - spin_lock_init(&mz->reclaim_iter[prio].iter_lock); mz->usage_in_excess = 0; mz->on_tree = false; mz->memcg = memcg; @@ -6235,10 +6251,29 @@ mem_cgroup_css_online(struct cgroup *cont) return error; } +/* + * Announce all parents that a group from their hierarchy is gone. + */ +static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) +{ + struct mem_cgroup *parent = memcg; + + while ((parent = parent_mem_cgroup(parent))) + atomic_inc(&parent->dead_count); + + /* + * if the root memcg is not hierarchical we have to check it + * explicitely. + */ + if (!root_mem_cgroup->use_hierarchy) + atomic_inc(&root_mem_cgroup->dead_count); +} + static void mem_cgroup_css_offline(struct cgroup *cont) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + mem_cgroup_invalidate_reclaim_iterators(memcg); mem_cgroup_reparent_charges(memcg); mem_cgroup_destroy_all_caches(memcg); } -- cgit v1.2.3 From 19f39402864ea3935b36ab1066c6283d6ea53f44 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 29 Apr 2013 15:07:18 -0700 Subject: memcg: simplify mem_cgroup_iter The current implementation of mem_cgroup_iter has to consider both css and memcg to find out whether no group has been found (css==NULL - aka the loop is completed) and that no memcg is associated with the found node (!memcg - aka css_tryget failed because the group is no longer alive). This leads to awkward tweaks like tests for css && !memcg to skip the current node. It will be much easier if we got rid off css variable altogether and only rely on memcg. In order to do that the iteration part has to skip dead nodes. This sounds natural to me and as a nice side effect we will get a simple invariant that memcg is always alive when non-NULL and all nodes have been visited otherwise. We could get rid of the surrounding while loop but keep it in for now to make review easier. It will go away in the following patch. Signed-off-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Johannes Weiner Cc: Li Zefan Cc: Ying Han Cc: Tejun Heo Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 52 +++++++++++++++++++++++++--------------------------- 1 file changed, 25 insertions(+), 27 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 408a5c75d77d..614d0d9c0deb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1116,7 +1116,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, rcu_read_lock(); while (!memcg) { struct mem_cgroup_reclaim_iter *uninitialized_var(iter); - struct cgroup_subsys_state *css = NULL; if (reclaim) { int nid = zone_to_nid(reclaim->zone); @@ -1159,51 +1158,50 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, * explicit visit. */ if (!last_visited) { - css = &root->css; + memcg = root; } else { struct cgroup *prev_cgroup, *next_cgroup; prev_cgroup = (last_visited == root) ? NULL : last_visited->css.cgroup; - next_cgroup = cgroup_next_descendant_pre(prev_cgroup, - root->css.cgroup); - if (next_cgroup) - css = cgroup_subsys_state(next_cgroup, - mem_cgroup_subsys_id); - } +skip_node: + next_cgroup = cgroup_next_descendant_pre( + prev_cgroup, root->css.cgroup); - /* - * Even if we found a group we have to make sure it is alive. - * css && !memcg means that the groups should be skipped and - * we should continue the tree walk. - * last_visited css is safe to use because it is protected by - * css_get and the tree walk is rcu safe. - */ - if (css == &root->css || (css && css_tryget(css))) - memcg = mem_cgroup_from_css(css); + /* + * Even if we found a group we have to make sure it is + * alive. css && !memcg means that the groups should be + * skipped and we should continue the tree walk. + * last_visited css is safe to use because it is + * protected by css_get and the tree walk is rcu safe. + */ + if (next_cgroup) { + struct mem_cgroup *mem = mem_cgroup_from_cont( + next_cgroup); + if (css_tryget(&mem->css)) + memcg = mem; + else { + prev_cgroup = next_cgroup; + goto skip_node; + } + } + } if (reclaim) { - struct mem_cgroup *curr = memcg; - if (last_visited) css_put(&last_visited->css); - if (css && !memcg) - curr = mem_cgroup_from_css(css); - - iter->last_visited = curr; + iter->last_visited = memcg; smp_wmb(); iter->last_dead_count = dead_count; - if (!css) + if (!memcg) iter->generation++; else if (!prev && memcg) reclaim->generation = iter->generation; - } else if (css && !memcg) { - last_visited = mem_cgroup_from_css(css); } - if (prev && !css) + if (prev && !memcg) goto out_unlock; } out_unlock: -- cgit v1.2.3 From 16248d8fe65ea116cc30b915e05a1c29496da120 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 29 Apr 2013 15:07:19 -0700 Subject: memcg: further simplify mem_cgroup_iter mem_cgroup_iter basically does two things currently. It takes care of the house keeping (reference counting, raclaim cookie) and it iterates through a hierarchy tree (by using cgroup generic tree walk). The code would be much more easier to follow if we move the iteration outside of the function (to __mem_cgrou_iter_next) so the distinction is more clear. This patch doesn't introduce any functional changes. Signed-off-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Johannes Weiner Cc: Li Zefan Cc: Ying Han Cc: Tejun Heo Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 79 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 46 insertions(+), 33 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 614d0d9c0deb..2bdac3ececd0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1073,6 +1073,51 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) return memcg; } +/* + * Returns a next (in a pre-order walk) alive memcg (with elevated css + * ref. count) or NULL if the whole root's subtree has been visited. + * + * helper function to be used by mem_cgroup_iter + */ +static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, + struct mem_cgroup *last_visited) +{ + struct cgroup *prev_cgroup, *next_cgroup; + + /* + * Root is not visited by cgroup iterators so it needs an + * explicit visit. + */ + if (!last_visited) + return root; + + prev_cgroup = (last_visited == root) ? NULL + : last_visited->css.cgroup; +skip_node: + next_cgroup = cgroup_next_descendant_pre( + prev_cgroup, root->css.cgroup); + + /* + * Even if we found a group we have to make sure it is + * alive. css && !memcg means that the groups should be + * skipped and we should continue the tree walk. + * last_visited css is safe to use because it is + * protected by css_get and the tree walk is rcu safe. + */ + if (next_cgroup) { + struct mem_cgroup *mem = mem_cgroup_from_cont( + next_cgroup); + if (css_tryget(&mem->css)) + return mem; + else { + prev_cgroup = next_cgroup; + goto skip_node; + } + } + + return NULL; +} + /** * mem_cgroup_iter - iterate over memory cgroup hierarchy * @root: hierarchy root @@ -1153,39 +1198,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, } } - /* - * Root is not visited by cgroup iterators so it needs an - * explicit visit. - */ - if (!last_visited) { - memcg = root; - } else { - struct cgroup *prev_cgroup, *next_cgroup; - - prev_cgroup = (last_visited == root) ? NULL - : last_visited->css.cgroup; -skip_node: - next_cgroup = cgroup_next_descendant_pre( - prev_cgroup, root->css.cgroup); - - /* - * Even if we found a group we have to make sure it is - * alive. css && !memcg means that the groups should be - * skipped and we should continue the tree walk. - * last_visited css is safe to use because it is - * protected by css_get and the tree walk is rcu safe. - */ - if (next_cgroup) { - struct mem_cgroup *mem = mem_cgroup_from_cont( - next_cgroup); - if (css_tryget(&mem->css)) - memcg = mem; - else { - prev_cgroup = next_cgroup; - goto skip_node; - } - } - } + memcg = __mem_cgroup_iter_next(root, last_visited); if (reclaim) { if (last_visited) -- cgit v1.2.3 From acb6d558f4c8fbacc8e774c9d7737220a3777882 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 29 Apr 2013 15:07:43 -0700 Subject: memcg: do not check for do_swap_account in mem_cgroup_{read,write,reset} Since commit 2d11085e404f ("memcg: do not create memsw files if swap accounting is disabled") memsw files are created only if memcg swap accounting is enabled so it doesn't make any sense to check for it explicitly in mem_cgroup_read(), mem_cgroup_write() and mem_cgroup_reset(). Signed-off-by: Michal Hocko Cc: Kamezawa Hiroyuki Cc: Tejun Heo Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2bdac3ececd0..d280016039f2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5025,9 +5025,6 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, type = MEMFILE_TYPE(cft->private); name = MEMFILE_ATTR(cft->private); - if (!do_swap_account && type == _MEMSWAP) - return -EOPNOTSUPP; - switch (type) { case _MEM: if (name == RES_USAGE) @@ -5162,9 +5159,6 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, type = MEMFILE_TYPE(cft->private); name = MEMFILE_ATTR(cft->private); - if (!do_swap_account && type == _MEMSWAP) - return -EOPNOTSUPP; - switch (name) { case RES_LIMIT: if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ @@ -5241,9 +5235,6 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) type = MEMFILE_TYPE(event); name = MEMFILE_ATTR(event); - if (!do_swap_account && type == _MEMSWAP) - return -EOPNOTSUPP; - switch (name) { case RES_MAX_USAGE: if (type == _MEM) -- cgit v1.2.3 From 573b400d01b3411c2436e73d8e63fc1186b94535 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 29 Apr 2013 15:08:13 -0700 Subject: mm/memcontrol.c: remove unnecessary ; Just a trivial issue I stumbled on while doing something else... Signed-off-by: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d280016039f2..7e5bc43c2d1f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5813,7 +5813,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) return ret; return mem_cgroup_sockets_init(memcg, ss); -}; +} static void kmem_cgroup_destroy(struct mem_cgroup *memcg) { -- cgit v1.2.3 From 70ddf637eebe47e61fb2be08a59315581b6d2f38 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Mon, 29 Apr 2013 15:08:31 -0700 Subject: memcg: add memory.pressure_level events With this patch userland applications that want to maintain the interactivity/memory allocation cost can use the pressure level notifications. The levels are defined like this: The "low" level means that the system is reclaiming memory for new allocations. Monitoring this reclaiming activity might be useful for maintaining cache level. Upon notification, the program (typically "Activity Manager") might analyze vmstat and act in advance (i.e. prematurely shutdown unimportant services). The "medium" level means that the system is experiencing medium memory pressure, the system might be making swap, paging out active file caches, etc. Upon this event applications may decide to further analyze vmstat/zoneinfo/memcg or internal memory usage statistics and free any resources that can be easily reconstructed or re-read from a disk. The "critical" level means that the system is actively thrashing, it is about to out of memory (OOM) or even the in-kernel OOM killer is on its way to trigger. Applications should do whatever they can to help the system. It might be too late to consult with vmstat or any other statistics, so it's advisable to take an immediate action. The events are propagated upward until the event is handled, i.e. the events are not pass-through. Here is what this means: for example you have three cgroups: A->B->C. Now you set up an event listener on cgroups A, B and C, and suppose group C experiences some pressure. In this situation, only group C will receive the notification, i.e. groups A and B will not receive it. This is done to avoid excessive "broadcasting" of messages, which disturbs the system and which is especially bad if we are low on memory or thrashing. So, organize the cgroups wisely, or propagate the events manually (or, ask us to implement the pass-through events, explaining why would you need them.) Performance wise, the memory pressure notifications feature itself is lightweight and does not require much of bookkeeping, in contrast to the rest of memcg features. Unfortunately, as of current memcg implementation, pages accounting is an inseparable part and cannot be turned off. The good news is that there are some efforts[1] to improve the situation; plus, implementing the same, fully API-compatible[2] interface for CONFIG_MEMCG=n case (e.g. embedded) is also a viable option, so it will not require any changes on the userland side. [1] http://permalink.gmane.org/gmane.linux.kernel.cgroups/6291 [2] http://lkml.org/lkml/2013/2/21/454 [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: fix CONFIG_CGROPUPS=n warnings] Signed-off-by: Anton Vorontsov Acked-by: Kirill A. Shutemov Acked-by: KAMEZAWA Hiroyuki Cc: Tejun Heo Cc: David Rientjes Cc: Pekka Enberg Cc: Mel Gorman Cc: Glauber Costa Cc: Michal Hocko Cc: Luiz Capitulino Cc: Greg Thelen Cc: Leonid Moiseichuk Cc: KOSAKI Motohiro Cc: Minchan Kim Cc: Bartlomiej Zolnierkiewicz Cc: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/memory.txt | 70 +++++++- include/linux/vmpressure.h | 47 +++++ mm/Makefile | 2 +- mm/memcontrol.c | 29 +++ mm/vmpressure.c | 374 +++++++++++++++++++++++++++++++++++++++ mm/vmscan.c | 8 + 6 files changed, 528 insertions(+), 2 deletions(-) create mode 100644 include/linux/vmpressure.h create mode 100644 mm/vmpressure.c (limited to 'mm/memcontrol.c') diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 8b8c28b9864c..f336ede58e62 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -40,6 +40,7 @@ Features: - soft limit - moving (recharging) account at moving a task is selectable. - usage threshold notifier + - memory pressure notifier - oom-killer disable knob and oom-notifier - Root cgroup has no limit controls. @@ -65,6 +66,7 @@ Brief summary of control files. memory.stat # show various statistics memory.use_hierarchy # set/show hierarchical account enabled memory.force_empty # trigger forced move charge to parent + memory.pressure_level # set memory pressure notifications memory.swappiness # set/show swappiness parameter of vmscan (See sysctl's vm.swappiness) memory.move_charge_at_immigrate # set/show controls of moving charges @@ -762,7 +764,73 @@ At reading, current status of OOM is shown. under_oom 0 or 1 (if 1, the memory cgroup is under OOM, tasks may be stopped.) -11. TODO +11. Memory Pressure + +The pressure level notifications can be used to monitor the memory +allocation cost; based on the pressure, applications can implement +different strategies of managing their memory resources. The pressure +levels are defined as following: + +The "low" level means that the system is reclaiming memory for new +allocations. Monitoring this reclaiming activity might be useful for +maintaining cache level. Upon notification, the program (typically +"Activity Manager") might analyze vmstat and act in advance (i.e. +prematurely shutdown unimportant services). + +The "medium" level means that the system is experiencing medium memory +pressure, the system might be making swap, paging out active file caches, +etc. Upon this event applications may decide to further analyze +vmstat/zoneinfo/memcg or internal memory usage statistics and free any +resources that can be easily reconstructed or re-read from a disk. + +The "critical" level means that the system is actively thrashing, it is +about to out of memory (OOM) or even the in-kernel OOM killer is on its +way to trigger. Applications should do whatever they can to help the +system. It might be too late to consult with vmstat or any other +statistics, so it's advisable to take an immediate action. + +The events are propagated upward until the event is handled, i.e. the +events are not pass-through. Here is what this means: for example you have +three cgroups: A->B->C. Now you set up an event listener on cgroups A, B +and C, and suppose group C experiences some pressure. In this situation, +only group C will receive the notification, i.e. groups A and B will not +receive it. This is done to avoid excessive "broadcasting" of messages, +which disturbs the system and which is especially bad if we are low on +memory or thrashing. So, organize the cgroups wisely, or propagate the +events manually (or, ask us to implement the pass-through events, +explaining why would you need them.) + +The file memory.pressure_level is only used to setup an eventfd. To +register a notification, an application must: + +- create an eventfd using eventfd(2); +- open memory.pressure_level; +- write string like " " + to cgroup.event_control. + +Application will be notified through eventfd when memory pressure is at +the specific level (or higher). Read/write operations to +memory.pressure_level are no implemented. + +Test: + + Here is a small script example that makes a new cgroup, sets up a + memory limit, sets up a notification in the cgroup and then makes child + cgroup experience a critical pressure: + + # cd /sys/fs/cgroup/memory/ + # mkdir foo + # cd foo + # cgroup_event_listener memory.pressure_level low & + # echo 8000000 > memory.limit_in_bytes + # echo 8000000 > memory.memsw.limit_in_bytes + # echo $$ > tasks + # dd if=/dev/zero | read x + + (Expect a bunch of notifications, and eventually, the oom-killer will + trigger.) + +12. TODO 1. Add support for accounting huge pages (as a separate controller) 2. Make per-cgroup scanner reclaim not-shared pages first diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h new file mode 100644 index 000000000000..76be077340ea --- /dev/null +++ b/include/linux/vmpressure.h @@ -0,0 +1,47 @@ +#ifndef __LINUX_VMPRESSURE_H +#define __LINUX_VMPRESSURE_H + +#include +#include +#include +#include +#include +#include + +struct vmpressure { + unsigned long scanned; + unsigned long reclaimed; + /* The lock is used to keep the scanned/reclaimed above in sync. */ + struct mutex sr_lock; + + /* The list of vmpressure_event structs. */ + struct list_head events; + /* Have to grab the lock on events traversal or modifications. */ + struct mutex events_lock; + + struct work_struct work; +}; + +struct mem_cgroup; + +#ifdef CONFIG_MEMCG +extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, + unsigned long scanned, unsigned long reclaimed); +extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio); + +extern void vmpressure_init(struct vmpressure *vmpr); +extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg); +extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr); +extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css); +extern int vmpressure_register_event(struct cgroup *cg, struct cftype *cft, + struct eventfd_ctx *eventfd, + const char *args); +extern void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft, + struct eventfd_ctx *eventfd); +#else +static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, + unsigned long scanned, unsigned long reclaimed) {} +static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, + int prio) {} +#endif /* CONFIG_MEMCG */ +#endif /* __LINUX_VMPRESSURE_H */ diff --git a/mm/Makefile b/mm/Makefile index 3a4628751f89..72c5acb9345f 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -50,7 +50,7 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o -obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o +obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o vmpressure.o obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7e5bc43c2d1f..360464f40e96 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -261,6 +262,9 @@ struct mem_cgroup { */ struct res_counter res; + /* vmpressure notifications */ + struct vmpressure vmpressure; + union { /* * the counter to account for mem+swap usage. @@ -359,6 +363,7 @@ struct mem_cgroup { atomic_t numainfo_events; atomic_t numainfo_updating; #endif + /* * Per cgroup active and inactive list, similar to the * per zone LRU lists. @@ -510,6 +515,24 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) return container_of(s, struct mem_cgroup, css); } +/* Some nice accessors for the vmpressure. */ +struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) +{ + if (!memcg) + memcg = root_mem_cgroup; + return &memcg->vmpressure; +} + +struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) +{ + return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; +} + +struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css) +{ + return &mem_cgroup_from_css(css)->vmpressure; +} + static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return (memcg == root_mem_cgroup); @@ -5907,6 +5930,11 @@ static struct cftype mem_cgroup_files[] = { .unregister_event = mem_cgroup_oom_unregister_event, .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), }, + { + .name = "pressure_level", + .register_event = vmpressure_register_event, + .unregister_event = vmpressure_unregister_event, + }, #ifdef CONFIG_NUMA { .name = "numa_stat", @@ -6188,6 +6216,7 @@ mem_cgroup_css_alloc(struct cgroup *cont) memcg->move_charge_at_immigrate = 0; mutex_init(&memcg->thresholds_lock); spin_lock_init(&memcg->move_lock); + vmpressure_init(&memcg->vmpressure); return &memcg->css; diff --git a/mm/vmpressure.c b/mm/vmpressure.c new file mode 100644 index 000000000000..736a6011c2c8 --- /dev/null +++ b/mm/vmpressure.c @@ -0,0 +1,374 @@ +/* + * Linux VM pressure + * + * Copyright 2012 Linaro Ltd. + * Anton Vorontsov + * + * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro, + * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * The window size (vmpressure_win) is the number of scanned pages before + * we try to analyze scanned/reclaimed ratio. So the window is used as a + * rate-limit tunable for the "low" level notification, and also for + * averaging the ratio for medium/critical levels. Using small window + * sizes can cause lot of false positives, but too big window size will + * delay the notifications. + * + * As the vmscan reclaimer logic works with chunks which are multiple of + * SWAP_CLUSTER_MAX, it makes sense to use it for the window size as well. + * + * TODO: Make the window size depend on machine size, as we do for vmstat + * thresholds. Currently we set it to 512 pages (2MB for 4KB pages). + */ +static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; + +/* + * These thresholds are used when we account memory pressure through + * scanned/reclaimed ratio. The current values were chosen empirically. In + * essence, they are percents: the higher the value, the more number + * unsuccessful reclaims there were. + */ +static const unsigned int vmpressure_level_med = 60; +static const unsigned int vmpressure_level_critical = 95; + +/* + * When there are too little pages left to scan, vmpressure() may miss the + * critical pressure as number of pages will be less than "window size". + * However, in that case the vmscan priority will raise fast as the + * reclaimer will try to scan LRUs more deeply. + * + * The vmscan logic considers these special priorities: + * + * prio == DEF_PRIORITY (12): reclaimer starts with that value + * prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed + * prio == 0 : close to OOM, kernel scans every page in an lru + * + * Any value in this range is acceptable for this tunable (i.e. from 12 to + * 0). Current value for the vmpressure_level_critical_prio is chosen + * empirically, but the number, in essence, means that we consider + * critical level when scanning depth is ~10% of the lru size (vmscan + * scans 'lru_size >> prio' pages, so it is actually 12.5%, or one + * eights). + */ +static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10); + +static struct vmpressure *work_to_vmpressure(struct work_struct *work) +{ + return container_of(work, struct vmpressure, work); +} + +static struct vmpressure *cg_to_vmpressure(struct cgroup *cg) +{ + return css_to_vmpressure(cgroup_subsys_state(cg, mem_cgroup_subsys_id)); +} + +static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) +{ + struct cgroup *cg = vmpressure_to_css(vmpr)->cgroup; + struct mem_cgroup *memcg = mem_cgroup_from_cont(cg); + + memcg = parent_mem_cgroup(memcg); + if (!memcg) + return NULL; + return memcg_to_vmpressure(memcg); +} + +enum vmpressure_levels { + VMPRESSURE_LOW = 0, + VMPRESSURE_MEDIUM, + VMPRESSURE_CRITICAL, + VMPRESSURE_NUM_LEVELS, +}; + +static const char * const vmpressure_str_levels[] = { + [VMPRESSURE_LOW] = "low", + [VMPRESSURE_MEDIUM] = "medium", + [VMPRESSURE_CRITICAL] = "critical", +}; + +static enum vmpressure_levels vmpressure_level(unsigned long pressure) +{ + if (pressure >= vmpressure_level_critical) + return VMPRESSURE_CRITICAL; + else if (pressure >= vmpressure_level_med) + return VMPRESSURE_MEDIUM; + return VMPRESSURE_LOW; +} + +static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned, + unsigned long reclaimed) +{ + unsigned long scale = scanned + reclaimed; + unsigned long pressure; + + /* + * We calculate the ratio (in percents) of how many pages were + * scanned vs. reclaimed in a given time frame (window). Note that + * time is in VM reclaimer's "ticks", i.e. number of pages + * scanned. This makes it possible to set desired reaction time + * and serves as a ratelimit. + */ + pressure = scale - (reclaimed * scale / scanned); + pressure = pressure * 100 / scale; + + pr_debug("%s: %3lu (s: %lu r: %lu)\n", __func__, pressure, + scanned, reclaimed); + + return vmpressure_level(pressure); +} + +struct vmpressure_event { + struct eventfd_ctx *efd; + enum vmpressure_levels level; + struct list_head node; +}; + +static bool vmpressure_event(struct vmpressure *vmpr, + unsigned long scanned, unsigned long reclaimed) +{ + struct vmpressure_event *ev; + enum vmpressure_levels level; + bool signalled = false; + + level = vmpressure_calc_level(scanned, reclaimed); + + mutex_lock(&vmpr->events_lock); + + list_for_each_entry(ev, &vmpr->events, node) { + if (level >= ev->level) { + eventfd_signal(ev->efd, 1); + signalled = true; + } + } + + mutex_unlock(&vmpr->events_lock); + + return signalled; +} + +static void vmpressure_work_fn(struct work_struct *work) +{ + struct vmpressure *vmpr = work_to_vmpressure(work); + unsigned long scanned; + unsigned long reclaimed; + + /* + * Several contexts might be calling vmpressure(), so it is + * possible that the work was rescheduled again before the old + * work context cleared the counters. In that case we will run + * just after the old work returns, but then scanned might be zero + * here. No need for any locks here since we don't care if + * vmpr->reclaimed is in sync. + */ + if (!vmpr->scanned) + return; + + mutex_lock(&vmpr->sr_lock); + scanned = vmpr->scanned; + reclaimed = vmpr->reclaimed; + vmpr->scanned = 0; + vmpr->reclaimed = 0; + mutex_unlock(&vmpr->sr_lock); + + do { + if (vmpressure_event(vmpr, scanned, reclaimed)) + break; + /* + * If not handled, propagate the event upward into the + * hierarchy. + */ + } while ((vmpr = vmpressure_parent(vmpr))); +} + +/** + * vmpressure() - Account memory pressure through scanned/reclaimed ratio + * @gfp: reclaimer's gfp mask + * @memcg: cgroup memory controller handle + * @scanned: number of pages scanned + * @reclaimed: number of pages reclaimed + * + * This function should be called from the vmscan reclaim path to account + * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw + * pressure index is then further refined and averaged over time. + * + * This function does not return any value. + */ +void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, + unsigned long scanned, unsigned long reclaimed) +{ + struct vmpressure *vmpr = memcg_to_vmpressure(memcg); + + /* + * Here we only want to account pressure that userland is able to + * help us with. For example, suppose that DMA zone is under + * pressure; if we notify userland about that kind of pressure, + * then it will be mostly a waste as it will trigger unnecessary + * freeing of memory by userland (since userland is more likely to + * have HIGHMEM/MOVABLE pages instead of the DMA fallback). That + * is why we include only movable, highmem and FS/IO pages. + * Indirect reclaim (kswapd) sets sc->gfp_mask to GFP_KERNEL, so + * we account it too. + */ + if (!(gfp & (__GFP_HIGHMEM | __GFP_MOVABLE | __GFP_IO | __GFP_FS))) + return; + + /* + * If we got here with no pages scanned, then that is an indicator + * that reclaimer was unable to find any shrinkable LRUs at the + * current scanning depth. But it does not mean that we should + * report the critical pressure, yet. If the scanning priority + * (scanning depth) goes too high (deep), we will be notified + * through vmpressure_prio(). But so far, keep calm. + */ + if (!scanned) + return; + + mutex_lock(&vmpr->sr_lock); + vmpr->scanned += scanned; + vmpr->reclaimed += reclaimed; + scanned = vmpr->scanned; + mutex_unlock(&vmpr->sr_lock); + + if (scanned < vmpressure_win || work_pending(&vmpr->work)) + return; + schedule_work(&vmpr->work); +} + +/** + * vmpressure_prio() - Account memory pressure through reclaimer priority level + * @gfp: reclaimer's gfp mask + * @memcg: cgroup memory controller handle + * @prio: reclaimer's priority + * + * This function should be called from the reclaim path every time when + * the vmscan's reclaiming priority (scanning depth) changes. + * + * This function does not return any value. + */ +void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) +{ + /* + * We only use prio for accounting critical level. For more info + * see comment for vmpressure_level_critical_prio variable above. + */ + if (prio > vmpressure_level_critical_prio) + return; + + /* + * OK, the prio is below the threshold, updating vmpressure + * information before shrinker dives into long shrinking of long + * range vmscan. Passing scanned = vmpressure_win, reclaimed = 0 + * to the vmpressure() basically means that we signal 'critical' + * level. + */ + vmpressure(gfp, memcg, vmpressure_win, 0); +} + +/** + * vmpressure_register_event() - Bind vmpressure notifications to an eventfd + * @cg: cgroup that is interested in vmpressure notifications + * @cft: cgroup control files handle + * @eventfd: eventfd context to link notifications with + * @args: event arguments (used to set up a pressure level threshold) + * + * This function associates eventfd context with the vmpressure + * infrastructure, so that the notifications will be delivered to the + * @eventfd. The @args parameter is a string that denotes pressure level + * threshold (one of vmpressure_str_levels, i.e. "low", "medium", or + * "critical"). + * + * This function should not be used directly, just pass it to (struct + * cftype).register_event, and then cgroup core will handle everything by + * itself. + */ +int vmpressure_register_event(struct cgroup *cg, struct cftype *cft, + struct eventfd_ctx *eventfd, const char *args) +{ + struct vmpressure *vmpr = cg_to_vmpressure(cg); + struct vmpressure_event *ev; + int level; + + for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) { + if (!strcmp(vmpressure_str_levels[level], args)) + break; + } + + if (level >= VMPRESSURE_NUM_LEVELS) + return -EINVAL; + + ev = kzalloc(sizeof(*ev), GFP_KERNEL); + if (!ev) + return -ENOMEM; + + ev->efd = eventfd; + ev->level = level; + + mutex_lock(&vmpr->events_lock); + list_add(&ev->node, &vmpr->events); + mutex_unlock(&vmpr->events_lock); + + return 0; +} + +/** + * vmpressure_unregister_event() - Unbind eventfd from vmpressure + * @cg: cgroup handle + * @cft: cgroup control files handle + * @eventfd: eventfd context that was used to link vmpressure with the @cg + * + * This function does internal manipulations to detach the @eventfd from + * the vmpressure notifications, and then frees internal resources + * associated with the @eventfd (but the @eventfd itself is not freed). + * + * This function should not be used directly, just pass it to (struct + * cftype).unregister_event, and then cgroup core will handle everything + * by itself. + */ +void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft, + struct eventfd_ctx *eventfd) +{ + struct vmpressure *vmpr = cg_to_vmpressure(cg); + struct vmpressure_event *ev; + + mutex_lock(&vmpr->events_lock); + list_for_each_entry(ev, &vmpr->events, node) { + if (ev->efd != eventfd) + continue; + list_del(&ev->node); + kfree(ev); + break; + } + mutex_unlock(&vmpr->events_lock); +} + +/** + * vmpressure_init() - Initialize vmpressure control structure + * @vmpr: Structure to be initialized + * + * This function should be called on every allocated vmpressure structure + * before any usage. + */ +void vmpressure_init(struct vmpressure *vmpr) +{ + mutex_init(&vmpr->sr_lock); + mutex_init(&vmpr->events_lock); + INIT_LIST_HEAD(&vmpr->events); + INIT_WORK(&vmpr->work, vmpressure_work_fn); +} diff --git a/mm/vmscan.c b/mm/vmscan.c index e03a00b09da9..e53e49584cf3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -1982,6 +1983,11 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) } memcg = mem_cgroup_iter(root, memcg, &reclaim); } while (memcg); + + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, + sc->nr_scanned - nr_scanned, + sc->nr_reclaimed - nr_reclaimed); + } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); } @@ -2167,6 +2173,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, count_vm_event(ALLOCSTALL); do { + vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, + sc->priority); sc->nr_scanned = 0; aborted_reclaim = shrink_zones(zonelist, sc); -- cgit v1.2.3 From fd0ccaf2bd04e54d2a6979fbfdcad856694e3877 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 29 Apr 2013 15:08:43 -0700 Subject: memcg: avoid accessing memcg after releasing reference This might cause a use-after-free bug. Signed-off-by: Li Zefan Cc: Glauber Costa Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 360464f40e96..c92bcfc5466e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3215,12 +3215,12 @@ void memcg_release_cache(struct kmem_cache *s) root = s->memcg_params->root_cache; root->memcg_params->memcg_caches[id] = NULL; - mem_cgroup_put(memcg); mutex_lock(&memcg->slab_caches_mutex); list_del(&s->memcg_params->list); mutex_unlock(&memcg->slab_caches_mutex); + mem_cgroup_put(memcg); out: kfree(s->memcg_params); } -- cgit v1.2.3 From 465adcf1ea7b2e49b2e0899366624f5532b64012 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 29 Apr 2013 15:08:45 -0700 Subject: mm, memcg: give exiting processes access to memory reserves A memcg may livelock when oom if the process that grabs the hierarchy's oom lock is never the first process with PF_EXITING set in the memcg's task iteration. The oom killer, both global and memcg, will defer if it finds an eligible process that is in the process of exiting and it is not being ptraced. The idea is to allow it to exit without using memory reserves before needlessly killing another process. This normally works fine except in the memcg case with a large number of threads attached to the oom memcg. In this case, the memcg oom killer only gets called for the process that grabs the hierarchy's oom lock; all others end up blocked on the memcg's oom waitqueue. Thus, if the process that grabs the hierarchy's oom lock is never the first PF_EXITING process in the memcg's task iteration, the oom killer is constantly deferred without anything making progress. The fix is to give PF_EXITING processes access to memory reserves so that we've marked them as oom killed without any iteration. This allows __mem_cgroup_try_charge() to succeed so that the process may exit. This makes the memcg oom killer exemption for TIF_MEMDIE tasks, now immediately granted for processes with pending SIGKILLs and those in the exit path, to be equivalent to what is done for the global oom killer. Signed-off-by: David Rientjes Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c92bcfc5466e..47b36fea7e1f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1787,11 +1787,11 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, struct task_struct *chosen = NULL; /* - * If current has a pending SIGKILL, then automatically select it. The - * goal is to allow it to allocate so that it may quickly exit and free - * its memory. + * If current has a pending SIGKILL or is exiting, then automatically + * select it. The goal is to allow it to allocate so that it may + * quickly exit and free its memory. */ - if (fatal_signal_pending(current)) { + if (fatal_signal_pending(current) || current->flags & PF_EXITING) { set_thread_flag(TIF_MEMDIE); return; } -- cgit v1.2.3 From ca0dde97178e75ed1370b8616326f5496a803d65 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 29 Apr 2013 15:08:57 -0700 Subject: memcg: take reference before releasing rcu_read_lock The memcg is not referenced, so it can be destroyed at anytime right after we exit rcu read section, so it's not safe to access it. To fix this, we call css_tryget() to get a reference while we're still in rcu read section. This also removes a bogus comment above __memcg_create_cache_enqueue(). Signed-off-by: Li Zefan Acked-by: Glauber Costa Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 63 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 30 deletions(-) (limited to 'mm/memcontrol.c') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 47b36fea7e1f..b8dc8e4cbf6a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3483,7 +3483,6 @@ static void memcg_create_cache_work_func(struct work_struct *w) /* * Enqueue the creation of a per-memcg kmem_cache. - * Called with rcu_read_lock. */ static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, struct kmem_cache *cachep) @@ -3491,12 +3490,8 @@ static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, struct create_work *cw; cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); - if (cw == NULL) - return; - - /* The corresponding put will be done in the workqueue. */ - if (!css_tryget(&memcg->css)) { - kfree(cw); + if (cw == NULL) { + css_put(&memcg->css); return; } @@ -3552,10 +3547,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); - rcu_read_unlock(); if (!memcg_can_account_kmem(memcg)) - return cachep; + goto out; idx = memcg_cache_id(memcg); @@ -3564,29 +3558,38 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, * code updating memcg_caches will issue a write barrier to match this. */ read_barrier_depends(); - if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) { - /* - * If we are in a safe context (can wait, and not in interrupt - * context), we could be be predictable and return right away. - * This would guarantee that the allocation being performed - * already belongs in the new cache. - * - * However, there are some clashes that can arrive from locking. - * For instance, because we acquire the slab_mutex while doing - * kmem_cache_dup, this means no further allocation could happen - * with the slab_mutex held. - * - * Also, because cache creation issue get_online_cpus(), this - * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, - * that ends up reversed during cpu hotplug. (cpuset allocates - * a bunch of GFP_KERNEL memory during cpuup). Due to all that, - * better to defer everything. - */ - memcg_create_cache_enqueue(memcg, cachep); - return cachep; + if (likely(cachep->memcg_params->memcg_caches[idx])) { + cachep = cachep->memcg_params->memcg_caches[idx]; + goto out; } - return cachep->memcg_params->memcg_caches[idx]; + /* The corresponding put will be done in the workqueue. */ + if (!css_tryget(&memcg->css)) + goto out; + rcu_read_unlock(); + + /* + * If we are in a safe context (can wait, and not in interrupt + * context), we could be be predictable and return right away. + * This would guarantee that the allocation being performed + * already belongs in the new cache. + * + * However, there are some clashes that can arrive from locking. + * For instance, because we acquire the slab_mutex while doing + * kmem_cache_dup, this means no further allocation could happen + * with the slab_mutex held. + * + * Also, because cache creation issue get_online_cpus(), this + * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, + * that ends up reversed during cpu hotplug. (cpuset allocates + * a bunch of GFP_KERNEL memory during cpuup). Due to all that, + * better to defer everything. + */ + memcg_create_cache_enqueue(memcg, cachep); + return cachep; +out: + rcu_read_unlock(); + return cachep; } EXPORT_SYMBOL(__memcg_kmem_get_cache); -- cgit v1.2.3