From b4098bfc5efb1fd7ecf40165132a1283aeea3500 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Jul 2019 16:54:10 +0200 Subject: sched/deadline: Impose global limits on sched_attr::sched_period Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20190726161357.397880775@infradead.org --- kernel/sysctl.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel/sysctl.c') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index db1ce7af2563..4aea67d3d552 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1779,6 +1779,20 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sched_rt_handler, }, + { + .procname = "sched_deadline_period_max_us", + .data = &sysctl_sched_dl_period_max, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_deadline_period_min_us", + .data = &sysctl_sched_dl_period_min, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "sched_rr_timeslice_ms", .data = &sysctl_sched_rr_timeslice, -- cgit v1.2.3 From 13685c4a08fca9dd76bf53bfcbadc044ab2a08cb Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Thu, 16 Jul 2020 12:03:45 +0100 Subject: sched/uclamp: Add a new sysctl to control RT default boost value RT tasks by default run at the highest capacity/performance level. When uclamp is selected this default behavior is retained by enforcing the requested uclamp.min (p->uclamp_req[UCLAMP_MIN]) of the RT tasks to be uclamp_none(UCLAMP_MAX), which is SCHED_CAPACITY_SCALE; the maximum value. This is also referred to as 'the default boost value of RT tasks'. See commit 1a00d999971c ("sched/uclamp: Set default clamps for RT tasks"). On battery powered devices, it is desired to control this default (currently hardcoded) behavior at runtime to reduce energy consumed by RT tasks. For example, a mobile device manufacturer where big.LITTLE architecture is dominant, the performance of the little cores varies across SoCs, and on high end ones the big cores could be too power hungry. Given the diversity of SoCs, the new knob allows manufactures to tune the best performance/power for RT tasks for the particular hardware they run on. They could opt to further tune the value when the user selects a different power saving mode or when the device is actively charging. The runtime aspect of it further helps in creating a single kernel image that can be run on multiple devices that require different tuning. Keep in mind that a lot of RT tasks in the system are created by the kernel. On Android for instance I can see over 50 RT tasks, only a handful of which created by the Android framework. To control the default behavior globally by system admins and device integrator, introduce the new sysctl_sched_uclamp_util_min_rt_default to change the default boost value of the RT tasks. I anticipate this to be mostly in the form of modifying the init script of a particular device. To avoid polluting the fast path with unnecessary code, the approach taken is to synchronously do the update by traversing all the existing tasks in the system. This could race with a concurrent fork(), which is dealt with by introducing sched_post_fork() function which will ensure the racy fork will get the right update applied. Tested on Juno-r2 in combination with the RT capacity awareness [1]. By default an RT task will go to the highest capacity CPU and run at the maximum frequency, which is particularly energy inefficient on high end mobile devices because the biggest core[s] are 'huge' and power hungry. With this patch the RT task can be controlled to run anywhere by default, and doesn't cause the frequency to be maximum all the time. Yet any task that really needs to be boosted can easily escape this default behavior by modifying its requested uclamp.min value (p->uclamp_req[UCLAMP_MIN]) via sched_setattr() syscall. [1] 804d402fb6f6: ("sched/rt: Make RT capacity-aware") Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200716110347.19553-2-qais.yousef@arm.com --- include/linux/sched.h | 10 +++- include/linux/sched/sysctl.h | 1 + include/linux/sched/task.h | 1 + kernel/fork.c | 1 + kernel/sched/core.c | 119 ++++++++++++++++++++++++++++++++++++++++--- kernel/sysctl.c | 7 +++ 6 files changed, 131 insertions(+), 8 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index adf0125190d4..a6bf77c34687 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -686,9 +686,15 @@ struct task_struct { struct sched_dl_entity dl; #ifdef CONFIG_UCLAMP_TASK - /* Clamp values requested for a scheduling entity */ + /* + * Clamp values requested for a scheduling entity. + * Must be updated with task_rq_lock() held. + */ struct uclamp_se uclamp_req[UCLAMP_CNT]; - /* Effective clamp values used for a scheduling entity */ + /* + * Effective clamp values used for a scheduling entity. + * Must be updated with task_rq_lock() held. + */ struct uclamp_se uclamp[UCLAMP_CNT]; #endif diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 24be30a40814..3c31ba88aca5 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -67,6 +67,7 @@ extern unsigned int sysctl_sched_dl_period_min; #ifdef CONFIG_UCLAMP_TASK extern unsigned int sysctl_sched_uclamp_util_min; extern unsigned int sysctl_sched_uclamp_util_max; +extern unsigned int sysctl_sched_uclamp_util_min_rt_default; #endif #ifdef CONFIG_CFS_BANDWIDTH diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 38359071236a..e7ddab095baf 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -55,6 +55,7 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern int sched_fork(unsigned long clone_flags, struct task_struct *p); +extern void sched_post_fork(struct task_struct *p); extern void sched_dead(struct task_struct *p); void __noreturn do_task_dead(void); diff --git a/kernel/fork.c b/kernel/fork.c index efc5493203ae..e75c2e41f3d1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2304,6 +2304,7 @@ static __latent_entropy struct task_struct *copy_process( write_unlock_irq(&tasklist_lock); proc_fork_connector(p); + sched_post_fork(p); cgroup_post_fork(p, args); perf_event_fork(p); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e44d83f3e0e6..12e1f3a2cabc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -889,6 +889,23 @@ unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; /* Max allowed maximum utilization */ unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; +/* + * By default RT tasks run at the maximum performance point/capacity of the + * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to + * SCHED_CAPACITY_SCALE. + * + * This knob allows admins to change the default behavior when uclamp is being + * used. In battery powered devices, particularly, running at the maximum + * capacity and frequency will increase energy consumption and shorten the + * battery life. + * + * This knob only affects RT tasks that their uclamp_se->user_defined == false. + * + * This knob will not override the system default sched_util_clamp_min defined + * above. + */ +unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE; + /* All clamps are required to be less or equal than these values */ static struct uclamp_se uclamp_default[UCLAMP_CNT]; @@ -991,6 +1008,64 @@ unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id, return uclamp_idle_value(rq, clamp_id, clamp_value); } +static void __uclamp_update_util_min_rt_default(struct task_struct *p) +{ + unsigned int default_util_min; + struct uclamp_se *uc_se; + + lockdep_assert_held(&p->pi_lock); + + uc_se = &p->uclamp_req[UCLAMP_MIN]; + + /* Only sync if user didn't override the default */ + if (uc_se->user_defined) + return; + + default_util_min = sysctl_sched_uclamp_util_min_rt_default; + uclamp_se_set(uc_se, default_util_min, false); +} + +static void uclamp_update_util_min_rt_default(struct task_struct *p) +{ + struct rq_flags rf; + struct rq *rq; + + if (!rt_task(p)) + return; + + /* Protect updates to p->uclamp_* */ + rq = task_rq_lock(p, &rf); + __uclamp_update_util_min_rt_default(p); + task_rq_unlock(rq, p, &rf); +} + +static void uclamp_sync_util_min_rt_default(void) +{ + struct task_struct *g, *p; + + /* + * copy_process() sysctl_uclamp + * uclamp_min_rt = X; + * write_lock(&tasklist_lock) read_lock(&tasklist_lock) + * // link thread smp_mb__after_spinlock() + * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock); + * sched_post_fork() for_each_process_thread() + * __uclamp_sync_rt() __uclamp_sync_rt() + * + * Ensures that either sched_post_fork() will observe the new + * uclamp_min_rt or for_each_process_thread() will observe the new + * task. + */ + read_lock(&tasklist_lock); + smp_mb__after_spinlock(); + read_unlock(&tasklist_lock); + + rcu_read_lock(); + for_each_process_thread(g, p) + uclamp_update_util_min_rt_default(p); + rcu_read_unlock(); +} + static inline struct uclamp_se uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id) { @@ -1278,12 +1353,13 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { bool update_root_tg = false; - int old_min, old_max; + int old_min, old_max, old_min_rt; int result; mutex_lock(&uclamp_mutex); old_min = sysctl_sched_uclamp_util_min; old_max = sysctl_sched_uclamp_util_max; + old_min_rt = sysctl_sched_uclamp_util_min_rt_default; result = proc_dointvec(table, write, buffer, lenp, ppos); if (result) @@ -1292,7 +1368,9 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, goto done; if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max || - sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) { + sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE || + sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) { + result = -EINVAL; goto undo; } @@ -1313,6 +1391,11 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, uclamp_update_root_tg(); } + if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) { + static_branch_enable(&sched_uclamp_used); + uclamp_sync_util_min_rt_default(); + } + /* * We update all RUNNABLE tasks only when task groups are in use. * Otherwise, keep it simple and do just a lazy update at each next @@ -1324,6 +1407,7 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, undo: sysctl_sched_uclamp_util_min = old_min; sysctl_sched_uclamp_util_max = old_max; + sysctl_sched_uclamp_util_min_rt_default = old_min_rt; done: mutex_unlock(&uclamp_mutex); @@ -1369,17 +1453,20 @@ static void __setscheduler_uclamp(struct task_struct *p, */ for_each_clamp_id(clamp_id) { struct uclamp_se *uc_se = &p->uclamp_req[clamp_id]; - unsigned int clamp_value = uclamp_none(clamp_id); /* Keep using defined clamps across class changes */ if (uc_se->user_defined) continue; - /* By default, RT tasks always get 100% boost */ + /* + * RT by default have a 100% boost value that could be modified + * at runtime. + */ if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) - clamp_value = uclamp_none(UCLAMP_MAX); + __uclamp_update_util_min_rt_default(p); + else + uclamp_se_set(uc_se, uclamp_none(clamp_id), false); - uclamp_se_set(uc_se, clamp_value, false); } if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) @@ -1400,6 +1487,10 @@ static void uclamp_fork(struct task_struct *p) { enum uclamp_id clamp_id; + /* + * We don't need to hold task_rq_lock() when updating p->uclamp_* here + * as the task is still at its early fork stages. + */ for_each_clamp_id(clamp_id) p->uclamp[clamp_id].active = false; @@ -1412,6 +1503,11 @@ static void uclamp_fork(struct task_struct *p) } } +static void uclamp_post_fork(struct task_struct *p) +{ + uclamp_update_util_min_rt_default(p); +} + static void __init init_uclamp_rq(struct rq *rq) { enum uclamp_id clamp_id; @@ -1462,6 +1558,7 @@ static inline int uclamp_validate(struct task_struct *p, static void __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *attr) { } static inline void uclamp_fork(struct task_struct *p) { } +static inline void uclamp_post_fork(struct task_struct *p) { } static inline void init_uclamp(void) { } #endif /* CONFIG_UCLAMP_TASK */ @@ -3205,6 +3302,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) return 0; } +void sched_post_fork(struct task_struct *p) +{ + uclamp_post_fork(p); +} + unsigned long to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) @@ -5724,6 +5826,11 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, kattr.sched_nice = task_nice(p); #ifdef CONFIG_UCLAMP_TASK + /* + * This could race with another potential updater, but this is fine + * because it'll correctly read the old or the new value. We don't need + * to guarantee who wins the race as long as it doesn't return garbage. + */ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; #endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4aea67d3d552..1b4d2dc270a5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1815,6 +1815,13 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = sysctl_sched_uclamp_handler, }, + { + .procname = "sched_util_clamp_min_rt_default", + .data = &sysctl_sched_uclamp_util_min_rt_default, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_sched_uclamp_handler, + }, #endif #ifdef CONFIG_SCHED_AUTOGROUP { -- cgit v1.2.3 From 56f3547bfa4d361148aa748ccb86073bc57f5e6c Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Thu, 6 Aug 2020 23:23:15 -0700 Subject: mm: adjust vm_committed_as_batch according to vm overcommit policy When checking a performance change for will-it-scale scalability mmap test [1], we found very high lock contention for spinlock of percpu counter 'vm_committed_as': 94.14% 0.35% [kernel.kallsyms] [k] _raw_spin_lock_irqsave 48.21% _raw_spin_lock_irqsave;percpu_counter_add_batch;__vm_enough_memory;mmap_region;do_mmap; 45.91% _raw_spin_lock_irqsave;percpu_counter_add_batch;__do_munmap; Actually this heavy lock contention is not always necessary. The 'vm_committed_as' needs to be very precise when the strict OVERCOMMIT_NEVER policy is set, which requires a rather small batch number for the percpu counter. So keep 'batch' number unchanged for strict OVERCOMMIT_NEVER policy, and lift it to 64X for OVERCOMMIT_ALWAYS and OVERCOMMIT_GUESS policies. Also add a sysctl handler to adjust it when the policy is reconfigured. Benchmark with the same testcase in [1] shows 53% improvement on a 8C/16T desktop, and 2097%(20X) on a 4S/72C/144T server. We tested with test platforms in 0day (server, desktop and laptop), and 80%+ platforms shows improvements with that test. And whether it shows improvements depends on if the test mmap size is bigger than the batch number computed. And if the lift is 16X, 1/3 of the platforms will show improvements, though it should help the mmap/unmap usage generally, as Michal Hocko mentioned: : I believe that there are non-synthetic worklaods which would benefit from : a larger batch. E.g. large in memory databases which do large mmaps : during startups from multiple threads. [1] https://lore.kernel.org/lkml/20200305062138.GI5972@shao2-debian/ Signed-off-by: Feng Tang Signed-off-by: Andrew Morton Acked-by: Michal Hocko Cc: Matthew Wilcox (Oracle) Cc: Johannes Weiner Cc: Mel Gorman Cc: Qian Cai Cc: Kees Cook Cc: Andi Kleen Cc: Tim Chen Cc: Dave Hansen Cc: Huang Ying Cc: Christoph Lameter Cc: Dennis Zhou Cc: Haiyang Zhang Cc: kernel test robot Cc: "K. Y. Srinivasan" Cc: Tejun Heo Link: http://lkml.kernel.org/r/1589611660-89854-4-git-send-email-feng.tang@intel.com Link: http://lkml.kernel.org/r/1592725000-73486-4-git-send-email-feng.tang@intel.com Link: http://lkml.kernel.org/r/1594389708-60781-5-git-send-email-feng.tang@intel.com Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ include/linux/mman.h | 4 ++++ kernel/sysctl.c | 2 +- mm/mm_init.c | 22 ++++++++++++++++------ mm/util.c | 41 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 64 insertions(+), 7 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2830f1c0fdc3..1c3470550395 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -206,6 +206,8 @@ int overcommit_ratio_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *, loff_t *); +int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) diff --git a/include/linux/mman.h b/include/linux/mman.h index 4b08e9c9c538..6f34c33075f9 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -57,8 +57,12 @@ extern struct percpu_counter vm_committed_as; #ifdef CONFIG_SMP extern s32 vm_committed_as_batch; +extern void mm_compute_batch(int overcommit_policy); #else #define vm_committed_as_batch 0 +static inline void mm_compute_batch(int overcommit_policy) +{ +} #endif unsigned long vm_memory_committed(void); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1b4d2dc270a5..f785de3caac0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2671,7 +2671,7 @@ static struct ctl_table vm_table[] = { .data = &sysctl_overcommit_memory, .maxlen = sizeof(sysctl_overcommit_memory), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = overcommit_policy_handler, .extra1 = SYSCTL_ZERO, .extra2 = &two, }, diff --git a/mm/mm_init.c b/mm/mm_init.c index 435e5f794b3b..b06a30fbedff 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "internal.h" #ifdef CONFIG_DEBUG_MEMORY_INIT @@ -144,14 +145,23 @@ EXPORT_SYMBOL_GPL(mm_kobj); #ifdef CONFIG_SMP s32 vm_committed_as_batch = 32; -static void __meminit mm_compute_batch(void) +void mm_compute_batch(int overcommit_policy) { u64 memsized_batch; s32 nr = num_present_cpus(); s32 batch = max_t(s32, nr*2, 32); - - /* batch size set to 0.4% of (total memory/#cpus), or max int32 */ - memsized_batch = min_t(u64, (totalram_pages()/nr)/256, 0x7fffffff); + unsigned long ram_pages = totalram_pages(); + + /* + * For policy OVERCOMMIT_NEVER, set batch size to 0.4% of + * (total memory/#cpus), and lift it to 25% for other policies + * to easy the possible lock contention for percpu_counter + * vm_committed_as, while the max limit is INT_MAX + */ + if (overcommit_policy == OVERCOMMIT_NEVER) + memsized_batch = min_t(u64, ram_pages/nr/256, INT_MAX); + else + memsized_batch = min_t(u64, ram_pages/nr/4, INT_MAX); vm_committed_as_batch = max_t(s32, memsized_batch, batch); } @@ -162,7 +172,7 @@ static int __meminit mm_compute_batch_notifier(struct notifier_block *self, switch (action) { case MEM_ONLINE: case MEM_OFFLINE: - mm_compute_batch(); + mm_compute_batch(sysctl_overcommit_memory); default: break; } @@ -176,7 +186,7 @@ static struct notifier_block compute_batch_nb __meminitdata = { static int __init mm_compute_batch_init(void) { - mm_compute_batch(); + mm_compute_batch(sysctl_overcommit_memory); register_hotmemory_notifier(&compute_batch_nb); return 0; diff --git a/mm/util.c b/mm/util.c index 1c9d097d7e48..8d6280c05238 100644 --- a/mm/util.c +++ b/mm/util.c @@ -746,6 +746,47 @@ int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer, return ret; } +static void sync_overcommit_as(struct work_struct *dummy) +{ + percpu_counter_sync(&vm_committed_as); +} + +int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int new_policy; + int ret; + + /* + * The deviation of sync_overcommit_as could be big with loose policy + * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to + * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply + * with the strict "NEVER", and to avoid possible race condtion (even + * though user usually won't too frequently do the switching to policy + * OVERCOMMIT_NEVER), the switch is done in the following order: + * 1. changing the batch + * 2. sync percpu count on each CPU + * 3. switch the policy + */ + if (write) { + t = *table; + t.data = &new_policy; + ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (ret) + return ret; + + mm_compute_batch(new_policy); + if (new_policy == OVERCOMMIT_NEVER) + schedule_on_each_cpu(sync_overcommit_as); + sysctl_overcommit_memory = new_policy; + } else { + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + } + + return ret; +} + int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { -- cgit v1.2.3 From facdaa917c4d5a376d09d25865f5a863f906234a Mon Sep 17 00:00:00 2001 From: Nitin Gupta Date: Tue, 11 Aug 2020 18:31:00 -0700 Subject: mm: proactive compaction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For some applications, we need to allocate almost all memory as hugepages. However, on a running system, higher-order allocations can fail if the memory is fragmented. Linux kernel currently does on-demand compaction as we request more hugepages, but this style of compaction incurs very high latency. Experiments with one-time full memory compaction (followed by hugepage allocations) show that kernel is able to restore a highly fragmented memory state to a fairly compacted memory state within <1 sec for a 32G system. Such data suggests that a more proactive compaction can help us allocate a large fraction of memory as hugepages keeping allocation latencies low. For a more proactive compaction, the approach taken here is to define a new sysctl called 'vm.compaction_proactiveness' which dictates bounds for external fragmentation which kcompactd tries to maintain. The tunable takes a value in range [0, 100], with a default of 20. Note that a previous version of this patch [1] was found to introduce too many tunables (per-order extfrag{low, high}), but this one reduces them to just one sysctl. Also, the new tunable is an opaque value instead of asking for specific bounds of "external fragmentation", which would have been difficult to estimate. The internal interpretation of this opaque value allows for future fine-tuning. Currently, we use a simple translation from this tunable to [low, high] "fragmentation score" thresholds (low=100-proactiveness, high=low+10%). The score for a node is defined as weighted mean of per-zone external fragmentation. A zone's present_pages determines its weight. To periodically check per-node score, we reuse per-node kcompactd threads, which are woken up every 500 milliseconds to check the same. If a node's score exceeds its high threshold (as derived from user-provided proactiveness value), proactive compaction is started until its score reaches its low threshold value. By default, proactiveness is set to 20, which implies threshold values of low=80 and high=90. This patch is largely based on ideas from Michal Hocko [2]. See also the LWN article [3]. Performance data ================ System: x64_64, 1T RAM, 80 CPU threads. Kernel: 5.6.0-rc3 + this patch echo madvise | sudo tee /sys/kernel/mm/transparent_hugepage/enabled echo madvise | sudo tee /sys/kernel/mm/transparent_hugepage/defrag Before starting the driver, the system was fragmented from a userspace program that allocates all memory and then for each 2M aligned section, frees 3/4 of base pages using munmap. The workload is mainly anonymous userspace pages, which are easy to move around. I intentionally avoided unmovable pages in this test to see how much latency we incur when hugepage allocations hit direct compaction. 1. Kernel hugepage allocation latencies With the system in such a fragmented state, a kernel driver then allocates as many hugepages as possible and measures allocation latency: (all latency values are in microseconds) - With vanilla 5.6.0-rc3 percentile latency –––––––––– ––––––– 5 7894 10 9496 25 12561 30 15295 40 18244 50 21229 60 27556 75 30147 80 31047 90 32859 95 33799 Total 2M hugepages allocated = 383859 (749G worth of hugepages out of 762G total free => 98% of free memory could be allocated as hugepages) - With 5.6.0-rc3 + this patch, with proactiveness=20 sysctl -w vm.compaction_proactiveness=20 percentile latency –––––––––– ––––––– 5 2 10 2 25 3 30 3 40 3 50 4 60 4 75 4 80 4 90 5 95 429 Total 2M hugepages allocated = 384105 (750G worth of hugepages out of 762G total free => 98% of free memory could be allocated as hugepages) 2. JAVA heap allocation In this test, we first fragment memory using the same method as for (1). Then, we start a Java process with a heap size set to 700G and request the heap to be allocated with THP hugepages. We also set THP to madvise to allow hugepage backing of this heap. /usr/bin/time java -Xms700G -Xmx700G -XX:+UseTransparentHugePages -XX:+AlwaysPreTouch The above command allocates 700G of Java heap using hugepages. - With vanilla 5.6.0-rc3 17.39user 1666.48system 27:37.89elapsed - With 5.6.0-rc3 + this patch, with proactiveness=20 8.35user 194.58system 3:19.62elapsed Elapsed time remains around 3:15, as proactiveness is further increased. Note that proactive compaction happens throughout the runtime of these workloads. The situation of one-time compaction, sufficient to supply hugepages for following allocation stream, can probably happen for more extreme proactiveness values, like 80 or 90. In the above Java workload, proactiveness is set to 20. The test starts with a node's score of 80 or higher, depending on the delay between the fragmentation step and starting the benchmark, which gives more-or-less time for the initial round of compaction. As t he benchmark consumes hugepages, node's score quickly rises above the high threshold (90) and proactive compaction starts again, which brings down the score to the low threshold level (80). Repeat. bpftrace also confirms proactive compaction running 20+ times during the runtime of this Java benchmark. kcompactd threads consume 100% of one of the CPUs while it tries to bring a node's score within thresholds. Backoff behavior ================ Above workloads produce a memory state which is easy to compact. However, if memory is filled with unmovable pages, proactive compaction should essentially back off. To test this aspect: - Created a kernel driver that allocates almost all memory as hugepages followed by freeing first 3/4 of each hugepage. - Set proactiveness=40 - Note that proactive_compact_node() is deferred maximum number of times with HPAGE_FRAG_CHECK_INTERVAL_MSEC of wait between each check (=> ~30 seconds between retries). [1] https://patchwork.kernel.org/patch/11098289/ [2] https://lore.kernel.org/linux-mm/20161230131412.GI13301@dhcp22.suse.cz/ [3] https://lwn.net/Articles/817905/ Signed-off-by: Nitin Gupta Signed-off-by: Andrew Morton Tested-by: Oleksandr Natalenko Reviewed-by: Vlastimil Babka Reviewed-by: Khalid Aziz Reviewed-by: Oleksandr Natalenko Cc: Vlastimil Babka Cc: Khalid Aziz Cc: Michal Hocko Cc: Mel Gorman Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Joonsoo Kim Cc: David Rientjes Cc: Nitin Gupta Cc: Oleksandr Natalenko Link: http://lkml.kernel.org/r/20200616204527.19185-1-nigupta@nvidia.com Signed-off-by: Linus Torvalds --- Documentation/admin-guide/sysctl/vm.rst | 15 +++ include/linux/compaction.h | 2 + kernel/sysctl.c | 9 ++ mm/compaction.c | 183 +++++++++++++++++++++++++++++++- mm/internal.h | 1 + mm/vmstat.c | 18 ++++ 6 files changed, 223 insertions(+), 5 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index d997cc3c26d0..4b9d2e8e9142 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -119,6 +119,21 @@ all zones are compacted such that free memory is available in contiguous blocks where possible. This can be important for example in the allocation of huge pages although processes will also directly compact memory as required. +compaction_proactiveness +======================== + +This tunable takes a value in the range [0, 100] with a default value of +20. This tunable determines how aggressively compaction is done in the +background. Setting it to 0 disables proactive compaction. + +Note that compaction has a non-trivial system-wide impact as pages +belonging to different processes are moved around, which could also lead +to latency spikes in unsuspecting applications. The kernel employs +various heuristics to avoid wasting CPU cycles if it detects that +proactive compaction is not being effective. + +Be careful when setting it to extreme values like 100, as that may +cause excessive background compaction activity. compact_unevictable_allowed =========================== diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 6fa0eea3f530..7a242d46454e 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -85,11 +85,13 @@ static inline unsigned long compact_gap(unsigned int order) #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; +extern int sysctl_compaction_proactiveness; extern int sysctl_compaction_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos); extern int sysctl_extfrag_threshold; extern int sysctl_compact_unevictable_allowed; +extern int extfrag_for_order(struct zone *zone, unsigned int order); extern int fragmentation_index(struct zone *zone, unsigned int order); extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f785de3caac0..ab72e52f8a7b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2851,6 +2851,15 @@ static struct ctl_table vm_table[] = { .mode = 0200, .proc_handler = sysctl_compaction_handler, }, + { + .procname = "compaction_proactiveness", + .data = &sysctl_compaction_proactiveness, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred, + }, { .procname = "extfrag_threshold", .data = &sysctl_extfrag_threshold, diff --git a/mm/compaction.c b/mm/compaction.c index 86375605faa9..544a98811c82 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -50,6 +50,24 @@ static inline void count_compact_events(enum vm_event_item item, long delta) #define pageblock_start_pfn(pfn) block_start_pfn(pfn, pageblock_order) #define pageblock_end_pfn(pfn) block_end_pfn(pfn, pageblock_order) +/* + * Fragmentation score check interval for proactive compaction purposes. + */ +static const int HPAGE_FRAG_CHECK_INTERVAL_MSEC = 500; + +/* + * Page order with-respect-to which proactive compaction + * calculates external fragmentation, which is used as + * the "fragmentation score" of a node/zone. + */ +#if defined CONFIG_TRANSPARENT_HUGEPAGE +#define COMPACTION_HPAGE_ORDER HPAGE_PMD_ORDER +#elif defined HUGETLB_PAGE_ORDER +#define COMPACTION_HPAGE_ORDER HUGETLB_PAGE_ORDER +#else +#define COMPACTION_HPAGE_ORDER (PMD_SHIFT - PAGE_SHIFT) +#endif + static unsigned long release_freepages(struct list_head *freelist) { struct page *page, *next; @@ -1857,6 +1875,76 @@ static inline bool is_via_compact_memory(int order) return order == -1; } +static bool kswapd_is_running(pg_data_t *pgdat) +{ + return pgdat->kswapd && (pgdat->kswapd->state == TASK_RUNNING); +} + +/* + * A zone's fragmentation score is the external fragmentation wrt to the + * COMPACTION_HPAGE_ORDER scaled by the zone's size. It returns a value + * in the range [0, 100]. + * + * The scaling factor ensures that proactive compaction focuses on larger + * zones like ZONE_NORMAL, rather than smaller, specialized zones like + * ZONE_DMA32. For smaller zones, the score value remains close to zero, + * and thus never exceeds the high threshold for proactive compaction. + */ +static int fragmentation_score_zone(struct zone *zone) +{ + unsigned long score; + + score = zone->present_pages * + extfrag_for_order(zone, COMPACTION_HPAGE_ORDER); + return div64_ul(score, zone->zone_pgdat->node_present_pages + 1); +} + +/* + * The per-node proactive (background) compaction process is started by its + * corresponding kcompactd thread when the node's fragmentation score + * exceeds the high threshold. The compaction process remains active till + * the node's score falls below the low threshold, or one of the back-off + * conditions is met. + */ +static int fragmentation_score_node(pg_data_t *pgdat) +{ + unsigned long score = 0; + int zoneid; + + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + struct zone *zone; + + zone = &pgdat->node_zones[zoneid]; + score += fragmentation_score_zone(zone); + } + + return score; +} + +static int fragmentation_score_wmark(pg_data_t *pgdat, bool low) +{ + int wmark_low; + + /* + * Cap the low watermak to avoid excessive compaction + * activity in case a user sets the proactivess tunable + * close to 100 (maximum). + */ + wmark_low = max(100 - sysctl_compaction_proactiveness, 5); + return low ? wmark_low : min(wmark_low + 10, 100); +} + +static bool should_proactive_compact_node(pg_data_t *pgdat) +{ + int wmark_high; + + if (!sysctl_compaction_proactiveness || kswapd_is_running(pgdat)) + return false; + + wmark_high = fragmentation_score_wmark(pgdat, false); + return fragmentation_score_node(pgdat) > wmark_high; +} + static enum compact_result __compact_finished(struct compact_control *cc) { unsigned int order; @@ -1883,6 +1971,25 @@ static enum compact_result __compact_finished(struct compact_control *cc) return COMPACT_PARTIAL_SKIPPED; } + if (cc->proactive_compaction) { + int score, wmark_low; + pg_data_t *pgdat; + + pgdat = cc->zone->zone_pgdat; + if (kswapd_is_running(pgdat)) + return COMPACT_PARTIAL_SKIPPED; + + score = fragmentation_score_zone(cc->zone); + wmark_low = fragmentation_score_wmark(pgdat, true); + + if (score > wmark_low) + ret = COMPACT_CONTINUE; + else + ret = COMPACT_SUCCESS; + + goto out; + } + if (is_via_compact_memory(cc->order)) return COMPACT_CONTINUE; @@ -1941,6 +2048,7 @@ static enum compact_result __compact_finished(struct compact_control *cc) } } +out: if (cc->contended || fatal_signal_pending(current)) ret = COMPACT_CONTENDED; @@ -2421,6 +2529,41 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, return rc; } +/* + * Compact all zones within a node till each zone's fragmentation score + * reaches within proactive compaction thresholds (as determined by the + * proactiveness tunable). + * + * It is possible that the function returns before reaching score targets + * due to various back-off conditions, such as, contention on per-node or + * per-zone locks. + */ +static void proactive_compact_node(pg_data_t *pgdat) +{ + int zoneid; + struct zone *zone; + struct compact_control cc = { + .order = -1, + .mode = MIGRATE_SYNC_LIGHT, + .ignore_skip_hint = true, + .whole_zone = true, + .gfp_mask = GFP_KERNEL, + .proactive_compaction = true, + }; + + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + cc.zone = zone; + + compact_zone(&cc, NULL); + + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); + } +} /* Compact all zones within a node */ static void compact_node(int nid) @@ -2467,6 +2610,13 @@ static void compact_nodes(void) /* The written value is actually unused, all memory is compacted */ int sysctl_compact_memory; +/* + * Tunable for proactive compaction. It determines how + * aggressively the kernel should compact memory in the + * background. It takes values in the range [0, 100]. + */ +int __read_mostly sysctl_compaction_proactiveness = 20; + /* * This is the entry point for compacting all nodes via * /proc/sys/vm/compact_memory @@ -2646,6 +2796,7 @@ static int kcompactd(void *p) { pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; + unsigned int proactive_defer = 0; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); @@ -2661,12 +2812,34 @@ static int kcompactd(void *p) unsigned long pflags; trace_mm_compaction_kcompactd_sleep(pgdat->node_id); - wait_event_freezable(pgdat->kcompactd_wait, - kcompactd_work_requested(pgdat)); + if (wait_event_freezable_timeout(pgdat->kcompactd_wait, + kcompactd_work_requested(pgdat), + msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC))) { + + psi_memstall_enter(&pflags); + kcompactd_do_work(pgdat); + psi_memstall_leave(&pflags); + continue; + } - psi_memstall_enter(&pflags); - kcompactd_do_work(pgdat); - psi_memstall_leave(&pflags); + /* kcompactd wait timeout */ + if (should_proactive_compact_node(pgdat)) { + unsigned int prev_score, score; + + if (proactive_defer) { + proactive_defer--; + continue; + } + prev_score = fragmentation_score_node(pgdat); + proactive_compact_node(pgdat); + score = fragmentation_score_node(pgdat); + /* + * Defer proactive compaction if the fragmentation + * score did not go down i.e. no progress made. + */ + proactive_defer = score < prev_score ? + 0 : 1 << COMPACT_MAX_DEFER_SHIFT; + } } return 0; diff --git a/mm/internal.h b/mm/internal.h index 9886db20d94f..42cf0b610847 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -239,6 +239,7 @@ struct compact_control { bool no_set_skip_hint; /* Don't mark blocks for skipping */ bool ignore_block_suitable; /* Scan blocks considered unsuitable */ bool direct_compaction; /* False from kcompactd or /proc/... */ + bool proactive_compaction; /* kcompactd proactive compaction */ bool whole_zone; /* Whole zone should/has been scanned */ bool contended; /* Signal lock or sched contention */ bool rescan; /* Rescanning the same pageblock */ diff --git a/mm/vmstat.c b/mm/vmstat.c index fef03463a0cf..f183aa37994e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1096,6 +1096,24 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total); } +/* + * Calculates external fragmentation within a zone wrt the given order. + * It is defined as the percentage of pages found in blocks of size + * less than 1 << order. It returns values in range [0, 100]. + */ +int extfrag_for_order(struct zone *zone, unsigned int order) +{ + struct contig_page_info info; + + fill_contig_page_info(zone, order, &info); + if (info.free_pages == 0) + return 0; + + return div_u64((info.free_pages - + (info.free_blocks_suitable << order)) * 100, + info.free_pages); +} + /* Same as __fragmentation index but allocs contig_page_info on stack */ int fragmentation_index(struct zone *zone, unsigned int order) { -- cgit v1.2.3 From d34c0a7599ea8c301bc471dfa1eb2bf2db6752d1 Mon Sep 17 00:00:00 2001 From: Nitin Gupta Date: Tue, 11 Aug 2020 18:31:07 -0700 Subject: mm: use unsigned types for fragmentation score Proactive compaction uses per-node/zone "fragmentation score" which is always in range [0, 100], so use unsigned type of these scores as well as for related constants. Signed-off-by: Nitin Gupta Signed-off-by: Andrew Morton Reviewed-by: Baoquan He Cc: Luis Chamberlain Cc: Kees Cook Cc: Iurii Zaikin Cc: Vlastimil Babka Cc: Joonsoo Kim Link: http://lkml.kernel.org/r/20200618010319.13159-1-nigupta@nvidia.com Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 4 ++-- kernel/sysctl.c | 2 +- mm/compaction.c | 18 +++++++++--------- mm/vmstat.c | 2 +- 4 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel/sysctl.c') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 7a242d46454e..25a521d299c1 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -85,13 +85,13 @@ static inline unsigned long compact_gap(unsigned int order) #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; -extern int sysctl_compaction_proactiveness; +extern unsigned int sysctl_compaction_proactiveness; extern int sysctl_compaction_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos); extern int sysctl_extfrag_threshold; extern int sysctl_compact_unevictable_allowed; -extern int extfrag_for_order(struct zone *zone, unsigned int order); +extern unsigned int extfrag_for_order(struct zone *zone, unsigned int order); extern int fragmentation_index(struct zone *zone, unsigned int order); extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ab72e52f8a7b..287862f91717 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2854,7 +2854,7 @@ static struct ctl_table vm_table[] = { { .procname = "compaction_proactiveness", .data = &sysctl_compaction_proactiveness, - .maxlen = sizeof(int), + .maxlen = sizeof(sysctl_compaction_proactiveness), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, diff --git a/mm/compaction.c b/mm/compaction.c index ed8ea1511634..b7d433f1706a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -53,7 +53,7 @@ static inline void count_compact_events(enum vm_event_item item, long delta) /* * Fragmentation score check interval for proactive compaction purposes. */ -static const int HPAGE_FRAG_CHECK_INTERVAL_MSEC = 500; +static const unsigned int HPAGE_FRAG_CHECK_INTERVAL_MSEC = 500; /* * Page order with-respect-to which proactive compaction @@ -1890,7 +1890,7 @@ static bool kswapd_is_running(pg_data_t *pgdat) * ZONE_DMA32. For smaller zones, the score value remains close to zero, * and thus never exceeds the high threshold for proactive compaction. */ -static int fragmentation_score_zone(struct zone *zone) +static unsigned int fragmentation_score_zone(struct zone *zone) { unsigned long score; @@ -1906,9 +1906,9 @@ static int fragmentation_score_zone(struct zone *zone) * the node's score falls below the low threshold, or one of the back-off * conditions is met. */ -static int fragmentation_score_node(pg_data_t *pgdat) +static unsigned int fragmentation_score_node(pg_data_t *pgdat) { - unsigned long score = 0; + unsigned int score = 0; int zoneid; for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { @@ -1921,17 +1921,17 @@ static int fragmentation_score_node(pg_data_t *pgdat) return score; } -static int fragmentation_score_wmark(pg_data_t *pgdat, bool low) +static unsigned int fragmentation_score_wmark(pg_data_t *pgdat, bool low) { - int wmark_low; + unsigned int wmark_low; /* * Cap the low watermak to avoid excessive compaction * activity in case a user sets the proactivess tunable * close to 100 (maximum). */ - wmark_low = max(100 - sysctl_compaction_proactiveness, 5); - return low ? wmark_low : min(wmark_low + 10, 100); + wmark_low = max(100U - sysctl_compaction_proactiveness, 5U); + return low ? wmark_low : min(wmark_low + 10, 100U); } static bool should_proactive_compact_node(pg_data_t *pgdat) @@ -2615,7 +2615,7 @@ int sysctl_compact_memory; * aggressively the kernel should compact memory in the * background. It takes values in the range [0, 100]. */ -int __read_mostly sysctl_compaction_proactiveness = 20; +unsigned int __read_mostly sysctl_compaction_proactiveness = 20; /* * This is the entry point for compacting all nodes via diff --git a/mm/vmstat.c b/mm/vmstat.c index f183aa37994e..3bb034c99887 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1101,7 +1101,7 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in * It is defined as the percentage of pages found in blocks of size * less than 1 << order. It returns values in range [0, 100]. */ -int extfrag_for_order(struct zone *zone, unsigned int order) +unsigned int extfrag_for_order(struct zone *zone, unsigned int order) { struct contig_page_info info; -- cgit v1.2.3