Diffstat (limited to 'kernel')
-rw-r--r--  kernel/acct.c | 136
-rw-r--r--  kernel/audit.c | 33
-rw-r--r--  kernel/auditfilter.c | 15
-rw-r--r--  kernel/auditsc.c | 27
-rw-r--r--  kernel/bpf/arena.c | 20
-rw-r--r--  kernel/bpf/arraymap.c | 6
-rw-r--r--  kernel/bpf/bpf_cgrp_storage.c | 17
-rw-r--r--  kernel/bpf/bpf_inode_storage.c | 9
-rw-r--r--  kernel/bpf/bpf_local_storage.c | 38
-rw-r--r--  kernel/bpf/bpf_struct_ops.c | 21
-rw-r--r--  kernel/bpf/bpf_task_storage.c | 15
-rw-r--r--  kernel/bpf/btf.c | 33
-rw-r--r--  kernel/bpf/cpumap.c | 2
-rw-r--r--  kernel/bpf/cpumask.c | 2
-rw-r--r--  kernel/bpf/devmap.c | 8
-rw-r--r--  kernel/bpf/hashtab.c | 79
-rw-r--r--  kernel/bpf/helpers.c | 43
-rw-r--r--  kernel/bpf/log.c | 21
-rw-r--r--  kernel/bpf/lpm_trie.c | 20
-rw-r--r--  kernel/bpf/range_tree.c | 2
-rw-r--r--  kernel/bpf/ringbuf.c | 4
-rw-r--r--  kernel/bpf/syscall.c | 57
-rw-r--r--  kernel/bpf/sysfs_btf.c | 12
-rw-r--r--  kernel/bpf/verifier.c | 1193
-rw-r--r--  kernel/capability.c | 8
-rw-r--r--  kernel/cgroup/Makefile | 1
-rw-r--r--  kernel/cgroup/cgroup.c | 20
-rw-r--r--  kernel/cgroup/cpuset.c | 44
-rw-r--r--  kernel/cgroup/dmem.c | 829
-rw-r--r--  kernel/cgroup/rstat.c | 1
-rw-r--r--  kernel/cpu.c | 16
-rw-r--r--  kernel/cred.c | 50
-rw-r--r--  kernel/debug/kdb/kdb_support.c | 24
-rw-r--r--  kernel/delayacct.c | 57
-rw-r--r--  kernel/events/core.c | 156
-rw-r--r--  kernel/events/ring_buffer.c | 19
-rw-r--r--  kernel/events/uprobes.c | 233
-rw-r--r--  kernel/exit.c | 2
-rw-r--r--  kernel/fork.c | 56
-rw-r--r--  kernel/futex/core.c | 5
-rw-r--r--  kernel/futex/futex.h | 11
-rw-r--r--  kernel/futex/pi.c | 7
-rw-r--r--  kernel/futex/waitwake.c | 7
-rw-r--r--  kernel/gcov/clang.c | 6
-rwxr-xr-x  kernel/gen_kheaders.sh | 43
-rw-r--r--  kernel/hung_task.c | 4
-rw-r--r--  kernel/irq/Kconfig | 6
-rw-r--r--  kernel/irq/Makefile | 2
-rw-r--r--  kernel/irq/chip.c | 4
-rw-r--r--  kernel/irq/debugfs.c | 2
-rw-r--r--  kernel/irq/generic-chip.c | 1
-rw-r--r--  kernel/irq/internals.h | 10
-rw-r--r--  kernel/irq/irqdesc.c | 2
-rw-r--r--  kernel/irq/kexec.c | 36
-rw-r--r--  kernel/irq/manage.c | 41
-rw-r--r--  kernel/irq/resend.c | 2
-rw-r--r--  kernel/irq/settings.h | 6
-rw-r--r--  kernel/irq/timings.c | 1
-rw-r--r--  kernel/irq_work.c | 2
-rw-r--r--  kernel/kallsyms_selftest.c | 4
-rw-r--r--  kernel/kcov.c | 2
-rw-r--r--  kernel/kexec_core.c | 25
-rw-r--r--  kernel/kheaders.c | 19
-rw-r--r--  kernel/kprobes.c | 594
-rw-r--r--  kernel/ksysfs.c | 21
-rw-r--r--  kernel/kthread.c | 208
-rw-r--r--  kernel/latencytop.c | 8
-rw-r--r--  kernel/livepatch/core.c | 24
-rw-r--r--  kernel/locking/lockdep.c | 6
-rw-r--r--  kernel/locking/lockdep_internals.h | 3
-rw-r--r--  kernel/locking/locktorture.c | 6
-rw-r--r--  kernel/locking/mutex.c | 16
-rw-r--r--  kernel/locking/rtmutex.c | 26
-rw-r--r--  kernel/locking/rtmutex_api.c | 2
-rw-r--r--  kernel/locking/test-ww_mutex.c | 9
-rw-r--r--  kernel/module/Kconfig | 56
-rw-r--r--  kernel/module/internal.h | 28
-rw-r--r--  kernel/module/main.c | 168
-rw-r--r--  kernel/module/strict_rwx.c | 13
-rw-r--r--  kernel/module/sysfs.c | 122
-rw-r--r--  kernel/module/version.c | 47
-rw-r--r--  kernel/padata.c | 45
-rw-r--r--  kernel/panic.c | 2
-rw-r--r--  kernel/params.c | 22
-rw-r--r--  kernel/pid.c | 139
-rw-r--r--  kernel/pid_namespace.c | 45
-rw-r--r--  kernel/pid_sysctl.h | 2
-rw-r--r--  kernel/power/Kconfig | 21
-rw-r--r--  kernel/power/autosleep.c | 1
-rw-r--r--  kernel/power/energy_model.c | 17
-rw-r--r--  kernel/power/hibernate.c | 7
-rw-r--r--  kernel/power/power.h | 2
-rw-r--r--  kernel/power/snapshot.c | 5
-rw-r--r--  kernel/printk/internal.h | 6
-rw-r--r--  kernel/printk/printk.c | 7
-rw-r--r--  kernel/printk/printk_safe.c | 16
-rw-r--r--  kernel/printk/sysctl.c | 2
-rw-r--r--  kernel/rcu/Kconfig.debug | 31
-rw-r--r--  kernel/rcu/rcutorture.c | 237
-rw-r--r--  kernel/rcu/refscale.c | 40
-rw-r--r--  kernel/rcu/srcutree.c | 4
-rw-r--r--  kernel/rcu/tiny.c | 2
-rw-r--r--  kernel/rcu/tree.c | 975
-rw-r--r--  kernel/rcu/tree_exp.h | 72
-rw-r--r--  kernel/rcu/tree_plugin.h | 12
-rw-r--r--  kernel/rcu/update.c | 4
-rw-r--r--  kernel/reboot.c | 2
-rw-r--r--  kernel/resource.c | 3
-rw-r--r--  kernel/rseq.c | 103
-rw-r--r--  kernel/sched/autogroup.c | 6
-rw-r--r--  kernel/sched/core.c | 144
-rw-r--r--  kernel/sched/cpufreq_schedutil.c | 43
-rw-r--r--  kernel/sched/cputime.c | 16
-rw-r--r--  kernel/sched/deadline.c | 121
-rw-r--r--  kernel/sched/debug.c | 27
-rw-r--r--  kernel/sched/ext.c | 535
-rw-r--r--  kernel/sched/ext.h | 12
-rw-r--r--  kernel/sched/fair.c | 616
-rw-r--r--  kernel/sched/features.h | 9
-rw-r--r--  kernel/sched/idle.c | 5
-rw-r--r--  kernel/sched/isolation.c | 22
-rw-r--r--  kernel/sched/pelt.c | 4
-rw-r--r--  kernel/sched/psi.c | 7
-rw-r--r--  kernel/sched/rt.c | 2
-rw-r--r--  kernel/sched/sched.h | 115
-rw-r--r--  kernel/sched/stats.c | 11
-rw-r--r--  kernel/sched/stats.h | 13
-rw-r--r--  kernel/sched/syscalls.c | 25
-rw-r--r--  kernel/sched/topology.c | 14
-rw-r--r--  kernel/seccomp.c | 14
-rw-r--r--  kernel/signal.c | 39
-rw-r--r--  kernel/smp.c | 9
-rw-r--r--  kernel/stackleak.c | 5
-rw-r--r--  kernel/static_call_inline.c | 2
-rw-r--r--  kernel/stop_machine.c | 2
-rw-r--r--  kernel/sys.c | 3
-rw-r--r--  kernel/sysctl-test.c | 6
-rw-r--r--  kernel/sysctl.c | 13
-rw-r--r--  kernel/task_work.c | 14
-rw-r--r--  kernel/time/clocksource-wdtest.c | 3
-rw-r--r--  kernel/time/clocksource.c | 9
-rw-r--r--  kernel/time/hrtimer.c | 143
-rw-r--r--  kernel/time/posix-timers.c | 2
-rw-r--r--  kernel/time/tick-broadcast.c | 2
-rw-r--r--  kernel/time/timekeeping.c | 77
-rw-r--r--  kernel/time/timer.c | 18
-rw-r--r--  kernel/time/timer_migration.c | 78
-rw-r--r--  kernel/time/timer_migration.h | 21
-rw-r--r--  kernel/trace/Kconfig | 22
-rw-r--r--  kernel/trace/blktrace.c | 36
-rw-r--r--  kernel/trace/bpf_trace.c | 97
-rw-r--r--  kernel/trace/fgraph.c | 76
-rw-r--r--  kernel/trace/fprobe.c | 662
-rw-r--r--  kernel/trace/ftrace.c | 266
-rw-r--r--  kernel/trace/pid_list.c | 2
-rw-r--r--  kernel/trace/ring_buffer.c | 106
-rw-r--r--  kernel/trace/rv/Kconfig | 27
-rw-r--r--  kernel/trace/rv/Makefile | 3
-rw-r--r--  kernel/trace/rv/monitors/wip/Kconfig | 12
-rw-r--r--  kernel/trace/rv/monitors/wip/wip.c | 2
-rw-r--r--  kernel/trace/rv/monitors/wip/wip_trace.h | 15
-rw-r--r--  kernel/trace/rv/monitors/wwnr/Kconfig | 11
-rw-r--r--  kernel/trace/rv/monitors/wwnr/wwnr.c | 2
-rw-r--r--  kernel/trace/rv/monitors/wwnr/wwnr_trace.h | 16
-rw-r--r--  kernel/trace/rv/rv.c | 2
-rw-r--r--  kernel/trace/rv/rv_trace.h | 130
-rw-r--r--  kernel/trace/trace.c | 585
-rw-r--r--  kernel/trace/trace.h | 32
-rw-r--r--  kernel/trace/trace_dynevent.c | 23
-rw-r--r--  kernel/trace/trace_entries.h | 8
-rw-r--r--  kernel/trace/trace_eprobe.c | 36
-rw-r--r--  kernel/trace/trace_events.c | 709
-rw-r--r--  kernel/trace/trace_events_filter.c | 23
-rw-r--r--  kernel/trace/trace_events_hist.c | 119
-rw-r--r--  kernel/trace/trace_events_synth.c | 17
-rw-r--r--  kernel/trace/trace_events_trigger.c | 67
-rw-r--r--  kernel/trace/trace_events_user.c | 2
-rw-r--r--  kernel/trace/trace_fprobe.c | 270
-rw-r--r--  kernel/trace/trace_functions.c | 9
-rw-r--r--  kernel/trace/trace_functions_graph.c | 82
-rw-r--r--  kernel/trace/trace_irqsoff.c | 23
-rw-r--r--  kernel/trace/trace_kprobe.c | 151
-rw-r--r--  kernel/trace/trace_osnoise.c | 57
-rw-r--r--  kernel/trace/trace_output.c | 6
-rw-r--r--  kernel/trace/trace_probe.c | 51
-rw-r--r--  kernel/trace/trace_probe_tmpl.h | 2
-rw-r--r--  kernel/trace/trace_sched_switch.c | 2
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 24
-rw-r--r--  kernel/trace/trace_selftest.c | 11
-rw-r--r--  kernel/trace/trace_stack.c | 6
-rw-r--r--  kernel/trace/trace_stat.c | 26
-rw-r--r--  kernel/trace/trace_uprobe.c | 15
-rw-r--r--  kernel/ucount.c | 8
-rw-r--r--  kernel/umh.c | 2
-rw-r--r--  kernel/utsname_sysctl.c | 4
-rw-r--r--  kernel/watch_queue.c | 4
-rw-r--r--  kernel/watchdog.c | 4
-rw-r--r--  kernel/workqueue.c | 46
198 files changed, 7720 insertions, 5156 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index 179848ad33e97..6520baa136693 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -76,7 +76,7 @@ static int acct_parm[3] = {4, 2, 30};
#define ACCT_TIMEOUT (acct_parm[2]) /* foo second timeout between checks */
#ifdef CONFIG_SYSCTL
-static struct ctl_table kern_acct_table[] = {
+static const struct ctl_table kern_acct_table[] = {
{
.procname = "acct",
.data = &acct_parm,
@@ -103,48 +103,50 @@ struct bsd_acct_struct {
atomic_long_t count;
struct rcu_head rcu;
struct mutex lock;
- int active;
+ bool active;
+ bool check_space;
unsigned long needcheck;
struct file *file;
struct pid_namespace *ns;
struct work_struct work;
struct completion done;
+ acct_t ac;
};
-static void do_acct_process(struct bsd_acct_struct *acct);
+static void fill_ac(struct bsd_acct_struct *acct);
+static void acct_write_process(struct bsd_acct_struct *acct);
/*
* Check the amount of free space and suspend/resume accordingly.
*/
-static int check_free_space(struct bsd_acct_struct *acct)
+static bool check_free_space(struct bsd_acct_struct *acct)
{
struct kstatfs sbuf;
- if (time_is_after_jiffies(acct->needcheck))
- goto out;
+ if (!acct->check_space)
+ return acct->active;
/* May block */
if (vfs_statfs(&acct->file->f_path, &sbuf))
- goto out;
+ return acct->active;
if (acct->active) {
u64 suspend = sbuf.f_blocks * SUSPEND;
do_div(suspend, 100);
if (sbuf.f_bavail <= suspend) {
- acct->active = 0;
+ acct->active = false;
pr_info("Process accounting paused\n");
}
} else {
u64 resume = sbuf.f_blocks * RESUME;
do_div(resume, 100);
if (sbuf.f_bavail >= resume) {
- acct->active = 1;
+ acct->active = true;
pr_info("Process accounting resumed\n");
}
}
acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
-out:
return acct->active;
}
@@ -189,7 +191,11 @@ static void acct_pin_kill(struct fs_pin *pin)
{
struct bsd_acct_struct *acct = to_acct(pin);
mutex_lock(&acct->lock);
- do_acct_process(acct);
+ /*
+ * Fill the accounting struct with the exiting task's info
+ * before punting to the workqueue.
+ */
+ fill_ac(acct);
schedule_work(&acct->work);
wait_for_completion(&acct->done);
cmpxchg(&acct->ns->bacct, pin, NULL);
@@ -202,6 +208,9 @@ static void close_work(struct work_struct *work)
{
struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
struct file *file = acct->file;
+
+ /* We were fired by acct_pin_kill() which holds acct->lock. */
+ acct_write_process(acct);
if (file->f_op->flush)
file->f_op->flush(file, NULL);
__fput_sync(file);
@@ -234,6 +243,20 @@ static int acct_on(struct filename *pathname)
return -EACCES;
}
+ /* Exclude kernel internal filesystems. */
+ if (file_inode(file)->i_sb->s_flags & (SB_NOUSER | SB_KERNMOUNT)) {
+ kfree(acct);
+ filp_close(file, NULL);
+ return -EINVAL;
+ }
+
+ /* Exclude procfs and sysfs. */
+ if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE) {
+ kfree(acct);
+ filp_close(file, NULL);
+ return -EINVAL;
+ }
+
if (!(file->f_mode & FMODE_CAN_WRITE)) {
kfree(acct);
filp_close(file, NULL);
@@ -430,13 +453,27 @@ static u32 encode_float(u64 value)
* do_exit() or when switching to a different output file.
*/
-static void fill_ac(acct_t *ac)
+static void fill_ac(struct bsd_acct_struct *acct)
{
struct pacct_struct *pacct = &current->signal->pacct;
+ struct file *file = acct->file;
+ acct_t *ac = &acct->ac;
u64 elapsed, run_time;
time64_t btime;
struct tty_struct *tty;
+ lockdep_assert_held(&acct->lock);
+
+ if (time_is_after_jiffies(acct->needcheck)) {
+ acct->check_space = false;
+
+ /* Don't fill in @ac if nothing will be written. */
+ if (!acct->active)
+ return;
+ } else {
+ acct->check_space = true;
+ }
+
/*
* Fill the accounting struct with the needed info as recorded
* by the different kernel functions.
@@ -484,64 +521,61 @@ static void fill_ac(acct_t *ac)
ac->ac_majflt = encode_comp_t(pacct->ac_majflt);
ac->ac_exitcode = pacct->ac_exitcode;
spin_unlock_irq(&current->sighand->siglock);
-}
-/*
- * do_acct_process does all actual work. Caller holds the reference to file.
- */
-static void do_acct_process(struct bsd_acct_struct *acct)
-{
- acct_t ac;
- unsigned long flim;
- const struct cred *orig_cred;
- struct file *file = acct->file;
- /*
- * Accounting records are not subject to resource limits.
- */
- flim = rlimit(RLIMIT_FSIZE);
- current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
- /* Perform file operations on behalf of whoever enabled accounting */
- orig_cred = override_creds(file->f_cred);
-
- /*
- * First check to see if there is enough free_space to continue
- * the process accounting system.
- */
- if (!check_free_space(acct))
- goto out;
-
- fill_ac(&ac);
/* we really need to bite the bullet and change layout */
- ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
- ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
+ ac->ac_uid = from_kuid_munged(file->f_cred->user_ns, current_uid());
+ ac->ac_gid = from_kgid_munged(file->f_cred->user_ns, current_gid());
#if ACCT_VERSION == 1 || ACCT_VERSION == 2
/* backward-compatible 16 bit fields */
- ac.ac_uid16 = ac.ac_uid;
- ac.ac_gid16 = ac.ac_gid;
+ ac->ac_uid16 = ac->ac_uid;
+ ac->ac_gid16 = ac->ac_gid;
#elif ACCT_VERSION == 3
{
struct pid_namespace *ns = acct->ns;
- ac.ac_pid = task_tgid_nr_ns(current, ns);
+ ac->ac_pid = task_tgid_nr_ns(current, ns);
rcu_read_lock();
- ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent),
- ns);
+ ac->ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
rcu_read_unlock();
}
#endif
+}
+
+static void acct_write_process(struct bsd_acct_struct *acct)
+{
+ struct file *file = acct->file;
+ const struct cred *cred;
+ acct_t *ac = &acct->ac;
+
+ /* Perform file operations on behalf of whoever enabled accounting */
+ cred = override_creds(file->f_cred);
+
/*
- * Get freeze protection. If the fs is frozen, just skip the write
- * as we could deadlock the system otherwise.
+ * First check to see if there is enough free_space to continue
+ * the process accounting system. Then get freeze protection. If
+ * the fs is frozen, just skip the write as we could deadlock
+ * the system otherwise.
*/
- if (file_start_write_trylock(file)) {
+ if (check_free_space(acct) && file_start_write_trylock(file)) {
/* it's been opened O_APPEND, so position is irrelevant */
loff_t pos = 0;
- __kernel_write(file, &ac, sizeof(acct_t), &pos);
+ __kernel_write(file, ac, sizeof(acct_t), &pos);
file_end_write(file);
}
-out:
+
+ revert_creds(cred);
+}
+
+static void do_acct_process(struct bsd_acct_struct *acct)
+{
+ unsigned long flim;
+
+ /* Accounting records are not subject to resource limits. */
+ flim = rlimit(RLIMIT_FSIZE);
+ current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
+ fill_ac(acct);
+ acct_write_process(acct);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
- revert_creds(orig_cred);
}
/**
diff --git a/kernel/audit.c b/kernel/audit.c
index 6a95a6077953c..5f5bf85bcc905 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1221,8 +1221,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
struct audit_buffer *ab;
u16 msg_type = nlh->nlmsg_type;
struct audit_sig_info *sig_data;
- char *ctx = NULL;
- u32 len;
+ struct lsm_context lsmctx = { NULL, 0, 0 };
err = audit_netlink_ok(skb, msg_type);
if (err)
@@ -1472,27 +1471,28 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
break;
}
case AUDIT_SIGNAL_INFO:
- len = 0;
if (lsmprop_is_set(&audit_sig_lsm)) {
- err = security_lsmprop_to_secctx(&audit_sig_lsm, &ctx,
- &len);
- if (err)
+ err = security_lsmprop_to_secctx(&audit_sig_lsm,
+ &lsmctx);
+ if (err < 0)
return err;
}
- sig_data = kmalloc(struct_size(sig_data, ctx, len), GFP_KERNEL);
+ sig_data = kmalloc(struct_size(sig_data, ctx, lsmctx.len),
+ GFP_KERNEL);
if (!sig_data) {
if (lsmprop_is_set(&audit_sig_lsm))
- security_release_secctx(ctx, len);
+ security_release_secctx(&lsmctx);
return -ENOMEM;
}
sig_data->uid = from_kuid(&init_user_ns, audit_sig_uid);
sig_data->pid = audit_sig_pid;
if (lsmprop_is_set(&audit_sig_lsm)) {
- memcpy(sig_data->ctx, ctx, len);
- security_release_secctx(ctx, len);
+ memcpy(sig_data->ctx, lsmctx.context, lsmctx.len);
+ security_release_secctx(&lsmctx);
}
audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0,
- sig_data, struct_size(sig_data, ctx, len));
+ sig_data, struct_size(sig_data, ctx,
+ lsmctx.len));
kfree(sig_data);
break;
case AUDIT_TTY_GET: {
@@ -2180,23 +2180,22 @@ void audit_log_key(struct audit_buffer *ab, char *key)
int audit_log_task_context(struct audit_buffer *ab)
{
struct lsm_prop prop;
- char *ctx = NULL;
- unsigned len;
+ struct lsm_context ctx;
int error;
security_current_getlsmprop_subj(&prop);
if (!lsmprop_is_set(&prop))
return 0;
- error = security_lsmprop_to_secctx(&prop, &ctx, &len);
- if (error) {
+ error = security_lsmprop_to_secctx(&prop, &ctx);
+ if (error < 0) {
if (error != -EINVAL)
goto error_path;
return 0;
}
- audit_log_format(ab, " subj=%s", ctx);
- security_release_secctx(ctx, len);
+ audit_log_format(ab, " subj=%s", ctx.context);
+ security_release_secctx(&ctx);
return 0;
error_path:
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index bceb9f58a09ee..e3f42018ed46f 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1319,13 +1319,20 @@ int audit_compare_dname_path(const struct qstr *dname, const char *path, int par
if (pathlen < dlen)
return 1;
- parentlen = parentlen == AUDIT_NAME_FULL ? parent_len(path) : parentlen;
- if (pathlen - parentlen != dlen)
- return 1;
+ if (parentlen == AUDIT_NAME_FULL)
+ parentlen = parent_len(path);
p = path + parentlen;
- return strncmp(p, dname->name, dlen);
+ /* handle trailing slashes */
+ pathlen -= parentlen;
+ while (p[pathlen - 1] == '/')
+ pathlen--;
+
+ if (pathlen != dlen)
+ return 1;
+
+ return memcmp(p, dname->name, dlen);
}
int audit_filter(int msgtype, unsigned int listtype)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 561d96affe9f5..9c853cde9abe4 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1098,8 +1098,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
char *comm)
{
struct audit_buffer *ab;
- char *ctx = NULL;
- u32 len;
+ struct lsm_context ctx;
int rc = 0;
ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID);
@@ -1110,12 +1109,12 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
from_kuid(&init_user_ns, auid),
from_kuid(&init_user_ns, uid), sessionid);
if (lsmprop_is_set(prop)) {
- if (security_lsmprop_to_secctx(prop, &ctx, &len)) {
+ if (security_lsmprop_to_secctx(prop, &ctx) < 0) {
audit_log_format(ab, " obj=(none)");
rc = 1;
} else {
- audit_log_format(ab, " obj=%s", ctx);
- security_release_secctx(ctx, len);
+ audit_log_format(ab, " obj=%s", ctx.context);
+ security_release_secctx(&ctx);
}
}
audit_log_format(ab, " ocomm=");
@@ -1393,15 +1392,14 @@ static void show_special(struct audit_context *context, int *call_panic)
from_kgid(&init_user_ns, context->ipc.gid),
context->ipc.mode);
if (lsmprop_is_set(&context->ipc.oprop)) {
- char *ctx = NULL;
- u32 len;
+ struct lsm_context lsmctx;
if (security_lsmprop_to_secctx(&context->ipc.oprop,
- &ctx, &len)) {
+ &lsmctx) < 0) {
*call_panic = 1;
} else {
- audit_log_format(ab, " obj=%s", ctx);
- security_release_secctx(ctx, len);
+ audit_log_format(ab, " obj=%s", lsmctx.context);
+ security_release_secctx(&lsmctx);
}
}
if (context->ipc.has_perm) {
@@ -1560,15 +1558,14 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,
MAJOR(n->rdev),
MINOR(n->rdev));
if (lsmprop_is_set(&n->oprop)) {
- char *ctx = NULL;
- u32 len;
+ struct lsm_context ctx;
- if (security_lsmprop_to_secctx(&n->oprop, &ctx, &len)) {
+ if (security_lsmprop_to_secctx(&n->oprop, &ctx) < 0) {
if (call_panic)
*call_panic = 2;
} else {
- audit_log_format(ab, " obj=%s", ctx);
- security_release_secctx(ctx, len);
+ audit_log_format(ab, " obj=%s", ctx.context);
+ security_release_secctx(&ctx);
}
}
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 945a5680f6a54..095a9554e1def 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -39,7 +39,7 @@
*/
/* number of bytes addressable by LDX/STX insn with 16-bit 'off' field */
-#define GUARD_SZ (1ull << sizeof_field(struct bpf_insn, off) * 8)
+#define GUARD_SZ round_up(1ull << sizeof_field(struct bpf_insn, off) * 8, PAGE_SIZE << 1)
#define KERN_VM_SZ (SZ_4G + GUARD_SZ)
struct bpf_arena {
@@ -138,7 +138,11 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
INIT_LIST_HEAD(&arena->vma_list);
bpf_map_init_from_attr(&arena->map, attr);
range_tree_init(&arena->rt);
- range_tree_set(&arena->rt, 0, attr->max_entries);
+ err = range_tree_set(&arena->rt, 0, attr->max_entries);
+ if (err) {
+ bpf_map_area_free(arena);
+ goto err;
+ }
mutex_init(&arena->lock);
return &arena->map;
@@ -218,7 +222,7 @@ static u64 arena_map_mem_usage(const struct bpf_map *map)
struct vma_list {
struct vm_area_struct *vma;
struct list_head head;
- atomic_t mmap_count;
+ refcount_t mmap_count;
};
static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma)
@@ -228,7 +232,7 @@ static int remember_vma(struct bpf_arena *arena, struct vm_area_struct *vma)
vml = kmalloc(sizeof(*vml), GFP_KERNEL);
if (!vml)
return -ENOMEM;
- atomic_set(&vml->mmap_count, 1);
+ refcount_set(&vml->mmap_count, 1);
vma->vm_private_data = vml;
vml->vma = vma;
list_add(&vml->head, &arena->vma_list);
@@ -239,7 +243,7 @@ static void arena_vm_open(struct vm_area_struct *vma)
{
struct vma_list *vml = vma->vm_private_data;
- atomic_inc(&vml->mmap_count);
+ refcount_inc(&vml->mmap_count);
}
static void arena_vm_close(struct vm_area_struct *vma)
@@ -248,7 +252,7 @@ static void arena_vm_close(struct vm_area_struct *vma)
struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
struct vma_list *vml = vma->vm_private_data;
- if (!atomic_dec_and_test(&vml->mmap_count))
+ if (!refcount_dec_and_test(&vml->mmap_count))
return;
guard(mutex)(&arena->lock);
/* update link list under lock */
@@ -257,8 +261,6 @@ static void arena_vm_close(struct vm_area_struct *vma)
kfree(vml);
}
-#define MT_ENTRY ((void *)&arena_map_ops) /* unused. has to be valid pointer */
-
static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
{
struct bpf_map *map = vmf->vma->vm_file->private_data;
@@ -443,7 +445,7 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
return 0;
}
- /* zeroing is needed, since alloc_pages_bulk_array() only fills in non-zero entries */
+ /* zeroing is needed, since alloc_pages_bulk() only fills in non-zero entries */
pages = kvcalloc(page_cnt, sizeof(struct page *), GFP_KERNEL);
if (!pages)
return 0;
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 6cdbb4c33d31d..eb28c0f219ee4 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -735,13 +735,13 @@ static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback
u64 ret = 0;
void *val;
+ cant_migrate();
+
if (flags != 0)
return -EINVAL;
is_percpu = map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
array = container_of(map, struct bpf_array, map);
- if (is_percpu)
- migrate_disable();
for (i = 0; i < map->max_entries; i++) {
if (is_percpu)
val = this_cpu_ptr(array->pptrs[i]);
@@ -756,8 +756,6 @@ static long bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback
break;
}
- if (is_percpu)
- migrate_enable();
return num_elems;
}
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index 20f05de92e9c3..54ff2a85d4c02 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -15,22 +15,20 @@ static DEFINE_PER_CPU(int, bpf_cgrp_storage_busy);
static void bpf_cgrp_storage_lock(void)
{
- migrate_disable();
+ cant_migrate();
this_cpu_inc(bpf_cgrp_storage_busy);
}
static void bpf_cgrp_storage_unlock(void)
{
this_cpu_dec(bpf_cgrp_storage_busy);
- migrate_enable();
}
static bool bpf_cgrp_storage_trylock(void)
{
- migrate_disable();
+ cant_migrate();
if (unlikely(this_cpu_inc_return(bpf_cgrp_storage_busy) != 1)) {
this_cpu_dec(bpf_cgrp_storage_busy);
- migrate_enable();
return false;
}
return true;
@@ -47,17 +45,18 @@ void bpf_cgrp_storage_free(struct cgroup *cgroup)
{
struct bpf_local_storage *local_storage;
+ migrate_disable();
rcu_read_lock();
local_storage = rcu_dereference(cgroup->bpf_cgrp_storage);
- if (!local_storage) {
- rcu_read_unlock();
- return;
- }
+ if (!local_storage)
+ goto out;
bpf_cgrp_storage_lock();
bpf_local_storage_destroy(local_storage);
bpf_cgrp_storage_unlock();
+out:
rcu_read_unlock();
+ migrate_enable();
}
static struct bpf_local_storage_data *
@@ -154,7 +153,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
static void cgroup_storage_map_free(struct bpf_map *map)
{
- bpf_local_storage_map_free(map, &cgroup_cache, NULL);
+ bpf_local_storage_map_free(map, &cgroup_cache, &bpf_cgrp_storage_busy);
}
/* *gfp_flags* is a hidden argument provided by the verifier */
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index a51c82dee1bd8..15a3eb9b02d94 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -62,16 +62,17 @@ void bpf_inode_storage_free(struct inode *inode)
if (!bsb)
return;
+ migrate_disable();
rcu_read_lock();
local_storage = rcu_dereference(bsb->storage);
- if (!local_storage) {
- rcu_read_unlock();
- return;
- }
+ if (!local_storage)
+ goto out;
bpf_local_storage_destroy(local_storage);
+out:
rcu_read_unlock();
+ migrate_enable();
}
static void *bpf_fd_inode_storage_lookup_elem(struct bpf_map *map, void *key)
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 7e6a0af0afc16..fa56c30833ff1 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -81,9 +81,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
return NULL;
if (smap->bpf_ma) {
- migrate_disable();
selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags);
- migrate_enable();
if (selem)
/* Keep the original bpf_map_kzalloc behavior
* before started using the bpf_mem_cache_alloc.
@@ -174,17 +172,14 @@ static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
return;
}
- if (smap) {
- migrate_disable();
+ if (smap)
bpf_mem_cache_free(&smap->storage_ma, local_storage);
- migrate_enable();
- } else {
+ else
/* smap could be NULL if the selem that triggered
* this 'local_storage' creation had been long gone.
* In this case, directly do call_rcu().
*/
call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
- }
}
/* rcu tasks trace callback for bpf_ma == false */
@@ -217,7 +212,10 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
/* The bpf_local_storage_map_free will wait for rcu_barrier */
smap = rcu_dereference_check(SDATA(selem)->smap, 1);
+
+ migrate_disable();
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+ migrate_enable();
bpf_mem_cache_raw_free(selem);
}
@@ -256,9 +254,7 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
* bpf_mem_cache_free will be able to reuse selem
* immediately.
*/
- migrate_disable();
bpf_mem_cache_free(&smap->selem_ma, selem);
- migrate_enable();
return;
}
@@ -497,15 +493,11 @@ int bpf_local_storage_alloc(void *owner,
if (err)
return err;
- if (smap->bpf_ma) {
- migrate_disable();
+ if (smap->bpf_ma)
storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, gfp_flags);
- migrate_enable();
- } else {
+ else
storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
gfp_flags | __GFP_NOWARN);
- }
-
if (!storage) {
err = -ENOMEM;
goto uncharge;
@@ -841,8 +833,12 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
smap->elem_size = offsetof(struct bpf_local_storage_elem,
sdata.data[attr->value_size]);
- smap->bpf_ma = bpf_ma;
- if (bpf_ma) {
+ /* In PREEMPT_RT, kmalloc(GFP_ATOMIC) is still not safe in non
+ * preemptible context. Thus, enforce all storages to use
+ * bpf_mem_alloc when CONFIG_PREEMPT_RT is enabled.
+ */
+ smap->bpf_ma = IS_ENABLED(CONFIG_PREEMPT_RT) ? true : bpf_ma;
+ if (smap->bpf_ma) {
err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false);
if (err)
goto free_smap;
@@ -898,15 +894,11 @@ void bpf_local_storage_map_free(struct bpf_map *map,
while ((selem = hlist_entry_safe(
rcu_dereference_raw(hlist_first_rcu(&b->list)),
struct bpf_local_storage_elem, map_node))) {
- if (busy_counter) {
- migrate_disable();
+ if (busy_counter)
this_cpu_inc(*busy_counter);
- }
bpf_selem_unlink(selem, true);
- if (busy_counter) {
+ if (busy_counter)
this_cpu_dec(*busy_counter);
- migrate_enable();
- }
cond_resched_rcu();
}
rcu_read_unlock();
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 606efe32485a9..040fb1cd840b6 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -310,6 +310,20 @@ void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_ops_desc)
kfree(arg_info);
}
+static bool is_module_member(const struct btf *btf, u32 id)
+{
+ const struct btf_type *t;
+
+ t = btf_type_resolve_ptr(btf, id, NULL);
+ if (!t)
+ return false;
+
+ if (!__btf_type_is_struct(t) && !btf_type_is_fwd(t))
+ return false;
+
+ return !strcmp(btf_name_by_offset(btf, t->name_off), "module");
+}
+
int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
struct btf *btf,
struct bpf_verifier_log *log)
@@ -389,6 +403,13 @@ int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc,
goto errout;
}
+ if (!st_ops_ids[IDX_MODULE_ID] && is_module_member(btf, member->type)) {
+ pr_warn("'struct module' btf id not found. Is CONFIG_MODULES enabled? bpf_struct_ops '%s' needs module support.\n",
+ st_ops->name);
+ err = -EOPNOTSUPP;
+ goto errout;
+ }
+
func_proto = btf_type_resolve_func_ptr(btf,
member->type,
NULL);
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index bf7fa15fdcc6c..1109475953c01 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -24,22 +24,20 @@ static DEFINE_PER_CPU(int, bpf_task_storage_busy);
static void bpf_task_storage_lock(void)
{
- migrate_disable();
+ cant_migrate();
this_cpu_inc(bpf_task_storage_busy);
}
static void bpf_task_storage_unlock(void)
{
this_cpu_dec(bpf_task_storage_busy);
- migrate_enable();
}
static bool bpf_task_storage_trylock(void)
{
- migrate_disable();
+ cant_migrate();
if (unlikely(this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
this_cpu_dec(bpf_task_storage_busy);
- migrate_enable();
return false;
}
return true;
@@ -72,18 +70,19 @@ void bpf_task_storage_free(struct task_struct *task)
{
struct bpf_local_storage *local_storage;
+ migrate_disable();
rcu_read_lock();
local_storage = rcu_dereference(task->bpf_storage);
- if (!local_storage) {
- rcu_read_unlock();
- return;
- }
+ if (!local_storage)
+ goto out;
bpf_task_storage_lock();
bpf_local_storage_destroy(local_storage);
bpf_task_storage_unlock();
+out:
rcu_read_unlock();
+ migrate_enable();
}
static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key)
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index e5a5f023cedd5..c3223e0db2f51 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -498,11 +498,6 @@ bool btf_type_is_void(const struct btf_type *t)
return t == &btf_void;
}
-static bool btf_type_is_fwd(const struct btf_type *t)
-{
- return BTF_INFO_KIND(t->info) == BTF_KIND_FWD;
-}
-
static bool btf_type_is_datasec(const struct btf_type *t)
{
return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
@@ -6512,6 +6507,8 @@ static const struct bpf_raw_tp_null_args raw_tp_null_args[] = {
/* rxrpc */
{ "rxrpc_recvdata", 0x1 },
{ "rxrpc_resend", 0x10 },
+ /* skb */
+ {"kfree_skb", 0x1000},
/* sunrpc */
{ "xs_stream_read_data", 0x1 },
/* ... from xprt_cong_event event class */
@@ -7887,14 +7884,9 @@ struct btf *btf_get_by_fd(int fd)
struct btf *btf;
CLASS(fd, f)(fd);
- if (fd_empty(f))
- return ERR_PTR(-EBADF);
-
- if (fd_file(f)->f_op != &btf_fops)
- return ERR_PTR(-EINVAL);
-
- btf = fd_file(f)->private_data;
- refcount_inc(&btf->refcnt);
+ btf = __btf_get_by_fd(f);
+ if (!IS_ERR(btf))
+ refcount_inc(&btf->refcnt);
return btf;
}
@@ -8011,17 +8003,6 @@ struct btf_module {
static LIST_HEAD(btf_modules);
static DEFINE_MUTEX(btf_module_mutex);
-static ssize_t
-btf_module_read(struct file *file, struct kobject *kobj,
- struct bin_attribute *bin_attr,
- char *buf, loff_t off, size_t len)
-{
- const struct btf *btf = bin_attr->private;
-
- memcpy(buf, btf->data + off, len);
- return len;
-}
-
static void purge_cand_cache(struct btf *btf);
static int btf_module_notify(struct notifier_block *nb, unsigned long op,
@@ -8082,8 +8063,8 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
attr->attr.name = btf->name;
attr->attr.mode = 0444;
attr->size = btf->data_size;
- attr->private = btf;
- attr->read = btf_module_read;
+ attr->private = btf->data;
+ attr->read_new = sysfs_bin_attr_simple_read;
err = sysfs_create_bin_file(btf_kobj, attr);
if (err) {
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index a2f46785ac3b3..774accbd4a223 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -190,7 +190,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
int err;
rxq.dev = xdpf->dev_rx;
- rxq.mem = xdpf->mem;
+ rxq.mem.type = xdpf->mem_type;
/* TODO: report queue_index to xdp_rxq_info */
xdp_convert_frame_to_buff(xdpf, &xdp);
diff --git a/kernel/bpf/cpumask.c b/kernel/bpf/cpumask.c
index 33c473d676a52..cfa1c18e3a483 100644
--- a/kernel/bpf/cpumask.c
+++ b/kernel/bpf/cpumask.c
@@ -91,9 +91,7 @@ __bpf_kfunc void bpf_cpumask_release(struct bpf_cpumask *cpumask)
if (!refcount_dec_and_test(&cpumask->usage))
return;
- migrate_disable();
bpf_mem_cache_free_rcu(&bpf_cpumask_ma, cpumask);
- migrate_enable();
}
__bpf_kfunc void bpf_cpumask_release_dtor(void *cpumask)
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 3aa002a47a966..482d284a15538 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -678,7 +678,7 @@ int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
}
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
- struct bpf_prog *xdp_prog)
+ const struct bpf_prog *xdp_prog)
{
int err;
@@ -701,7 +701,7 @@ int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst,
struct sk_buff *skb,
- struct bpf_prog *xdp_prog)
+ const struct bpf_prog *xdp_prog)
{
struct sk_buff *nskb;
int err;
@@ -720,8 +720,8 @@ static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst,
}
int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
- struct bpf_prog *xdp_prog, struct bpf_map *map,
- bool exclude_ingress)
+ const struct bpf_prog *xdp_prog,
+ struct bpf_map *map, bool exclude_ingress)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *dst, *last_dst = NULL;
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3ec941a0ea41c..4a9eeb7aef855 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -824,13 +824,14 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
if (l == tgt_l) {
hlist_nulls_del_rcu(&l->hash_node);
- check_and_free_fields(htab, l);
bpf_map_dec_elem_count(&htab->map);
break;
}
htab_unlock_bucket(htab, b, tgt_l->hash, flags);
+ if (l == tgt_l)
+ check_and_free_fields(htab, l);
return l == tgt_l;
}
@@ -897,11 +898,9 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
check_and_free_fields(htab, l);
- migrate_disable();
if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
bpf_mem_cache_free(&htab->pcpu_ma, l->ptr_to_pptr);
bpf_mem_cache_free(&htab->ma, l);
- migrate_enable();
}
static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
@@ -1502,10 +1501,9 @@ static void delete_all_elements(struct bpf_htab *htab)
{
int i;
- /* It's called from a worker thread, so disable migration here,
- * since bpf_mem_cache_free() relies on that.
+ /* It's called from a worker thread and migration has been disabled,
+ * therefore, it is OK to invoke bpf_mem_cache_free() directly.
*/
- migrate_disable();
for (i = 0; i < htab->n_buckets; i++) {
struct hlist_nulls_head *head = select_bucket(htab, i);
struct hlist_nulls_node *n;
@@ -1517,7 +1515,6 @@ static void delete_all_elements(struct bpf_htab *htab)
}
cond_resched();
}
- migrate_enable();
}
static void htab_free_malloced_timers_and_wq(struct bpf_htab *htab)
@@ -1638,41 +1635,44 @@ static int __htab_map_lookup_and_delete_elem(struct bpf_map *map, void *key,
l = lookup_elem_raw(head, hash, key, key_size);
if (!l) {
ret = -ENOENT;
- } else {
- if (is_percpu) {
- u32 roundup_value_size = round_up(map->value_size, 8);
- void __percpu *pptr;
- int off = 0, cpu;
+ goto out_unlock;
+ }
- pptr = htab_elem_get_ptr(l, key_size);
- for_each_possible_cpu(cpu) {
- copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu));
- check_and_init_map_value(&htab->map, value + off);
- off += roundup_value_size;
- }
- } else {
- u32 roundup_key_size = round_up(map->key_size, 8);
+ if (is_percpu) {
+ u32 roundup_value_size = round_up(map->value_size, 8);
+ void __percpu *pptr;
+ int off = 0, cpu;
- if (flags & BPF_F_LOCK)
- copy_map_value_locked(map, value, l->key +
- roundup_key_size,
- true);
- else
- copy_map_value(map, value, l->key +
- roundup_key_size);
- /* Zeroing special fields in the temp buffer */
- check_and_init_map_value(map, value);
+ pptr = htab_elem_get_ptr(l, key_size);
+ for_each_possible_cpu(cpu) {
+ copy_map_value_long(&htab->map, value + off, per_cpu_ptr(pptr, cpu));
+ check_and_init_map_value(&htab->map, value + off);
+ off += roundup_value_size;
}
+ } else {
+ u32 roundup_key_size = round_up(map->key_size, 8);
- hlist_nulls_del_rcu(&l->hash_node);
- if (!is_lru_map)
- free_htab_elem(htab, l);
+ if (flags & BPF_F_LOCK)
+ copy_map_value_locked(map, value, l->key +
+ roundup_key_size,
+ true);
+ else
+ copy_map_value(map, value, l->key +
+ roundup_key_size);
+ /* Zeroing special fields in the temp buffer */
+ check_and_init_map_value(map, value);
}
+ hlist_nulls_del_rcu(&l->hash_node);
+out_unlock:
htab_unlock_bucket(htab, b, hash, bflags);
- if (is_lru_map && l)
- htab_lru_push_free(htab, l);
+ if (l) {
+ if (is_lru_map)
+ htab_lru_push_free(htab, l);
+ else
+ free_htab_elem(htab, l);
+ }
return ret;
}
@@ -2208,17 +2208,18 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_
bool is_percpu;
u64 ret = 0;
+ cant_migrate();
+
if (flags != 0)
return -EINVAL;
is_percpu = htab_is_percpu(htab);
roundup_key_size = round_up(map->key_size, 8);
- /* disable migration so percpu value prepared here will be the
- * same as the one seen by the bpf program with bpf_map_lookup_elem().
+ /* migration has been disabled, so percpu value prepared here will be
+ * the same as the one seen by the bpf program with
+ * bpf_map_lookup_elem().
*/
- if (is_percpu)
- migrate_disable();
for (i = 0; i < htab->n_buckets; i++) {
b = &htab->buckets[i];
rcu_read_lock();
@@ -2244,8 +2245,6 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_
rcu_read_unlock();
}
out:
- if (is_percpu)
- migrate_enable();
return num_elems;
}
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 751c150f9e1cd..f27ce162427ab 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1593,10 +1593,24 @@ void bpf_timer_cancel_and_free(void *val)
* To avoid these issues, punt to workqueue context when we are in a
* timer callback.
*/
- if (this_cpu_read(hrtimer_running))
+ if (this_cpu_read(hrtimer_running)) {
queue_work(system_unbound_wq, &t->cb.delete_work);
- else
+ return;
+ }
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ /* If the timer is running on other CPU, also use a kworker to
+ * wait for the completion of the timer instead of trying to
+ * acquire a sleepable lock in hrtimer_cancel() to wait for its
+ * completion.
+ */
+ if (hrtimer_try_to_cancel(&t->timer) >= 0)
+ kfree_rcu(t, cb.rcu);
+ else
+ queue_work(system_unbound_wq, &t->cb.delete_work);
+ } else {
bpf_timer_delete_work(&t->cb.delete_work);
+ }
}
/* This function is called by map_delete/update_elem for individual element and
@@ -2066,9 +2080,7 @@ unlock:
/* The contained type can also have resources, including a
* bpf_list_head which needs to be freed.
*/
- migrate_disable();
__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
- migrate_enable();
}
}
@@ -2105,9 +2117,7 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
obj -= field->graph_root.node_offset;
- migrate_disable();
__bpf_obj_drop_impl(obj, field->graph_root.value_rec, false);
- migrate_enable();
}
}
@@ -3057,6 +3067,21 @@ __bpf_kfunc int bpf_copy_from_user_str(void *dst, u32 dst__sz, const void __user
return ret + 1;
}
+/* Keep unsigned long in prototype so that kfunc is usable when emitted to
+ * vmlinux.h in BPF programs directly, but note that while in BPF prog, the
+ * unsigned long always points to 8-byte region on stack, the kernel may only
+ * read and write the 4-bytes on 32-bit.
+ */
+__bpf_kfunc void bpf_local_irq_save(unsigned long *flags__irq_flag)
+{
+ local_irq_save(*flags__irq_flag);
+}
+
+__bpf_kfunc void bpf_local_irq_restore(unsigned long *flags__irq_flag)
+{
+ local_irq_restore(*flags__irq_flag);
+}
+
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(generic_btf_ids)
@@ -3089,7 +3114,9 @@ BTF_ID_FLAGS(func, bpf_task_get_cgroup1, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_task_from_vpid, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_throw)
+#ifdef CONFIG_BPF_EVENTS
BTF_ID_FLAGS(func, bpf_send_signal_task, KF_TRUSTED_ARGS)
+#endif
BTF_KFUNCS_END(generic_btf_ids)
static const struct btf_kfunc_id_set generic_kfunc_set = {
@@ -3135,7 +3162,9 @@ BTF_ID_FLAGS(func, bpf_dynptr_is_null)
BTF_ID_FLAGS(func, bpf_dynptr_is_rdonly)
BTF_ID_FLAGS(func, bpf_dynptr_size)
BTF_ID_FLAGS(func, bpf_dynptr_clone)
+#ifdef CONFIG_NET
BTF_ID_FLAGS(func, bpf_modify_return_test_tp)
+#endif
BTF_ID_FLAGS(func, bpf_wq_init)
BTF_ID_FLAGS(func, bpf_wq_set_callback_impl)
BTF_ID_FLAGS(func, bpf_wq_start)
@@ -3149,6 +3178,8 @@ BTF_ID_FLAGS(func, bpf_get_kmem_cache)
BTF_ID_FLAGS(func, bpf_iter_kmem_cache_new, KF_ITER_NEW | KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_iter_kmem_cache_next, KF_ITER_NEXT | KF_RET_NULL | KF_SLEEPABLE)
BTF_ID_FLAGS(func, bpf_iter_kmem_cache_destroy, KF_ITER_DESTROY | KF_SLEEPABLE)
+BTF_ID_FLAGS(func, bpf_local_irq_save)
+BTF_ID_FLAGS(func, bpf_local_irq_restore)
BTF_KFUNCS_END(common_btf_ids)
static const struct btf_kfunc_id_set common_kfunc_set = {
diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c
index 4a858fdb6476f..38050f4ee4003 100644
--- a/kernel/bpf/log.c
+++ b/kernel/bpf/log.c
@@ -537,6 +537,7 @@ static char slot_type_char[] = {
[STACK_ZERO] = '0',
[STACK_DYNPTR] = 'd',
[STACK_ITER] = 'i',
+ [STACK_IRQ_FLAG] = 'f'
};
static void print_liveness(struct bpf_verifier_env *env,
@@ -753,9 +754,10 @@ static void print_reg_state(struct bpf_verifier_env *env,
verbose(env, ")");
}
-void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_state *state,
- bool print_all)
+void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate,
+ u32 frameno, bool print_all)
{
+ const struct bpf_func_state *state = vstate->frame[frameno];
const struct bpf_reg_state *reg;
int i;
@@ -843,11 +845,11 @@ void print_verifier_state(struct bpf_verifier_env *env, const struct bpf_func_st
break;
}
}
- if (state->acquired_refs && state->refs[0].id) {
- verbose(env, " refs=%d", state->refs[0].id);
- for (i = 1; i < state->acquired_refs; i++)
- if (state->refs[i].id)
- verbose(env, ",%d", state->refs[i].id);
+ if (vstate->acquired_refs && vstate->refs[0].id) {
+ verbose(env, " refs=%d", vstate->refs[0].id);
+ for (i = 1; i < vstate->acquired_refs; i++)
+ if (vstate->refs[i].id)
+ verbose(env, ",%d", vstate->refs[i].id);
}
if (state->in_callback_fn)
verbose(env, " cb");
@@ -864,7 +866,8 @@ static inline u32 vlog_alignment(u32 pos)
BPF_LOG_MIN_ALIGNMENT) - pos - 1;
}
-void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state *state)
+void print_insn_state(struct bpf_verifier_env *env, const struct bpf_verifier_state *vstate,
+ u32 frameno)
{
if (env->prev_log_pos && env->prev_log_pos == env->log.end_pos) {
/* remove new line character */
@@ -873,5 +876,5 @@ void print_insn_state(struct bpf_verifier_env *env, const struct bpf_func_state
} else {
verbose(env, "%d:", env->insn_idx);
}
- print_verifier_state(env, state, false);
+ print_verifier_state(env, vstate, frameno, false);
}
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index f8bc1e0961823..e8a772e643242 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -289,16 +289,11 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
}
static struct lpm_trie_node *lpm_trie_node_alloc(struct lpm_trie *trie,
- const void *value,
- bool disable_migration)
+ const void *value)
{
struct lpm_trie_node *node;
- if (disable_migration)
- migrate_disable();
node = bpf_mem_cache_alloc(&trie->ma);
- if (disable_migration)
- migrate_enable();
if (!node)
return NULL;
@@ -342,10 +337,8 @@ static long trie_update_elem(struct bpf_map *map,
if (key->prefixlen > trie->max_prefixlen)
return -EINVAL;
- /* Allocate and fill a new node. Need to disable migration before
- * invoking bpf_mem_cache_alloc().
- */
- new_node = lpm_trie_node_alloc(trie, value, true);
+ /* Allocate and fill a new node */
+ new_node = lpm_trie_node_alloc(trie, value);
if (!new_node)
return -ENOMEM;
@@ -425,8 +418,7 @@ static long trie_update_elem(struct bpf_map *map,
goto out;
}
- /* migration is disabled within the locked scope */
- im_node = lpm_trie_node_alloc(trie, NULL, false);
+ im_node = lpm_trie_node_alloc(trie, NULL);
if (!im_node) {
trie->n_entries--;
ret = -ENOMEM;
@@ -452,11 +444,9 @@ static long trie_update_elem(struct bpf_map *map,
out:
raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
- migrate_disable();
if (ret)
bpf_mem_cache_free(&trie->ma, new_node);
bpf_mem_cache_free_rcu(&trie->ma, free_node);
- migrate_enable();
return ret;
}
@@ -555,10 +545,8 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
out:
raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
- migrate_disable();
bpf_mem_cache_free_rcu(&trie->ma, free_parent);
bpf_mem_cache_free_rcu(&trie->ma, free_node);
- migrate_enable();
return ret;
}
diff --git a/kernel/bpf/range_tree.c b/kernel/bpf/range_tree.c
index 5bdf9aadca3a6..37b80a23ae1ae 100644
--- a/kernel/bpf/range_tree.c
+++ b/kernel/bpf/range_tree.c
@@ -259,9 +259,7 @@ void range_tree_destroy(struct range_tree *rt)
while ((rn = range_it_iter_first(rt, 0, -1U))) {
range_it_remove(rn, rt);
- migrate_disable();
bpf_mem_free(&bpf_global_ma, rn);
- migrate_enable();
}
}
diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
index e1cfe890e0be6..1499d8caa9a35 100644
--- a/kernel/bpf/ringbuf.c
+++ b/kernel/bpf/ringbuf.c
@@ -268,8 +268,6 @@ static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma
/* allow writable mapping for the consumer_pos only */
if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE)
return -EPERM;
- } else {
- vm_flags_clear(vma, VM_MAYWRITE);
}
/* remap_vmalloc_range() checks size and offset constraints */
return remap_vmalloc_range(vma, rb_map->rb,
@@ -289,8 +287,6 @@ static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma
* position, and the ring buffer data itself.
*/
return -EPERM;
- } else {
- vm_flags_clear(vma, VM_MAYWRITE);
}
/* remap_vmalloc_range() checks size and offset constraints */
return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 5684e8ce132d5..e1e42e918ba7f 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -796,11 +796,9 @@ void bpf_obj_free_fields(const struct btf_record *rec, void *obj)
if (!btf_is_kernel(field->kptr.btf)) {
pointee_struct_meta = btf_find_struct_meta(field->kptr.btf,
field->kptr.btf_id);
- migrate_disable();
__bpf_obj_drop_impl(xchgd_field, pointee_struct_meta ?
pointee_struct_meta->record : NULL,
fields[i].type == BPF_KPTR_PERCPU);
- migrate_enable();
} else {
field->kptr.dtor(xchgd_field);
}
@@ -835,8 +833,14 @@ static void bpf_map_free(struct bpf_map *map)
struct btf_record *rec = map->record;
struct btf *btf = map->btf;
- /* implementation dependent freeing */
+ /* implementation dependent freeing. Disabling migration to simplify
+ * the free of values or special fields allocated from bpf memory
+ * allocator.
+ */
+ migrate_disable();
map->ops->map_free(map);
+ migrate_enable();
+
/* Delay freeing of btf_record for maps, as map_free
* callback usually needs access to them. It is better to do it here
* than require each callback to do the free itself manually.
@@ -1031,7 +1035,7 @@ static const struct vm_operations_struct bpf_map_default_vmops = {
static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
{
struct bpf_map *map = filp->private_data;
- int err;
+ int err = 0;
if (!map->ops->map_mmap || !IS_ERR_OR_NULL(map->record))
return -ENOTSUPP;
@@ -1055,24 +1059,33 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
err = -EACCES;
goto out;
}
+ bpf_map_write_active_inc(map);
}
+out:
+ mutex_unlock(&map->freeze_mutex);
+ if (err)
+ return err;
/* set default open/close callbacks */
vma->vm_ops = &bpf_map_default_vmops;
vma->vm_private_data = map;
vm_flags_clear(vma, VM_MAYEXEC);
+ /* If mapping is read-only, then disallow potentially re-mapping with
+ * PROT_WRITE by dropping VM_MAYWRITE flag. This VM_MAYWRITE clearing
+ * means that as far as BPF map's memory-mapped VMAs are concerned,
+ * VM_WRITE and VM_MAYWRITE are equivalent: if one of them is set,
+ * both should be set, so we can forget about VM_MAYWRITE and always
+ * check just VM_WRITE
+ */
if (!(vma->vm_flags & VM_WRITE))
- /* disallow re-mapping with PROT_WRITE */
vm_flags_clear(vma, VM_MAYWRITE);
err = map->ops->map_mmap(map, vma);
- if (err)
- goto out;
+ if (err) {
+ if (vma->vm_flags & VM_WRITE)
+ bpf_map_write_active_dec(map);
+ }
- if (vma->vm_flags & VM_MAYWRITE)
- bpf_map_write_active_inc(map);
-out:
- mutex_unlock(&map->freeze_mutex);
return err;
}
@@ -1964,8 +1977,6 @@ int generic_map_update_batch(struct bpf_map *map, struct file *map_file,
return err;
}
-#define MAP_LOOKUP_RETRIES 3
-
int generic_map_lookup_batch(struct bpf_map *map,
const union bpf_attr *attr,
union bpf_attr __user *uattr)
@@ -1975,8 +1986,8 @@ int generic_map_lookup_batch(struct bpf_map *map,
void __user *values = u64_to_user_ptr(attr->batch.values);
void __user *keys = u64_to_user_ptr(attr->batch.keys);
void *buf, *buf_prevkey, *prev_key, *key, *value;
- int err, retry = MAP_LOOKUP_RETRIES;
u32 value_size, cp, max_count;
+ int err;
if (attr->batch.elem_flags & ~BPF_F_LOCK)
return -EINVAL;
@@ -2022,14 +2033,8 @@ int generic_map_lookup_batch(struct bpf_map *map,
err = bpf_map_copy_value(map, key, value,
attr->batch.elem_flags);
- if (err == -ENOENT) {
- if (retry) {
- retry--;
- continue;
- }
- err = -EINTR;
- break;
- }
+ if (err == -ENOENT)
+ goto next_key;
if (err)
goto free_buf;
@@ -2044,12 +2049,12 @@ int generic_map_lookup_batch(struct bpf_map *map,
goto free_buf;
}
+ cp++;
+next_key:
if (!prev_key)
prev_key = buf_prevkey;
swap(prev_key, key);
- retry = MAP_LOOKUP_RETRIES;
- cp++;
cond_resched();
}
@@ -2730,7 +2735,7 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
}
/* last field in 'union bpf_attr' used by this command */
-#define BPF_PROG_LOAD_LAST_FIELD prog_token_fd
+#define BPF_PROG_LOAD_LAST_FIELD fd_array_cnt
static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr, u32 uattr_size)
{
@@ -6124,7 +6129,7 @@ static int bpf_unpriv_handler(const struct ctl_table *table, int write,
return ret;
}
-static struct ctl_table bpf_syscall_table[] = {
+static const struct ctl_table bpf_syscall_table[] = {
{
.procname = "unprivileged_bpf_disabled",
.data = &sysctl_unprivileged_bpf_disabled,
diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c
index fedb54c94cdb8..81d6cf90584a7 100644
--- a/kernel/bpf/sysfs_btf.c
+++ b/kernel/bpf/sysfs_btf.c
@@ -12,24 +12,16 @@
extern char __start_BTF[];
extern char __stop_BTF[];
-static ssize_t
-btf_vmlinux_read(struct file *file, struct kobject *kobj,
- struct bin_attribute *bin_attr,
- char *buf, loff_t off, size_t len)
-{
- memcpy(buf, __start_BTF + off, len);
- return len;
-}
-
static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = {
.attr = { .name = "vmlinux", .mode = 0444, },
- .read = btf_vmlinux_read,
+ .read_new = sysfs_bin_attr_simple_read,
};
struct kobject *btf_kobj;
static int __init btf_vmlinux_init(void)
{
+ bin_attr_btf_vmlinux.private = __start_BTF;
bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF;
if (bin_attr_btf_vmlinux.size == 0)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f7f892a52a374..60611df77957a 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -196,7 +196,8 @@ struct bpf_verifier_stack_elem {
#define BPF_PRIV_STACK_MIN_SIZE 64
-static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
+static int acquire_reference(struct bpf_verifier_env *env, int insn_idx);
+static int release_reference_nomark(struct bpf_verifier_state *state, int ref_obj_id);
static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env);
@@ -286,6 +287,7 @@ struct bpf_call_arg_meta {
u32 ret_btf_id;
u32 subprogno;
struct btf_field *kptr_field;
+ s64 const_map_key;
};
struct bpf_kfunc_call_arg_meta {
@@ -641,6 +643,11 @@ static int iter_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
return stack_slot_obj_get_spi(env, reg, "iter", nr_slots);
}
+static int irq_flag_get_spi(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ return stack_slot_obj_get_spi(env, reg, "irq_flag", 1);
+}
+
static enum bpf_dynptr_type arg_to_dynptr_type(enum bpf_arg_type arg_type)
{
switch (arg_type & DYNPTR_TYPE_FLAG_MASK) {
@@ -752,7 +759,7 @@ static int mark_stack_slots_dynptr(struct bpf_verifier_env *env, struct bpf_reg_
if (clone_ref_obj_id)
id = clone_ref_obj_id;
else
- id = acquire_reference_state(env, insn_idx);
+ id = acquire_reference(env, insn_idx);
if (id < 0)
return id;
@@ -1014,7 +1021,7 @@ static int mark_stack_slots_iter(struct bpf_verifier_env *env,
if (spi < 0)
return spi;
- id = acquire_reference_state(env, insn_idx);
+ id = acquire_reference(env, insn_idx);
if (id < 0)
return id;
@@ -1136,10 +1143,136 @@ static int is_iter_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_s
return 0;
}
+static int acquire_irq_state(struct bpf_verifier_env *env, int insn_idx);
+static int release_irq_state(struct bpf_verifier_state *state, int id);
+
+static int mark_stack_slot_irq_flag(struct bpf_verifier_env *env,
+ struct bpf_kfunc_call_arg_meta *meta,
+ struct bpf_reg_state *reg, int insn_idx)
+{
+ struct bpf_func_state *state = func(env, reg);
+ struct bpf_stack_state *slot;
+ struct bpf_reg_state *st;
+ int spi, i, id;
+
+ spi = irq_flag_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+
+ id = acquire_irq_state(env, insn_idx);
+ if (id < 0)
+ return id;
+
+ slot = &state->stack[spi];
+ st = &slot->spilled_ptr;
+
+ __mark_reg_known_zero(st);
+ st->type = PTR_TO_STACK; /* we don't have dedicated reg type */
+ st->live |= REG_LIVE_WRITTEN;
+ st->ref_obj_id = id;
+
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ slot->slot_type[i] = STACK_IRQ_FLAG;
+
+ mark_stack_slot_scratched(env, spi);
+ return 0;
+}
+
+static int unmark_stack_slot_irq_flag(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ struct bpf_stack_state *slot;
+ struct bpf_reg_state *st;
+ int spi, i, err;
+
+ spi = irq_flag_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+
+ slot = &state->stack[spi];
+ st = &slot->spilled_ptr;
+
+ err = release_irq_state(env->cur_state, st->ref_obj_id);
+ WARN_ON_ONCE(err && err != -EACCES);
+ if (err) {
+ int insn_idx = 0;
+
+ for (int i = 0; i < env->cur_state->acquired_refs; i++) {
+ if (env->cur_state->refs[i].id == env->cur_state->active_irq_id) {
+ insn_idx = env->cur_state->refs[i].insn_idx;
+ break;
+ }
+ }
+
+ verbose(env, "cannot restore irq state out of order, expected id=%d acquired at insn_idx=%d\n",
+ env->cur_state->active_irq_id, insn_idx);
+ return err;
+ }
+
+ __mark_reg_not_init(env, st);
+
+ /* see unmark_stack_slots_dynptr() for why we need to set REG_LIVE_WRITTEN */
+ st->live |= REG_LIVE_WRITTEN;
+
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ slot->slot_type[i] = STACK_INVALID;
+
+ mark_stack_slot_scratched(env, spi);
+ return 0;
+}
+
+static bool is_irq_flag_reg_valid_uninit(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ struct bpf_stack_state *slot;
+ int spi, i;
+
+ /* For -ERANGE (i.e. spi not falling into allocated stack slots), we
+ * will do check_mem_access to check and update stack bounds later, so
+ * return true for that case.
+ */
+ spi = irq_flag_get_spi(env, reg);
+ if (spi == -ERANGE)
+ return true;
+ if (spi < 0)
+ return false;
+
+ slot = &state->stack[spi];
+
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ if (slot->slot_type[i] == STACK_IRQ_FLAG)
+ return false;
+ return true;
+}
+
+static int is_irq_flag_reg_valid_init(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ struct bpf_func_state *state = func(env, reg);
+ struct bpf_stack_state *slot;
+ struct bpf_reg_state *st;
+ int spi, i;
+
+ spi = irq_flag_get_spi(env, reg);
+ if (spi < 0)
+ return -EINVAL;
+
+ slot = &state->stack[spi];
+ st = &slot->spilled_ptr;
+
+ if (!st->ref_obj_id)
+ return -EINVAL;
+
+ for (i = 0; i < BPF_REG_SIZE; i++)
+ if (slot->slot_type[i] != STACK_IRQ_FLAG)
+ return -EINVAL;
+ return 0;
+}
+
/* Check if given stack slot is "special":
* - spilled register state (STACK_SPILL);
* - dynptr state (STACK_DYNPTR);
* - iter state (STACK_ITER).
+ * - irq flag state (STACK_IRQ_FLAG)
*/
static bool is_stack_slot_special(const struct bpf_stack_state *stack)
{
@@ -1149,6 +1282,7 @@ static bool is_stack_slot_special(const struct bpf_stack_state *stack)
case STACK_SPILL:
case STACK_DYNPTR:
case STACK_ITER:
+ case STACK_IRQ_FLAG:
return true;
case STACK_INVALID:
case STACK_MISC:
@@ -1263,15 +1397,18 @@ out:
return arr ? arr : ZERO_SIZE_PTR;
}
-static int copy_reference_state(struct bpf_func_state *dst, const struct bpf_func_state *src)
+static int copy_reference_state(struct bpf_verifier_state *dst, const struct bpf_verifier_state *src)
{
dst->refs = copy_array(dst->refs, src->refs, src->acquired_refs,
sizeof(struct bpf_reference_state), GFP_KERNEL);
if (!dst->refs)
return -ENOMEM;
- dst->active_locks = src->active_locks;
dst->acquired_refs = src->acquired_refs;
+ dst->active_locks = src->active_locks;
+ dst->active_preempt_locks = src->active_preempt_locks;
+ dst->active_rcu_lock = src->active_rcu_lock;
+ dst->active_irq_id = src->active_irq_id;
return 0;
}
@@ -1288,7 +1425,7 @@ static int copy_stack_state(struct bpf_func_state *dst, const struct bpf_func_st
return 0;
}
-static int resize_reference_state(struct bpf_func_state *state, size_t n)
+static int resize_reference_state(struct bpf_verifier_state *state, size_t n)
{
state->refs = realloc_array(state->refs, state->acquired_refs, n,
sizeof(struct bpf_reference_state));
@@ -1331,94 +1468,130 @@ static int grow_stack_state(struct bpf_verifier_env *env, struct bpf_func_state
* On success, returns a valid pointer id to associate with the register
* On failure, returns a negative errno.
*/
-static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
+static struct bpf_reference_state *acquire_reference_state(struct bpf_verifier_env *env, int insn_idx)
{
- struct bpf_func_state *state = cur_func(env);
+ struct bpf_verifier_state *state = env->cur_state;
int new_ofs = state->acquired_refs;
- int id, err;
+ int err;
err = resize_reference_state(state, state->acquired_refs + 1);
if (err)
- return err;
- id = ++env->id_gen;
- state->refs[new_ofs].type = REF_TYPE_PTR;
- state->refs[new_ofs].id = id;
+ return NULL;
state->refs[new_ofs].insn_idx = insn_idx;
- return id;
+ return &state->refs[new_ofs];
+}
+
+static int acquire_reference(struct bpf_verifier_env *env, int insn_idx)
+{
+ struct bpf_reference_state *s;
+
+ s = acquire_reference_state(env, insn_idx);
+ if (!s)
+ return -ENOMEM;
+ s->type = REF_TYPE_PTR;
+ s->id = ++env->id_gen;
+ return s->id;
}
static int acquire_lock_state(struct bpf_verifier_env *env, int insn_idx, enum ref_state_type type,
int id, void *ptr)
{
- struct bpf_func_state *state = cur_func(env);
- int new_ofs = state->acquired_refs;
- int err;
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_reference_state *s;
- err = resize_reference_state(state, state->acquired_refs + 1);
- if (err)
- return err;
- state->refs[new_ofs].type = type;
- state->refs[new_ofs].id = id;
- state->refs[new_ofs].insn_idx = insn_idx;
- state->refs[new_ofs].ptr = ptr;
+ s = acquire_reference_state(env, insn_idx);
+ if (!s)
+ return -ENOMEM;
+ s->type = type;
+ s->id = id;
+ s->ptr = ptr;
state->active_locks++;
return 0;
}
-/* release function corresponding to acquire_reference_state(). Idempotent. */
-static int release_reference_state(struct bpf_func_state *state, int ptr_id)
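+/* Like acquire_reference(), but also record the new save as the innermost active IRQ state. */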
+static int acquire_irq_state(struct bpf_verifier_env *env, int insn_idx)
+{
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_reference_state *s;
+
+ s = acquire_reference_state(env, insn_idx);
+ if (!s)
+ return -ENOMEM;
+ s->type = REF_TYPE_IRQ;
+ s->id = ++env->id_gen;
+
+ state->active_irq_id = s->id;
+ return s->id;
+}
+
+static void release_reference_state(struct bpf_verifier_state *state, int idx)
{
- int i, last_idx;
+ int last_idx;
+ size_t rem;
+ /* IRQ state requires that the relative ordering of elements remain the
+ * same, since it relies on the refs array to behave as a stack, so that
+ * it can detect out-of-order IRQ restore. Hence use memmove to shift
+ * the array instead of swapping the final element into the deleted idx.
+ */
last_idx = state->acquired_refs - 1;
+ rem = state->acquired_refs - idx - 1;
+ if (last_idx && idx != last_idx)
+ memmove(&state->refs[idx], &state->refs[idx + 1], sizeof(*state->refs) * rem);
+ memset(&state->refs[last_idx], 0, sizeof(*state->refs));
+ state->acquired_refs--;
+ return;
+}
+
+static int release_lock_state(struct bpf_verifier_state *state, int type, int id, void *ptr)
+{
+ int i;
+
for (i = 0; i < state->acquired_refs; i++) {
- if (state->refs[i].type != REF_TYPE_PTR)
+ if (state->refs[i].type != type)
continue;
- if (state->refs[i].id == ptr_id) {
- if (last_idx && i != last_idx)
- memcpy(&state->refs[i], &state->refs[last_idx],
- sizeof(*state->refs));
- memset(&state->refs[last_idx], 0, sizeof(*state->refs));
- state->acquired_refs--;
+ if (state->refs[i].id == id && state->refs[i].ptr == ptr) {
+ release_reference_state(state, i);
+ state->active_locks--;
return 0;
}
}
return -EINVAL;
}
-static int release_lock_state(struct bpf_func_state *state, int type, int id, void *ptr)
+static int release_irq_state(struct bpf_verifier_state *state, int id)
{
- int i, last_idx;
+ u32 prev_id = 0;
+ int i;
+
+ if (id != state->active_irq_id)
+ return -EACCES;
- last_idx = state->acquired_refs - 1;
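+ /* Walk refs in acquisition order; the IRQ ref seen just before the one
+ * being released becomes the new active_irq_id (0 if there is none).
+ */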
for (i = 0; i < state->acquired_refs; i++) {
- if (state->refs[i].type != type)
+ if (state->refs[i].type != REF_TYPE_IRQ)
continue;
- if (state->refs[i].id == id && state->refs[i].ptr == ptr) {
- if (last_idx && i != last_idx)
- memcpy(&state->refs[i], &state->refs[last_idx],
- sizeof(*state->refs));
- memset(&state->refs[last_idx], 0, sizeof(*state->refs));
- state->acquired_refs--;
- state->active_locks--;
+ if (state->refs[i].id == id) {
+ release_reference_state(state, i);
+ state->active_irq_id = prev_id;
return 0;
+ } else {
+ prev_id = state->refs[i].id;
}
}
return -EINVAL;
}
-static struct bpf_reference_state *find_lock_state(struct bpf_verifier_env *env, enum ref_state_type type,
+static struct bpf_reference_state *find_lock_state(struct bpf_verifier_state *state, enum ref_state_type type,
int id, void *ptr)
{
- struct bpf_func_state *state = cur_func(env);
int i;
for (i = 0; i < state->acquired_refs; i++) {
struct bpf_reference_state *s = &state->refs[i];
- if (s->type == REF_TYPE_PTR || s->type != type)
+ if (s->type != type)
continue;
if (s->id == id && s->ptr == ptr)
@@ -1431,7 +1604,6 @@ static void free_func_state(struct bpf_func_state *state)
{
if (!state)
return;
- kfree(state->refs);
kfree(state->stack);
kfree(state);
}
@@ -1445,6 +1617,7 @@ static void free_verifier_state(struct bpf_verifier_state *state,
free_func_state(state->frame[i]);
state->frame[i] = NULL;
}
+ kfree(state->refs);
if (free_self)
kfree(state);
}
@@ -1455,12 +1628,7 @@ static void free_verifier_state(struct bpf_verifier_state *state,
static int copy_func_state(struct bpf_func_state *dst,
const struct bpf_func_state *src)
{
- int err;
-
- memcpy(dst, src, offsetof(struct bpf_func_state, acquired_refs));
- err = copy_reference_state(dst, src);
- if (err)
- return err;
+ memcpy(dst, src, offsetof(struct bpf_func_state, stack));
return copy_stack_state(dst, src);
}
@@ -1477,9 +1645,10 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
free_func_state(dst_state->frame[i]);
dst_state->frame[i] = NULL;
}
+ err = copy_reference_state(dst_state, src);
+ if (err)
+ return err;
dst_state->speculative = src->speculative;
- dst_state->active_rcu_lock = src->active_rcu_lock;
- dst_state->active_preempt_lock = src->active_preempt_lock;
dst_state->in_sleepable = src->in_sleepable;
dst_state->curframe = src->curframe;
dst_state->branches = src->branches;
@@ -3206,10 +3375,27 @@ static int mark_reg_read(struct bpf_verifier_env *env,
return 0;
}
-static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
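+/* Mark all nr_slots stack slots backing an on-stack object (dynptr, iter,
+ * irq flag), from spi down to spi - nr_slots + 1, as read and scratched.
+ */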
+static int mark_stack_slot_obj_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+ int spi, int nr_slots)
{
struct bpf_func_state *state = func(env, reg);
- int spi, ret;
+ int err, i;
+
+ for (i = 0; i < nr_slots; i++) {
+ struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr;
+
+ err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64);
+ if (err)
+ return err;
+
+ mark_stack_slot_scratched(env, spi - i);
+ }
+ return 0;
+}
+
+static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ int spi;
/* For CONST_PTR_TO_DYNPTR, it must have already been done by
* check_reg_arg in check_helper_call and mark_btf_func_reg_size in
@@ -3224,31 +3410,23 @@ static int mark_dynptr_read(struct bpf_verifier_env *env, struct bpf_reg_state *
* bounds and spi is the first dynptr slot. Simply mark stack slot as
* read.
*/
- ret = mark_reg_read(env, &state->stack[spi].spilled_ptr,
- state->stack[spi].spilled_ptr.parent, REG_LIVE_READ64);
- if (ret)
- return ret;
- return mark_reg_read(env, &state->stack[spi - 1].spilled_ptr,
- state->stack[spi - 1].spilled_ptr.parent, REG_LIVE_READ64);
+ return mark_stack_slot_obj_read(env, reg, spi, BPF_DYNPTR_NR_SLOTS);
}
static int mark_iter_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
int spi, int nr_slots)
{
- struct bpf_func_state *state = func(env, reg);
- int err, i;
-
- for (i = 0; i < nr_slots; i++) {
- struct bpf_reg_state *st = &state->stack[spi - i].spilled_ptr;
-
- err = mark_reg_read(env, st, st->parent, REG_LIVE_READ64);
- if (err)
- return err;
+ return mark_stack_slot_obj_read(env, reg, spi, nr_slots);
+}
- mark_stack_slot_scratched(env, spi - i);
- }
+static int mark_irq_flag_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
+{
+ int spi;
- return 0;
+ spi = irq_flag_get_spi(env, reg);
+ if (spi < 0)
+ return spi;
+ return mark_stack_slot_obj_read(env, reg, spi, 1);
}
/* This function is supposed to be used by the following 32-bit optimization
@@ -4503,7 +4681,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno)
fmt_stack_mask(env->tmp_str_buf, TMP_STR_BUF_LEN,
bt_frame_stack_mask(bt, fr));
verbose(env, "stack=%s: ", env->tmp_str_buf);
- print_verifier_state(env, func, true);
+ print_verifier_state(env, st, fr, true);
}
}
@@ -5128,7 +5306,7 @@ enum bpf_access_src {
static int check_stack_range_initialized(struct bpf_verifier_env *env,
int regno, int off, int access_size,
bool zero_size_allowed,
- enum bpf_access_src type,
+ enum bpf_access_type type,
struct bpf_call_arg_meta *meta);
static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
@@ -5161,7 +5339,7 @@ static int check_stack_read_var_off(struct bpf_verifier_env *env,
/* Note that we pass a NULL meta, so raw access will not be permitted.
*/
err = check_stack_range_initialized(env, ptr_regno, off, size,
- false, ACCESS_DIRECT, NULL);
+ false, BPF_READ, NULL);
if (err)
return err;
@@ -5501,13 +5679,15 @@ static bool in_sleepable(struct bpf_verifier_env *env)
static bool in_rcu_cs(struct bpf_verifier_env *env)
{
return env->cur_state->active_rcu_lock ||
- cur_func(env)->active_locks ||
+ env->cur_state->active_locks ||
!in_sleepable(env);
}
/* Once GCC supports btf_type_tag the following mechanism will be replaced with tag check */
BTF_SET_START(rcu_protected_types)
+#ifdef CONFIG_NET
BTF_ID(struct, prog_test_ref_kfunc)
+#endif
#ifdef CONFIG_CGROUPS
BTF_ID(struct, cgroup)
#endif
@@ -5515,7 +5695,9 @@ BTF_ID(struct, cgroup)
BTF_ID(struct, bpf_cpumask)
#endif
BTF_ID(struct, task_struct)
+#ifdef CONFIG_CRYPTO
BTF_ID(struct, bpf_crypto_ctx)
+#endif
BTF_SET_END(rcu_protected_types)
static bool rcu_protected_object(const struct btf *btf, u32 btf_id)
@@ -7011,7 +7193,7 @@ static int check_stack_slot_within_bounds(struct bpf_verifier_env *env,
static int check_stack_access_within_bounds(
struct bpf_verifier_env *env,
int regno, int off, int access_size,
- enum bpf_access_src src, enum bpf_access_type type)
+ enum bpf_access_type type)
{
struct bpf_reg_state *regs = cur_regs(env);
struct bpf_reg_state *reg = regs + regno;
@@ -7020,10 +7202,7 @@ static int check_stack_access_within_bounds(
int err;
char *err_extra;
- if (src == ACCESS_HELPER)
- /* We don't know if helpers are reading or writing (or both). */
- err_extra = " indirect access to";
- else if (type == BPF_READ)
+ if (type == BPF_READ)
err_extra = " read from";
else
err_extra = " write to";
@@ -7241,7 +7420,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
} else if (reg->type == PTR_TO_STACK) {
/* Basic bounds checks. */
- err = check_stack_access_within_bounds(env, regno, off, size, ACCESS_DIRECT, t);
+ err = check_stack_access_within_bounds(env, regno, off, size, t);
if (err)
return err;
@@ -7461,13 +7640,11 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
static int check_stack_range_initialized(
struct bpf_verifier_env *env, int regno, int off,
int access_size, bool zero_size_allowed,
- enum bpf_access_src type, struct bpf_call_arg_meta *meta)
+ enum bpf_access_type type, struct bpf_call_arg_meta *meta)
{
struct bpf_reg_state *reg = reg_state(env, regno);
struct bpf_func_state *state = func(env, reg);
int err, min_off, max_off, i, j, slot, spi;
- char *err_extra = type == ACCESS_HELPER ? " indirect" : "";
- enum bpf_access_type bounds_check_type;
/* Some accesses can write anything into the stack, others are
* read-only.
*/
@@ -7478,18 +7655,10 @@ static int check_stack_range_initialized(
return -EACCES;
}
- if (type == ACCESS_HELPER) {
- /* The bounds checks for writes are more permissive than for
- * reads. However, if raw_mode is not set, we'll do extra
- * checks below.
- */
- bounds_check_type = BPF_WRITE;
+ if (type == BPF_WRITE)
clobber = true;
- } else {
- bounds_check_type = BPF_READ;
- }
- err = check_stack_access_within_bounds(env, regno, off, access_size,
- type, bounds_check_type);
+
+ err = check_stack_access_within_bounds(env, regno, off, access_size, type);
if (err)
return err;
@@ -7506,8 +7675,8 @@ static int check_stack_range_initialized(
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "R%d%s variable offset stack access prohibited for !root, var_off=%s\n",
- regno, err_extra, tn_buf);
+ verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n",
+ regno, tn_buf);
return -EACCES;
}
/* Only initialized buffer on stack is allowed to be accessed
@@ -7560,7 +7729,7 @@ static int check_stack_range_initialized(
slot = -i - 1;
spi = slot / BPF_REG_SIZE;
if (state->allocated_stack <= slot) {
- verbose(env, "verifier bug: allocated_stack too small");
+ verbose(env, "verifier bug: allocated_stack too small\n");
return -EFAULT;
}
@@ -7588,14 +7757,14 @@ static int check_stack_range_initialized(
}
if (tnum_is_const(reg->var_off)) {
- verbose(env, "invalid%s read from stack R%d off %d+%d size %d\n",
- err_extra, regno, min_off, i - min_off, access_size);
+ verbose(env, "invalid read from stack R%d off %d+%d size %d\n",
+ regno, min_off, i - min_off, access_size);
} else {
char tn_buf[48];
tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose(env, "invalid%s read from stack R%d var_off %s+%d size %d\n",
- err_extra, regno, tn_buf, i - min_off, access_size);
+ verbose(env, "invalid read from stack R%d var_off %s+%d size %d\n",
+ regno, tn_buf, i - min_off, access_size);
}
return -EACCES;
mark:
@@ -7670,7 +7839,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
return check_stack_range_initialized(
env,
regno, reg->off, access_size,
- zero_size_allowed, ACCESS_HELPER, meta);
+ zero_size_allowed, access_type, meta);
case PTR_TO_BTF_ID:
return check_ptr_to_btf_access(env, regs, regno, reg->off,
access_size, BPF_READ, -1);
@@ -7835,15 +8004,15 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg
* Since only one bpf_spin_lock is allowed the checks are simpler than
* reg_is_refcounted() logic. The verifier needs to remember only
* one spin_lock instead of array of acquired_refs.
- * cur_func(env)->active_locks remembers which map value element or allocated
+ * env->cur_state->active_locks remembers which map value element or allocated
* object got locked and clears it after bpf_spin_unlock.
*/
static int process_spin_lock(struct bpf_verifier_env *env, int regno,
bool is_lock)
{
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ struct bpf_verifier_state *cur = env->cur_state;
bool is_const = tnum_is_const(reg->var_off);
- struct bpf_func_state *cur = cur_func(env);
u64 val = reg->var_off.value;
struct bpf_map *map = NULL;
struct btf *btf = NULL;
@@ -7910,7 +8079,7 @@ static int process_spin_lock(struct bpf_verifier_env *env, int regno,
return -EINVAL;
}
- if (release_lock_state(cur_func(env), REF_TYPE_LOCK, reg->id, ptr)) {
+ if (release_lock_state(env->cur_state, REF_TYPE_LOCK, reg->id, ptr)) {
verbose(env, "bpf_spin_unlock of different lock\n");
return -EINVAL;
}
@@ -8982,6 +9151,69 @@ static int check_reg_const_str(struct bpf_verifier_env *env,
return 0;
}
+/* Returns constant key value in `value` if possible, else negative error */
+static int get_constant_map_key(struct bpf_verifier_env *env,
+ struct bpf_reg_state *key,
+ u32 key_size,
+ s64 *value)
+{
+ struct bpf_func_state *state = func(env, key);
+ struct bpf_reg_state *reg;
+ int slot, spi, off;
+ int spill_size = 0;
+ int zero_size = 0;
+ int stack_off;
+ int i, err;
+ u8 *stype;
+
+ if (!env->bpf_capable)
+ return -EOPNOTSUPP;
+ if (key->type != PTR_TO_STACK)
+ return -EOPNOTSUPP;
+ if (!tnum_is_const(key->var_off))
+ return -EOPNOTSUPP;
+
+ stack_off = key->off + key->var_off.value;
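+ /* Convert the negative stack offset into a slot index (spi) and a byte
+ * offset within that 8-byte slot.
+ */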
+ slot = -stack_off - 1;
+ spi = slot / BPF_REG_SIZE;
+ off = slot % BPF_REG_SIZE;
+ stype = state->stack[spi].slot_type;
+
+ /* First handle precisely tracked STACK_ZERO */
+ for (i = off; i >= 0 && stype[i] == STACK_ZERO; i--)
+ zero_size++;
+ if (zero_size >= key_size) {
+ *value = 0;
+ return 0;
+ }
+
+ /* Check that stack contains a scalar spill of expected size */
+ if (!is_spilled_scalar_reg(&state->stack[spi]))
+ return -EOPNOTSUPP;
+ for (i = off; i >= 0 && stype[i] == STACK_SPILL; i--)
+ spill_size++;
+ if (spill_size != key_size)
+ return -EOPNOTSUPP;
+
+ reg = &state->stack[spi].spilled_ptr;
+ if (!tnum_is_const(reg->var_off))
+ /* Stack value not statically known */
+ return -EOPNOTSUPP;
+
+ /* We are relying on a constant value. So mark as precise
+ * to prevent pruning on it.
+ */
+ bt_set_frame_slot(&env->bt, key->frameno, spi);
+ err = mark_chain_precision_batch(env);
+ if (err < 0)
+ return err;
+
+ *value = reg->var_off.value;
+ return 0;
+}
+
+static bool can_elide_value_nullness(enum bpf_map_type type);
+
static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
struct bpf_call_arg_meta *meta,
const struct bpf_func_proto *fn,
@@ -8992,6 +9224,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
enum bpf_arg_type arg_type = fn->arg_type[arg];
enum bpf_reg_type type = reg->type;
u32 *arg_btf_id = NULL;
+ u32 key_size;
int err = 0;
if (arg_type == ARG_DONTCARE)
@@ -9125,8 +9358,20 @@ skip_type_check:
verbose(env, "invalid map_ptr to access map->key\n");
return -EACCES;
}
- err = check_helper_mem_access(env, regno, meta->map_ptr->key_size,
- BPF_READ, false, NULL);
+ key_size = meta->map_ptr->key_size;
+ err = check_helper_mem_access(env, regno, key_size, BPF_READ, false, NULL);
+ if (err)
+ return err;
+ if (can_elide_value_nullness(meta->map_ptr->map_type)) {
+ err = get_constant_map_key(env, reg, key_size, &meta->const_map_key);
+ if (err < 0) {
+ meta->const_map_key = -1;
+ if (err == -EOPNOTSUPP)
+ err = 0;
+ else
+ return err;
+ }
+ }
break;
case ARG_PTR_TO_MAP_VALUE:
if (type_may_be_null(arg_type) && register_is_null(reg))
@@ -9658,21 +9903,38 @@ static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range
reg->range = AT_PKT_END;
}
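+/* Remove the reference with the given ref_obj_id from the state without
+ * invalidating registers that still carry it; callers handle the marking.
+ */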
+static int release_reference_nomark(struct bpf_verifier_state *state, int ref_obj_id)
+{
+ int i;
+
+ for (i = 0; i < state->acquired_refs; i++) {
+ if (state->refs[i].type != REF_TYPE_PTR)
+ continue;
+ if (state->refs[i].id == ref_obj_id) {
+ release_reference_state(state, i);
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
/* The pointer with the specified id has released its reference to kernel
* resources. Identify all copies of the same pointer and clear the reference.
+ *
+ * This is the release function corresponding to acquire_reference(). Idempotent.
*/
-static int release_reference(struct bpf_verifier_env *env,
- int ref_obj_id)
+static int release_reference(struct bpf_verifier_env *env, int ref_obj_id)
{
+ struct bpf_verifier_state *vstate = env->cur_state;
struct bpf_func_state *state;
struct bpf_reg_state *reg;
int err;
- err = release_reference_state(cur_func(env), ref_obj_id);
+ err = release_reference_nomark(vstate, ref_obj_id);
if (err)
return err;
- bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
+ bpf_for_each_reg_in_vstate(vstate, state, reg, ({
if (reg->ref_obj_id == ref_obj_id)
mark_reg_invalid(env, reg);
}));
@@ -9746,9 +10008,7 @@ static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int calls
callsite,
state->curframe + 1 /* frameno within this callchain */,
subprog /* subprog number within this prog */);
- /* Transfer references to the callee */
- err = copy_reference_state(callee, caller);
- err = err ?: set_callee_state_cb(env, caller, callee, callsite);
+ err = set_callee_state_cb(env, caller, callee, callsite);
if (err)
goto err_out;
@@ -9978,19 +10238,25 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
const char *sub_name = subprog_name(env, subprog);
/* Only global subprogs cannot be called with a lock held. */
- if (cur_func(env)->active_locks) {
+ if (env->cur_state->active_locks) {
verbose(env, "global function calls are not allowed while holding a lock,\n"
"use static function instead\n");
return -EINVAL;
}
/* Only global subprogs cannot be called with preemption disabled. */
- if (env->cur_state->active_preempt_lock) {
+ if (env->cur_state->active_preempt_locks) {
verbose(env, "global function calls are not allowed with preemption disabled,\n"
"use static function instead\n");
return -EINVAL;
}
+ if (env->cur_state->active_irq_id) {
+ verbose(env, "global function calls are not allowed with IRQs disabled,\n"
+ "use static function instead\n");
+ return -EINVAL;
+ }
+
if (err) {
verbose(env, "Caller passes invalid args into func#%d ('%s')\n",
subprog, sub_name);
@@ -10027,9 +10293,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "caller:\n");
- print_verifier_state(env, caller, true);
+ print_verifier_state(env, state, caller->frameno, true);
verbose(env, "callee:\n");
- print_verifier_state(env, state->frame[state->curframe], true);
+ print_verifier_state(env, state, state->curframe, true);
}
return 0;
@@ -10321,11 +10587,6 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
caller->regs[BPF_REG_0] = *r0;
}
- /* Transfer references to the caller */
- err = copy_reference_state(caller, callee);
- if (err)
- return err;
-
/* for callbacks like bpf_loop or bpf_for_each_map_elem go back to callsite,
* there function call logic would reschedule callback visit. If iteration
* converges is_state_visited() would prune that visit eventually.
@@ -10338,9 +10599,9 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "returning from callee:\n");
- print_verifier_state(env, callee, true);
+ print_verifier_state(env, state, callee->frameno, true);
verbose(env, "to caller at %d:\n", *insn_idx);
- print_verifier_state(env, caller, true);
+ print_verifier_state(env, state, caller->frameno, true);
}
/* clear everything in the callee. In case of exceptional exits using
* bpf_throw, this will be done by copy_verifier_state for extra frames. */
@@ -10490,11 +10751,11 @@ record_func_key(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta,
static int check_reference_leak(struct bpf_verifier_env *env, bool exception_exit)
{
- struct bpf_func_state *state = cur_func(env);
+ struct bpf_verifier_state *state = env->cur_state;
bool refs_lingering = false;
int i;
- if (!exception_exit && state->frameno)
+ if (!exception_exit && cur_func(env)->frameno)
return 0;
for (i = 0; i < state->acquired_refs; i++) {
@@ -10511,7 +10772,7 @@ static int check_resource_leak(struct bpf_verifier_env *env, bool exception_exit
{
int err;
- if (check_lock && cur_func(env)->active_locks) {
+ if (check_lock && env->cur_state->active_locks) {
verbose(env, "%s cannot be used inside bpf_spin_lock-ed region\n", prefix);
return -EINVAL;
}
@@ -10522,12 +10783,17 @@ static int check_resource_leak(struct bpf_verifier_env *env, bool exception_exit
return err;
}
+ if (check_lock && env->cur_state->active_irq_id) {
+ verbose(env, "%s cannot be used inside bpf_local_irq_save-ed region\n", prefix);
+ return -EINVAL;
+ }
+
if (check_lock && env->cur_state->active_rcu_lock) {
verbose(env, "%s cannot be used inside bpf_rcu_read_lock-ed region\n", prefix);
return -EINVAL;
}
- if (check_lock && env->cur_state->active_preempt_lock) {
+ if (check_lock && env->cur_state->active_preempt_locks) {
verbose(env, "%s cannot be used inside bpf_preempt_disable-ed region\n", prefix);
return -EINVAL;
}
@@ -10629,6 +10895,21 @@ static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno
state->callback_subprogno == subprogno);
}
+/* Returns whether or not the given map type can potentially elide the
+ * lookup return value nullness check. This is possible if the key
+ * is statically known.
+ */
+static bool can_elide_value_nullness(enum bpf_map_type type)
+{
+ switch (type) {
+ case BPF_MAP_TYPE_ARRAY:
+ case BPF_MAP_TYPE_PERCPU_ARRAY:
+ return true;
+ default:
+ return false;
+ }
+}
+
static int get_helper_proto(struct bpf_verifier_env *env, int func_id,
const struct bpf_func_proto **ptr)
{
@@ -10715,7 +10996,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
}
- if (env->cur_state->active_preempt_lock) {
+ if (env->cur_state->active_preempt_locks) {
if (fn->might_sleep) {
verbose(env, "sleepable helper %s#%d in non-preemptible region\n",
func_id_name(func_id), func_id);
@@ -10726,6 +11007,17 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
}
+ if (env->cur_state->active_irq_id) {
+ if (fn->might_sleep) {
+ verbose(env, "sleepable helper %s#%d in IRQ-disabled region\n",
+ func_id_name(func_id), func_id);
+ return -EINVAL;
+ }
+
+ if (in_sleepable(env) && is_storage_get_function(func_id))
+ env->insn_aux_data[insn_idx].storage_get_func_atomic = true;
+ }
+
meta.func_id = func_id;
/* check args */
for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
@@ -10772,7 +11064,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
struct bpf_func_state *state;
struct bpf_reg_state *reg;
- err = release_reference_state(cur_func(env), ref_obj_id);
+ err = release_reference_nomark(env->cur_state, ref_obj_id);
if (!err) {
bpf_for_each_reg_in_vstate(env->cur_state, state, reg, ({
if (reg->ref_obj_id == ref_obj_id) {
@@ -10984,10 +11276,17 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
"kernel subsystem misconfigured verifier\n");
return -EINVAL;
}
+
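+ /* A constant, in-bounds key into an array map always has a value, so
+ * the lookup result cannot be NULL.
+ */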
+ if (func_id == BPF_FUNC_map_lookup_elem &&
+ can_elide_value_nullness(meta.map_ptr->map_type) &&
+ meta.const_map_key >= 0 &&
+ meta.const_map_key < meta.map_ptr->max_entries)
+ ret_flag &= ~PTR_MAYBE_NULL;
+
regs[BPF_REG_0].map_ptr = meta.map_ptr;
regs[BPF_REG_0].map_uid = meta.map_uid;
regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag;
- if (!type_may_be_null(ret_type) &&
+ if (!type_may_be_null(ret_flag) &&
btf_record_has_field(meta.map_ptr->record, BPF_SPIN_LOCK)) {
regs[BPF_REG_0].id = ++env->id_gen;
}
@@ -11105,7 +11404,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
/* For release_reference() */
regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id;
} else if (is_acquire_function(func_id, meta.map_ptr)) {
- int id = acquire_reference_state(env, insn_idx);
+ int id = acquire_reference(env, insn_idx);
if (id < 0)
return id;
@@ -11287,6 +11586,11 @@ static bool is_kfunc_arg_const_str(const struct btf *btf, const struct btf_param
return btf_param_match_suffix(btf, arg, "__str");
}
+static bool is_kfunc_arg_irq_flag(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__irq_flag");
+}
+
static bool is_kfunc_arg_scalar_with_name(const struct btf *btf,
const struct btf_param *arg,
const char *name)
@@ -11440,6 +11744,7 @@ enum kfunc_ptr_arg_type {
KF_ARG_PTR_TO_CONST_STR,
KF_ARG_PTR_TO_MAP,
KF_ARG_PTR_TO_WORKQUEUE,
+ KF_ARG_PTR_TO_IRQ_FLAG,
};
enum special_kfunc_type {
@@ -11471,6 +11776,11 @@ enum special_kfunc_type {
KF_bpf_iter_css_task_new,
KF_bpf_session_cookie,
KF_bpf_get_kmem_cache,
+ KF_bpf_local_irq_save,
+ KF_bpf_local_irq_restore,
+ KF_bpf_iter_num_new,
+ KF_bpf_iter_num_next,
+ KF_bpf_iter_num_destroy,
};
BTF_SET_START(special_kfunc_set)
@@ -11486,8 +11796,10 @@ BTF_ID(func, bpf_rdonly_cast)
BTF_ID(func, bpf_rbtree_remove)
BTF_ID(func, bpf_rbtree_add_impl)
BTF_ID(func, bpf_rbtree_first)
+#ifdef CONFIG_NET
BTF_ID(func, bpf_dynptr_from_skb)
BTF_ID(func, bpf_dynptr_from_xdp)
+#endif
BTF_ID(func, bpf_dynptr_slice)
BTF_ID(func, bpf_dynptr_slice_rdwr)
BTF_ID(func, bpf_dynptr_clone)
@@ -11515,8 +11827,13 @@ BTF_ID(func, bpf_rcu_read_unlock)
BTF_ID(func, bpf_rbtree_remove)
BTF_ID(func, bpf_rbtree_add_impl)
BTF_ID(func, bpf_rbtree_first)
+#ifdef CONFIG_NET
BTF_ID(func, bpf_dynptr_from_skb)
BTF_ID(func, bpf_dynptr_from_xdp)
+#else
+BTF_ID_UNUSED
+BTF_ID_UNUSED
+#endif
BTF_ID(func, bpf_dynptr_slice)
BTF_ID(func, bpf_dynptr_slice_rdwr)
BTF_ID(func, bpf_dynptr_clone)
@@ -11537,6 +11854,11 @@ BTF_ID(func, bpf_session_cookie)
BTF_ID_UNUSED
#endif
BTF_ID(func, bpf_get_kmem_cache)
+BTF_ID(func, bpf_local_irq_save)
+BTF_ID(func, bpf_local_irq_restore)
+BTF_ID(func, bpf_iter_num_new)
+BTF_ID(func, bpf_iter_num_next)
+BTF_ID(func, bpf_iter_num_destroy)
static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta)
{
@@ -11627,6 +11949,9 @@ get_kfunc_ptr_arg_type(struct bpf_verifier_env *env,
if (is_kfunc_arg_wq(meta->btf, &args[argno]))
return KF_ARG_PTR_TO_WORKQUEUE;
+ if (is_kfunc_arg_irq_flag(meta->btf, &args[argno]))
+ return KF_ARG_PTR_TO_IRQ_FLAG;
+
if ((base_type(reg->type) == PTR_TO_BTF_ID || reg2btf_ids[base_type(reg->type)])) {
if (!btf_type_is_struct(ref_t)) {
verbose(env, "kernel function %s args#%d pointer type %s %s is not supported\n",
@@ -11730,11 +12055,59 @@ static int process_kf_arg_ptr_to_btf_id(struct bpf_verifier_env *env,
return 0;
}
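+/* Validate the __irq_flag argument of bpf_local_irq_{save,restore}(): mark
+ * the backing stack slot on save, unmark it again on restore.
+ */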
+static int process_irq_flag(struct bpf_verifier_env *env, int regno,
+ struct bpf_kfunc_call_arg_meta *meta)
+{
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+ bool irq_save;
+ int err;
+
+ if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_save]) {
+ irq_save = true;
+ } else if (meta->func_id == special_kfunc_list[KF_bpf_local_irq_restore]) {
+ irq_save = false;
+ } else {
+ verbose(env, "verifier internal error: unknown irq flags kfunc\n");
+ return -EFAULT;
+ }
+
+ if (irq_save) {
+ if (!is_irq_flag_reg_valid_uninit(env, reg)) {
+ verbose(env, "expected uninitialized irq flag as arg#%d\n", regno - 1);
+ return -EINVAL;
+ }
+
+ err = check_mem_access(env, env->insn_idx, regno, 0, BPF_DW, BPF_WRITE, -1, false, false);
+ if (err)
+ return err;
+
+ err = mark_stack_slot_irq_flag(env, meta, reg, env->insn_idx);
+ if (err)
+ return err;
+ } else {
+ err = is_irq_flag_reg_valid_init(env, reg);
+ if (err) {
+ verbose(env, "expected an initialized irq flag as arg#%d\n", regno - 1);
+ return err;
+ }
+
+ err = mark_irq_flag_read(env, reg);
+ if (err)
+ return err;
+
+ err = unmark_stack_slot_irq_flag(env, reg);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+
static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state *reg)
{
struct btf_record *rec = reg_btf_record(reg);
- if (!cur_func(env)->active_locks) {
+ if (!env->cur_state->active_locks) {
verbose(env, "verifier internal error: ref_set_non_owning w/o active lock\n");
return -EFAULT;
}
@@ -11753,12 +12126,11 @@ static int ref_set_non_owning(struct bpf_verifier_env *env, struct bpf_reg_state
static int ref_convert_owning_non_owning(struct bpf_verifier_env *env, u32 ref_obj_id)
{
- struct bpf_func_state *state, *unused;
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_func_state *unused;
struct bpf_reg_state *reg;
int i;
- state = cur_func(env);
-
if (!ref_obj_id) {
verbose(env, "verifier internal error: ref_obj_id is zero for "
"owning -> non-owning conversion\n");
@@ -11848,9 +12220,9 @@ static int check_reg_allocation_locked(struct bpf_verifier_env *env, struct bpf_
}
id = reg->id;
- if (!cur_func(env)->active_locks)
+ if (!env->cur_state->active_locks)
return -EINVAL;
- s = find_lock_state(env, REF_TYPE_LOCK, id, ptr);
+ s = find_lock_state(env->cur_state, REF_TYPE_LOCK, id, ptr);
if (!s) {
verbose(env, "held lock and object are not in the same allocation\n");
return -EINVAL;
@@ -11873,12 +12245,24 @@ static bool is_bpf_rbtree_api_kfunc(u32 btf_id)
btf_id == special_kfunc_list[KF_bpf_rbtree_first];
}
+static bool is_bpf_iter_num_api_kfunc(u32 btf_id)
+{
+ return btf_id == special_kfunc_list[KF_bpf_iter_num_new] ||
+ btf_id == special_kfunc_list[KF_bpf_iter_num_next] ||
+ btf_id == special_kfunc_list[KF_bpf_iter_num_destroy];
+}
+
static bool is_bpf_graph_api_kfunc(u32 btf_id)
{
return is_bpf_list_api_kfunc(btf_id) || is_bpf_rbtree_api_kfunc(btf_id) ||
btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl];
}
+static bool kfunc_spin_allowed(u32 btf_id)
+{
+ return is_bpf_graph_api_kfunc(btf_id) || is_bpf_iter_num_api_kfunc(btf_id);
+}
+
static bool is_sync_callback_calling_kfunc(u32 btf_id)
{
return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl];
@@ -12307,6 +12691,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
case KF_ARG_PTR_TO_REFCOUNTED_KPTR:
case KF_ARG_PTR_TO_CONST_STR:
case KF_ARG_PTR_TO_WORKQUEUE:
+ case KF_ARG_PTR_TO_IRQ_FLAG:
break;
default:
WARN_ON_ONCE(1);
@@ -12596,6 +12981,15 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
if (ret < 0)
return ret;
break;
+ case KF_ARG_PTR_TO_IRQ_FLAG:
+ if (reg->type != PTR_TO_STACK) {
+ verbose(env, "arg#%d doesn't point to an irq flag on stack\n", i);
+ return -EINVAL;
+ }
+ ret = process_irq_flag(env, regno, meta);
+ if (ret < 0)
+ return ret;
+ break;
}
}
@@ -12760,22 +13154,27 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EINVAL;
}
- if (env->cur_state->active_preempt_lock) {
+ if (env->cur_state->active_preempt_locks) {
if (preempt_disable) {
- env->cur_state->active_preempt_lock++;
+ env->cur_state->active_preempt_locks++;
} else if (preempt_enable) {
- env->cur_state->active_preempt_lock--;
+ env->cur_state->active_preempt_locks--;
} else if (sleepable) {
verbose(env, "kernel func %s is sleepable within non-preemptible region\n", func_name);
return -EACCES;
}
} else if (preempt_disable) {
- env->cur_state->active_preempt_lock++;
+ env->cur_state->active_preempt_locks++;
} else if (preempt_enable) {
verbose(env, "unmatched attempt to enable preemption (kernel function %s)\n", func_name);
return -EINVAL;
}
+ if (env->cur_state->active_irq_id && sleepable) {
+ verbose(env, "kernel func %s is sleepable within IRQ-disabled region\n", func_name);
+ return -EACCES;
+ }
+
/* In case of release function, we get register number of refcounted
* PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
*/
@@ -13069,7 +13468,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
mark_btf_func_reg_size(env, BPF_REG_0, sizeof(void *));
if (is_kfunc_acquire(&meta)) {
- int id = acquire_reference_state(env, insn_idx);
+ int id = acquire_reference(env, insn_idx);
if (id < 0)
return id;
@@ -13794,64 +14193,56 @@ static void scalar_min_max_sub(struct bpf_reg_state *dst_reg,
static void scalar32_min_max_mul(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s32 smin_val = src_reg->s32_min_value;
- u32 umin_val = src_reg->u32_min_value;
- u32 umax_val = src_reg->u32_max_value;
+ s32 *dst_smin = &dst_reg->s32_min_value;
+ s32 *dst_smax = &dst_reg->s32_max_value;
+ u32 *dst_umin = &dst_reg->u32_min_value;
+ u32 *dst_umax = &dst_reg->u32_max_value;
+ s32 tmp_prod[4];
- if (smin_val < 0 || dst_reg->s32_min_value < 0) {
- /* Ain't nobody got time to multiply that sign */
- __mark_reg32_unbounded(dst_reg);
- return;
- }
- /* Both values are positive, so we can work with unsigned and
- * copy the result to signed (unless it exceeds S32_MAX).
- */
- if (umax_val > U16_MAX || dst_reg->u32_max_value > U16_MAX) {
- /* Potential overflow, we know nothing */
- __mark_reg32_unbounded(dst_reg);
- return;
+ if (check_mul_overflow(*dst_umax, src_reg->u32_max_value, dst_umax) ||
+ check_mul_overflow(*dst_umin, src_reg->u32_min_value, dst_umin)) {
+ /* Overflow possible, we know nothing */
+ *dst_umin = 0;
+ *dst_umax = U32_MAX;
}
- dst_reg->u32_min_value *= umin_val;
- dst_reg->u32_max_value *= umax_val;
- if (dst_reg->u32_max_value > S32_MAX) {
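+ /* If none of the endpoint products overflows, the signed result range
+ * is the min/max over the four products of the interval endpoints.
+ */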
+ if (check_mul_overflow(*dst_smin, src_reg->s32_min_value, &tmp_prod[0]) ||
+ check_mul_overflow(*dst_smin, src_reg->s32_max_value, &tmp_prod[1]) ||
+ check_mul_overflow(*dst_smax, src_reg->s32_min_value, &tmp_prod[2]) ||
+ check_mul_overflow(*dst_smax, src_reg->s32_max_value, &tmp_prod[3])) {
/* Overflow possible, we know nothing */
- dst_reg->s32_min_value = S32_MIN;
- dst_reg->s32_max_value = S32_MAX;
+ *dst_smin = S32_MIN;
+ *dst_smax = S32_MAX;
} else {
- dst_reg->s32_min_value = dst_reg->u32_min_value;
- dst_reg->s32_max_value = dst_reg->u32_max_value;
+ *dst_smin = min_array(tmp_prod, 4);
+ *dst_smax = max_array(tmp_prod, 4);
}
}
static void scalar_min_max_mul(struct bpf_reg_state *dst_reg,
struct bpf_reg_state *src_reg)
{
- s64 smin_val = src_reg->smin_value;
- u64 umin_val = src_reg->umin_value;
- u64 umax_val = src_reg->umax_value;
+ s64 *dst_smin = &dst_reg->smin_value;
+ s64 *dst_smax = &dst_reg->smax_value;
+ u64 *dst_umin = &dst_reg->umin_value;
+ u64 *dst_umax = &dst_reg->umax_value;
+ s64 tmp_prod[4];
- if (smin_val < 0 || dst_reg->smin_value < 0) {
- /* Ain't nobody got time to multiply that sign */
- __mark_reg64_unbounded(dst_reg);
- return;
- }
- /* Both values are positive, so we can work with unsigned and
- * copy the result to signed (unless it exceeds S64_MAX).
- */
- if (umax_val > U32_MAX || dst_reg->umax_value > U32_MAX) {
- /* Potential overflow, we know nothing */
- __mark_reg64_unbounded(dst_reg);
- return;
+ if (check_mul_overflow(*dst_umax, src_reg->umax_value, dst_umax) ||
+ check_mul_overflow(*dst_umin, src_reg->umin_value, dst_umin)) {
+ /* Overflow possible, we know nothing */
+ *dst_umin = 0;
+ *dst_umax = U64_MAX;
}
- dst_reg->umin_value *= umin_val;
- dst_reg->umax_value *= umax_val;
- if (dst_reg->umax_value > S64_MAX) {
+ if (check_mul_overflow(*dst_smin, src_reg->smin_value, &tmp_prod[0]) ||
+ check_mul_overflow(*dst_smin, src_reg->smax_value, &tmp_prod[1]) ||
+ check_mul_overflow(*dst_smax, src_reg->smin_value, &tmp_prod[2]) ||
+ check_mul_overflow(*dst_smax, src_reg->smax_value, &tmp_prod[3])) {
/* Overflow possible, we know nothing */
- dst_reg->smin_value = S64_MIN;
- dst_reg->smax_value = S64_MAX;
+ *dst_smin = S64_MIN;
+ *dst_smax = S64_MAX;
} else {
- dst_reg->smin_value = dst_reg->umin_value;
- dst_reg->smax_value = dst_reg->umax_value;
+ *dst_smin = min_array(tmp_prod, 4);
+ *dst_smax = max_array(tmp_prod, 4);
}
}
@@ -14462,12 +14853,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
/* Got here implies adding two SCALAR_VALUEs */
if (WARN_ON_ONCE(ptr_reg)) {
- print_verifier_state(env, state, true);
+ print_verifier_state(env, vstate, vstate->curframe, true);
verbose(env, "verifier internal error: unexpected ptr_reg\n");
return -EINVAL;
}
if (WARN_ON(!src_reg)) {
- print_verifier_state(env, state, true);
+ print_verifier_state(env, vstate, vstate->curframe, true);
verbose(env, "verifier internal error: no src_reg\n");
return -EINVAL;
}
@@ -15365,7 +15756,7 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno,
* No one could have freed the reference state before
* doing the NULL check.
*/
- WARN_ON_ONCE(release_reference_state(state, id));
+ WARN_ON_ONCE(release_reference_nomark(vstate, id));
bpf_for_each_reg_in_vstate(vstate, state, reg, ({
mark_ptr_or_null_reg(state, reg, id, is_null);
@@ -15596,9 +15987,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
if (insn->code != (BPF_JMP | BPF_JCOND) ||
insn->src_reg != BPF_MAY_GOTO ||
- insn->dst_reg || insn->imm || insn->off == 0) {
- verbose(env, "invalid may_goto off %d imm %d\n",
- insn->off, insn->imm);
+ insn->dst_reg || insn->imm) {
+ verbose(env, "invalid may_goto imm %d\n", insn->imm);
return -EINVAL;
}
prev_st = find_prev_entry(env, cur_st->parent, idx);
@@ -15675,7 +16065,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
*insn_idx))
return -EFAULT;
if (env->log.level & BPF_LOG_LEVEL)
- print_insn_state(env, this_branch->frame[this_branch->curframe]);
+ print_insn_state(env, this_branch, this_branch->curframe);
*insn_idx += insn->off;
return 0;
} else if (pred == 0) {
@@ -15689,7 +16079,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
*insn_idx))
return -EFAULT;
if (env->log.level & BPF_LOG_LEVEL)
- print_insn_state(env, this_branch->frame[this_branch->curframe]);
+ print_insn_state(env, this_branch, this_branch->curframe);
return 0;
}
@@ -15806,7 +16196,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
return -EACCES;
}
if (env->log.level & BPF_LOG_LEVEL)
- print_insn_state(env, this_branch->frame[this_branch->curframe]);
+ print_insn_state(env, this_branch, this_branch->curframe);
return 0;
}
@@ -17734,6 +18124,12 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
return false;
break;
+ case STACK_IRQ_FLAG:
+ old_reg = &old->stack[spi].spilled_ptr;
+ cur_reg = &cur->stack[spi].spilled_ptr;
+ if (!check_ids(old_reg->ref_obj_id, cur_reg->ref_obj_id, idmap))
+ return false;
+ break;
case STACK_MISC:
case STACK_ZERO:
case STACK_INVALID:
@@ -17746,7 +18142,7 @@ static bool stacksafe(struct bpf_verifier_env *env, struct bpf_func_state *old,
return true;
}
-static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur,
+static bool refsafe(struct bpf_verifier_state *old, struct bpf_verifier_state *cur,
struct bpf_idmap *idmap)
{
int i;
@@ -17754,12 +18150,25 @@ static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur,
if (old->acquired_refs != cur->acquired_refs)
return false;
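+ /* Lock, preemption and RCU nesting must match exactly; the active IRQ
+ * ids must correspond under the id map.
+ */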
+ if (old->active_locks != cur->active_locks)
+ return false;
+
+ if (old->active_preempt_locks != cur->active_preempt_locks)
+ return false;
+
+ if (old->active_rcu_lock != cur->active_rcu_lock)
+ return false;
+
+ if (!check_ids(old->active_irq_id, cur->active_irq_id, idmap))
+ return false;
+
for (i = 0; i < old->acquired_refs; i++) {
if (!check_ids(old->refs[i].id, cur->refs[i].id, idmap) ||
old->refs[i].type != cur->refs[i].type)
return false;
switch (old->refs[i].type) {
case REF_TYPE_PTR:
+ case REF_TYPE_IRQ:
break;
case REF_TYPE_LOCK:
if (old->refs[i].ptr != cur->refs[i].ptr)
@@ -17816,9 +18225,6 @@ static bool func_states_equal(struct bpf_verifier_env *env, struct bpf_func_stat
if (!stacksafe(env, old, cur, &env->idmap_scratch, exact))
return false;
- if (!refsafe(old, cur, &env->idmap_scratch))
- return false;
-
return true;
}
@@ -17846,13 +18252,10 @@ static bool states_equal(struct bpf_verifier_env *env,
if (old->speculative && !cur->speculative)
return false;
- if (old->active_rcu_lock != cur->active_rcu_lock)
- return false;
-
- if (old->active_preempt_lock != cur->active_preempt_lock)
+ if (old->in_sleepable != cur->in_sleepable)
return false;
- if (old->in_sleepable != cur->in_sleepable)
+ if (!refsafe(old, cur, &env->idmap_scratch))
return false;
/* for states to be equal callsites have to be the same
@@ -18245,9 +18648,9 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
verbose_linfo(env, insn_idx, "; ");
verbose(env, "infinite loop detected at insn %d\n", insn_idx);
verbose(env, "cur state:");
- print_verifier_state(env, cur->frame[cur->curframe], true);
+ print_verifier_state(env, cur, cur->curframe, true);
verbose(env, "old state:");
- print_verifier_state(env, sl->state.frame[cur->curframe], true);
+ print_verifier_state(env, &sl->state, cur->curframe, true);
return -EINVAL;
}
/* if the verifier is processing a loop, avoid adding new state
@@ -18603,7 +19006,7 @@ static int do_check(struct bpf_verifier_env *env)
env->prev_insn_idx, env->insn_idx,
env->cur_state->speculative ?
" (speculative execution)" : "");
- print_verifier_state(env, state->frame[state->curframe], true);
+ print_verifier_state(env, state, state->curframe, true);
do_print_state = false;
}
@@ -18615,7 +19018,7 @@ static int do_check(struct bpf_verifier_env *env)
};
if (verifier_state_scratched(env))
- print_insn_state(env, state->frame[state->curframe]);
+ print_insn_state(env, state, state->curframe);
verbose_linfo(env, env->insn_idx, "; ");
env->prev_log_pos = env->log.end_pos;
@@ -18747,10 +19150,10 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
- if (cur_func(env)->active_locks) {
+ if (env->cur_state->active_locks) {
if ((insn->src_reg == BPF_REG_0 && insn->imm != BPF_FUNC_spin_unlock) ||
(insn->src_reg == BPF_PSEUDO_KFUNC_CALL &&
- (insn->off != 0 || !is_bpf_graph_api_kfunc(insn->imm)))) {
+ (insn->off != 0 || !kfunc_spin_allowed(insn->imm)))) {
verbose(env, "function calls are not allowed while holding a lock\n");
return -EINVAL;
}
@@ -18803,7 +19206,7 @@ process_bpf_exit_full:
* match caller reference state when it exits.
*/
err = check_resource_leak(env, exception_exit, !env->cur_state->curframe,
- "BPF_EXIT instruction");
+ "BPF_EXIT instruction in main prog");
if (err)
return err;
@@ -18910,50 +19313,68 @@ static int find_btf_percpu_datasec(struct btf *btf)
return -ENOENT;
}
+/*
+ * Add btf to the used_btfs array and return the index. (If the btf was
+ * already added, then just return the index.) Upon successful insertion
+ * increase btf refcnt, and, if present, also refcount the corresponding
+ * kernel module.
+ */
+static int __add_used_btf(struct bpf_verifier_env *env, struct btf *btf)
+{
+ struct btf_mod_pair *btf_mod;
+ int i;
+
+ /* check whether we recorded this BTF (and maybe module) already */
+ for (i = 0; i < env->used_btf_cnt; i++)
+ if (env->used_btfs[i].btf == btf)
+ return i;
+
+ if (env->used_btf_cnt >= MAX_USED_BTFS)
+ return -E2BIG;
+
+ btf_get(btf);
+
+ btf_mod = &env->used_btfs[env->used_btf_cnt];
+ btf_mod->btf = btf;
+ btf_mod->module = NULL;
+
+ /* if we reference variables from kernel module, bump its refcount */
+ if (btf_is_module(btf)) {
+ btf_mod->module = btf_try_get_module(btf);
+ if (!btf_mod->module) {
+ btf_put(btf);
+ return -ENXIO;
+ }
+ }
+
+ return env->used_btf_cnt++;
+}
+
/* replace pseudo btf_id with kernel symbol address */
-static int check_pseudo_btf_id(struct bpf_verifier_env *env,
- struct bpf_insn *insn,
- struct bpf_insn_aux_data *aux)
+static int __check_pseudo_btf_id(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ struct bpf_insn_aux_data *aux,
+ struct btf *btf)
{
const struct btf_var_secinfo *vsi;
const struct btf_type *datasec;
- struct btf_mod_pair *btf_mod;
const struct btf_type *t;
const char *sym_name;
bool percpu = false;
u32 type, id = insn->imm;
- struct btf *btf;
s32 datasec_id;
u64 addr;
- int i, btf_fd, err;
-
- btf_fd = insn[1].imm;
- if (btf_fd) {
- btf = btf_get_by_fd(btf_fd);
- if (IS_ERR(btf)) {
- verbose(env, "invalid module BTF object FD specified.\n");
- return -EINVAL;
- }
- } else {
- if (!btf_vmlinux) {
- verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n");
- return -EINVAL;
- }
- btf = btf_vmlinux;
- btf_get(btf);
- }
+ int i;
t = btf_type_by_id(btf, id);
if (!t) {
verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id);
- err = -ENOENT;
- goto err_put;
+ return -ENOENT;
}
if (!btf_type_is_var(t) && !btf_type_is_func(t)) {
verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR or KIND_FUNC\n", id);
- err = -EINVAL;
- goto err_put;
+ return -EINVAL;
}
sym_name = btf_name_by_offset(btf, t->name_off);
@@ -18961,8 +19382,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
if (!addr) {
verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n",
sym_name);
- err = -ENOENT;
- goto err_put;
+ return -ENOENT;
}
insn[0].imm = (u32)addr;
insn[1].imm = addr >> 32;
@@ -18970,7 +19390,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
if (btf_type_is_func(t)) {
aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY;
aux->btf_var.mem_size = 0;
- goto check_btf;
+ return 0;
}
datasec_id = find_btf_percpu_datasec(btf);
@@ -19001,8 +19421,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
tname = btf_name_by_offset(btf, t->name_off);
verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n",
tname, PTR_ERR(ret));
- err = -EINVAL;
- goto err_put;
+ return -EINVAL;
}
aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY;
aux->btf_var.mem_size = tsize;
@@ -19011,39 +19430,43 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
aux->btf_var.btf = btf;
aux->btf_var.btf_id = type;
}
-check_btf:
- /* check whether we recorded this BTF (and maybe module) already */
- for (i = 0; i < env->used_btf_cnt; i++) {
- if (env->used_btfs[i].btf == btf) {
- btf_put(btf);
- return 0;
- }
- }
- if (env->used_btf_cnt >= MAX_USED_BTFS) {
- err = -E2BIG;
- goto err_put;
- }
+ return 0;
+}
- btf_mod = &env->used_btfs[env->used_btf_cnt];
- btf_mod->btf = btf;
- btf_mod->module = NULL;
+static int check_pseudo_btf_id(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ struct bpf_insn_aux_data *aux)
+{
+ struct btf *btf;
+ int btf_fd;
+ int err;
- /* if we reference variables from kernel module, bump its refcount */
- if (btf_is_module(btf)) {
- btf_mod->module = btf_try_get_module(btf);
- if (!btf_mod->module) {
- err = -ENXIO;
- goto err_put;
+ btf_fd = insn[1].imm;
+ if (btf_fd) {
+ CLASS(fd, f)(btf_fd);
+
+ btf = __btf_get_by_fd(f);
+ if (IS_ERR(btf)) {
+ verbose(env, "invalid module BTF object FD specified.\n");
+ return -EINVAL;
}
+ } else {
+ if (!btf_vmlinux) {
+ verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n");
+ return -EINVAL;
+ }
+ btf = btf_vmlinux;
}
- env->used_btf_cnt++;
+ err = __check_pseudo_btf_id(env, insn, aux, btf);
+ if (err)
+ return err;
+ err = __add_used_btf(env, btf);
+ if (err < 0)
+ return err;
return 0;
-err_put:
- btf_put(btf);
- return err;
}
static bool is_tracing_prog_type(enum bpf_prog_type type)
@@ -19060,6 +19483,12 @@ static bool is_tracing_prog_type(enum bpf_prog_type type)
}
}
+static bool bpf_map_is_cgroup_storage(struct bpf_map *map)
+{
+ return (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
+}
+
static int check_map_prog_compatibility(struct bpf_verifier_env *env,
struct bpf_map *map,
struct bpf_prog *prog)
@@ -19138,39 +19567,47 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
return -EINVAL;
}
- return 0;
-}
+ if (bpf_map_is_cgroup_storage(map) &&
+ bpf_cgroup_storage_assign(env->prog->aux, map)) {
+ verbose(env, "only one cgroup storage of each type is allowed\n");
+ return -EBUSY;
+ }
-static bool bpf_map_is_cgroup_storage(struct bpf_map *map)
-{
- return (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
- map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
+ if (map->map_type == BPF_MAP_TYPE_ARENA) {
+ if (env->prog->aux->arena) {
+ verbose(env, "Only one arena per program\n");
+ return -EBUSY;
+ }
+ if (!env->allow_ptr_leaks || !env->bpf_capable) {
+ verbose(env, "CAP_BPF and CAP_PERFMON are required to use arena\n");
+ return -EPERM;
+ }
+ if (!env->prog->jit_requested) {
+ verbose(env, "JIT is required to use arena\n");
+ return -EOPNOTSUPP;
+ }
+ if (!bpf_jit_supports_arena()) {
+ verbose(env, "JIT doesn't support arena\n");
+ return -EOPNOTSUPP;
+ }
+ env->prog->aux->arena = (void *)map;
+ if (!bpf_arena_get_user_vm_start(env->prog->aux->arena)) {
+ verbose(env, "arena's user address must be set via map_extra or mmap()\n");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
}
-/* Add map behind fd to used maps list, if it's not already there, and return
- * its index. Also set *reused to true if this map was already in the list of
- * used maps.
- * Returns <0 on error, or >= 0 index, on success.
- */
-static int add_used_map_from_fd(struct bpf_verifier_env *env, int fd, bool *reused)
+static int __add_used_map(struct bpf_verifier_env *env, struct bpf_map *map)
{
- CLASS(fd, f)(fd);
- struct bpf_map *map;
- int i;
-
- map = __bpf_map_get(f);
- if (IS_ERR(map)) {
- verbose(env, "fd %d is not pointing to valid bpf_map\n", fd);
- return PTR_ERR(map);
- }
+ int i, err;
/* check whether we recorded this map already */
- for (i = 0; i < env->used_map_cnt; i++) {
- if (env->used_maps[i] == map) {
- *reused = true;
+ for (i = 0; i < env->used_map_cnt; i++)
+ if (env->used_maps[i] == map)
return i;
- }
- }
if (env->used_map_cnt >= MAX_USED_MAPS) {
verbose(env, "The total number of maps per program has reached the limit of %u\n",
@@ -19178,6 +19615,10 @@ static int add_used_map_from_fd(struct bpf_verifier_env *env, int fd, bool *reus
return -E2BIG;
}
+ err = check_map_prog_compatibility(env, map, env->prog);
+ if (err)
+ return err;
+
if (env->prog->sleepable)
atomic64_inc(&map->sleepable_refcnt);
@@ -19188,12 +19629,29 @@ static int add_used_map_from_fd(struct bpf_verifier_env *env, int fd, bool *reus
*/
bpf_map_inc(map);
- *reused = false;
env->used_maps[env->used_map_cnt++] = map;
return env->used_map_cnt - 1;
}
+/* Add map behind fd to used maps list, if it's not already there, and return
+ * its index.
+ * Returns <0 on error, or >= 0 index, on success.
+ */
+static int add_used_map(struct bpf_verifier_env *env, int fd)
+{
+ struct bpf_map *map;
+ CLASS(fd, f)(fd);
+
+ map = __bpf_map_get(f);
+ if (IS_ERR(map)) {
+ verbose(env, "fd %d is not pointing to valid bpf_map\n", fd);
+ return PTR_ERR(map);
+ }
+
+ return __add_used_map(env, map);
+}
+
/* find and rewrite pseudo imm in ld_imm64 instructions:
*
* 1. if it accesses map FD, replace it with actual map pointer.
@@ -19225,7 +19683,6 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
int map_idx;
u64 addr;
u32 fd;
- bool reused;
if (i == insn_cnt - 1 || insn[1].code != 0 ||
insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
@@ -19286,7 +19743,7 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
break;
}
- map_idx = add_used_map_from_fd(env, fd, &reused);
+ map_idx = add_used_map(env, fd);
if (map_idx < 0)
return map_idx;
map = env->used_maps[map_idx];
@@ -19294,10 +19751,6 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
aux = &env->insn_aux_data[i];
aux->map_index = map_idx;
- err = check_map_prog_compatibility(env, map, env->prog);
- if (err)
- return err;
-
if (insn[0].src_reg == BPF_PSEUDO_MAP_FD ||
insn[0].src_reg == BPF_PSEUDO_MAP_IDX) {
addr = (unsigned long)map;
@@ -19328,39 +19781,6 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env)
insn[0].imm = (u32)addr;
insn[1].imm = addr >> 32;
- /* proceed with extra checks only if its newly added used map */
- if (reused)
- goto next_insn;
-
- if (bpf_map_is_cgroup_storage(map) &&
- bpf_cgroup_storage_assign(env->prog->aux, map)) {
- verbose(env, "only one cgroup storage of each type is allowed\n");
- return -EBUSY;
- }
- if (map->map_type == BPF_MAP_TYPE_ARENA) {
- if (env->prog->aux->arena) {
- verbose(env, "Only one arena per program\n");
- return -EBUSY;
- }
- if (!env->allow_ptr_leaks || !env->bpf_capable) {
- verbose(env, "CAP_BPF and CAP_PERFMON are required to use arena\n");
- return -EPERM;
- }
- if (!env->prog->jit_requested) {
- verbose(env, "JIT is required to use arena\n");
- return -EOPNOTSUPP;
- }
- if (!bpf_jit_supports_arena()) {
- verbose(env, "JIT doesn't support arena\n");
- return -EOPNOTSUPP;
- }
- env->prog->aux->arena = (void *)map;
- if (!bpf_arena_get_user_vm_start(env->prog->aux->arena)) {
- verbose(env, "arena's user address must be set via map_extra or mmap()\n");
- return -EINVAL;
- }
- }
-
next_insn:
insn++;
i++;
@@ -19779,23 +20199,28 @@ static int opt_remove_dead_code(struct bpf_verifier_env *env)
}
static const struct bpf_insn NOP = BPF_JMP_IMM(BPF_JA, 0, 0, 0);
+static const struct bpf_insn MAY_GOTO_0 = BPF_RAW_INSN(BPF_JMP | BPF_JCOND, 0, 0, 0, 0);
static int opt_remove_nops(struct bpf_verifier_env *env)
{
- const struct bpf_insn ja = NOP;
struct bpf_insn *insn = env->prog->insnsi;
int insn_cnt = env->prog->len;
+ bool is_may_goto_0, is_ja;
int i, err;
for (i = 0; i < insn_cnt; i++) {
- if (memcmp(&insn[i], &ja, sizeof(ja)))
+ is_may_goto_0 = !memcmp(&insn[i], &MAY_GOTO_0, sizeof(MAY_GOTO_0));
+ is_ja = !memcmp(&insn[i], &NOP, sizeof(NOP));
+
+ if (!is_may_goto_0 && !is_ja)
continue;
err = verifier_remove_insns(env, i, 1);
if (err)
return err;
insn_cnt--;
- i--;
+ /* Go back one insn to catch may_goto +1; may_goto +0 sequence */
+ i -= (is_may_goto_0 && i > 0) ? 2 : 1;
}
return 0;
@@ -21281,11 +21706,15 @@ patch_map_ops_generic:
* changed in some incompatible and hard to support
* way, it's fine to back out this inlining logic
*/
+#ifdef CONFIG_SMP
insn_buf[0] = BPF_MOV32_IMM(BPF_REG_0, (u32)(unsigned long)&pcpu_hot.cpu_number);
insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0);
cnt = 3;
-
+#else
+ insn_buf[0] = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0);
+ cnt = 1;
+#endif
new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
if (!new_prog)
return -ENOMEM;
@@ -22540,6 +22969,73 @@ struct btf *bpf_get_btf_vmlinux(void)
return btf_vmlinux;
}
+/*
+ * add_fd_from_fd_array() is executed only if fd_array_cnt is non-zero. In
+ * that case every file descriptor in the array is expected to be either a
+ * map or a BTF; anything else is rejected.
+ */
+static int add_fd_from_fd_array(struct bpf_verifier_env *env, int fd)
+{
+ struct bpf_map *map;
+ struct btf *btf;
+ CLASS(fd, f)(fd);
+ int err;
+
+ map = __bpf_map_get(f);
+ if (!IS_ERR(map)) {
+ err = __add_used_map(env, map);
+ if (err < 0)
+ return err;
+ return 0;
+ }
+
+ btf = __btf_get_by_fd(f);
+ if (!IS_ERR(btf)) {
+ err = __add_used_btf(env, btf);
+ if (err < 0)
+ return err;
+ return 0;
+ }
+
+ verbose(env, "fd %d is not pointing to valid bpf_map or btf\n", fd);
+ return PTR_ERR(map);
+}
+
+static int process_fd_array(struct bpf_verifier_env *env, union bpf_attr *attr, bpfptr_t uattr)
+{
+ size_t size = sizeof(int);
+ int ret;
+ int fd;
+ u32 i;
+
+ env->fd_array = make_bpfptr(attr->fd_array, uattr.is_kernel);
+
+ /*
+ * The only difference between the old API (no fd_array_cnt given) and the
+ * new one is that in the latter case the fd_array is expected to be
+ * contiguous and is scanned for map fds right away
+ */
+ if (!attr->fd_array_cnt)
+ return 0;
+
+ /* Check for integer overflow */
+ if (attr->fd_array_cnt >= (U32_MAX / size)) {
+ verbose(env, "fd_array_cnt is too big (%u)\n", attr->fd_array_cnt);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < attr->fd_array_cnt; i++) {
+ if (copy_from_bpfptr_offset(&fd, env->fd_array, i * size, size))
+ return -EFAULT;
+
+ ret = add_fd_from_fd_array(env, fd);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
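+/*
+ * Illustrative only, not part of this patch: with the new attribute a
+ * loader that wants the verifier to pick up maps and BTFs up front could
+ * pass them roughly like this (assuming a bpf() syscall wrapper and a
+ * ptr_to_u64() helper):
+ *
+ *        int fds[2] = { map_fd, btf_fd };
+ *
+ *        attr.fd_array = ptr_to_u64(fds);
+ *        attr.fd_array_cnt = 2;
+ *        err = bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
+ *
+ * With fd_array_cnt == 0 the old behaviour is kept and fd_array is only
+ * consulted on demand (e.g. for BPF_PSEUDO_MAP_IDX instructions).
+ */
+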
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u32 uattr_size)
{
u64 start_time = ktime_get_ns();
@@ -22571,7 +23067,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
env->insn_aux_data[i].orig_idx = i;
env->prog = *prog;
env->ops = bpf_verifier_ops[env->prog->type];
- env->fd_array = make_bpfptr(attr->fd_array, uattr.is_kernel);
env->allow_ptr_leaks = bpf_allow_ptr_leaks(env->prog->aux->token);
env->allow_uninit_stack = bpf_allow_uninit_stack(env->prog->aux->token);
@@ -22594,6 +23089,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3
if (ret)
goto err_unlock;
+ ret = process_fd_array(env, attr, uattr);
+ if (ret)
+ goto skip_full_check;
+
mark_verifier_state_clean(env);
if (IS_ERR(btf_vmlinux)) {
diff --git a/kernel/capability.c b/kernel/capability.c
index dac4df77e376e..e089d2628c296 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -38,10 +38,8 @@ __setup("no_file_caps", file_caps_disable);
static void warn_legacy_capability_use(void)
{
- char name[sizeof(current->comm)];
-
pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n",
- get_task_comm(name, current));
+ current->comm);
}
/*
@@ -62,10 +60,8 @@ static void warn_legacy_capability_use(void)
static void warn_deprecated_v2(void)
{
- char name[sizeof(current->comm)];
-
pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n",
- get_task_comm(name, current));
+ current->comm);
}
/*
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index a5c9359d516f8..ede31601a363a 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -7,4 +7,5 @@ obj-$(CONFIG_CGROUP_RDMA) += rdma.o
obj-$(CONFIG_CPUSETS) += cpuset.o
obj-$(CONFIG_CPUSETS_V1) += cpuset-v1.o
obj-$(CONFIG_CGROUP_MISC) += misc.o
+obj-$(CONFIG_CGROUP_DMEM) += dmem.o
obj-$(CONFIG_CGROUP_DEBUG) += debug.o
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index d9061bd55436b..afc665b7b1fe5 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4013,7 +4013,7 @@ static void __cgroup_kill(struct cgroup *cgrp)
lockdep_assert_held(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
- set_bit(CGRP_KILL, &cgrp->flags);
+ cgrp->kill_seq++;
spin_unlock_irq(&css_set_lock);
css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
@@ -4029,10 +4029,6 @@ static void __cgroup_kill(struct cgroup *cgrp)
send_sig(SIGKILL, task, 0);
}
css_task_iter_end(&it);
-
- spin_lock_irq(&css_set_lock);
- clear_bit(CGRP_KILL, &cgrp->flags);
- spin_unlock_irq(&css_set_lock);
}
static void cgroup_kill(struct cgroup *cgrp)
@@ -6488,6 +6484,10 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
spin_lock_irq(&css_set_lock);
cset = task_css_set(current);
get_css_set(cset);
+ if (kargs->cgrp)
+ kargs->kill_seq = kargs->cgrp->kill_seq;
+ else
+ kargs->kill_seq = cset->dfl_cgrp->kill_seq;
spin_unlock_irq(&css_set_lock);
if (!(kargs->flags & CLONE_INTO_CGROUP)) {
@@ -6668,6 +6668,7 @@ void cgroup_post_fork(struct task_struct *child,
struct kernel_clone_args *kargs)
__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
+ unsigned int cgrp_kill_seq = 0;
unsigned long cgrp_flags = 0;
bool kill = false;
struct cgroup_subsys *ss;
@@ -6681,10 +6682,13 @@ void cgroup_post_fork(struct task_struct *child,
/* init tasks are special, only link regular threads */
if (likely(child->pid)) {
- if (kargs->cgrp)
+ if (kargs->cgrp) {
cgrp_flags = kargs->cgrp->flags;
- else
+ cgrp_kill_seq = kargs->cgrp->kill_seq;
+ } else {
cgrp_flags = cset->dfl_cgrp->flags;
+ cgrp_kill_seq = cset->dfl_cgrp->kill_seq;
+ }
WARN_ON_ONCE(!list_empty(&child->cg_list));
cset->nr_tasks++;
@@ -6719,7 +6723,7 @@ void cgroup_post_fork(struct task_struct *child,
* child down right after we finished preparing it for
* userspace.
*/
- kill = test_bit(CGRP_KILL, &cgrp_flags);
+ kill = kargs->kill_seq != cgrp_kill_seq;
}
spin_unlock_irq(&css_set_lock);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index f321ed515f3a7..0f910c828973a 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -197,10 +197,8 @@ static struct cpuset top_cpuset = {
/*
* There are two global locks guarding cpuset structures - cpuset_mutex and
- * callback_lock. We also require taking task_lock() when dereferencing a
- * task's cpuset pointer. See "The task_lock() exception", at the end of this
- * comment. The cpuset code uses only cpuset_mutex. Other kernel subsystems
- * can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset
+ * callback_lock. The cpuset code uses only cpuset_mutex. Other kernel
+ * subsystems can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset
* structures. Note that cpuset_mutex needs to be a mutex as it is used in
* paths that rely on priority inheritance (e.g. scheduler - on RT) for
* correctness.
@@ -229,9 +227,6 @@ static struct cpuset top_cpuset = {
* The cpuset_common_seq_show() handlers only hold callback_lock across
* small pieces of code, such as when reading out possibly multi-word
* cpumasks and nodemasks.
- *
- * Accessing a task's cpuset should be done in accordance with the
- * guidelines for accessing subsystem state in kernel/cgroup.c
*/
static DEFINE_MUTEX(cpuset_mutex);
@@ -890,7 +885,15 @@ v2:
*/
if (cgrpv2) {
for (i = 0; i < ndoms; i++) {
- cpumask_copy(doms[i], csa[i]->effective_cpus);
+ /*
+ * The top cpuset may contain some boot time isolated
+ * CPUs that need to be excluded from the sched domain.
+ */
+ if (csa[i] == &top_cpuset)
+ cpumask_and(doms[i], csa[i]->effective_cpus,
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
+ else
+ cpumask_copy(doms[i], csa[i]->effective_cpus);
if (dattr)
dattr[i] = SD_ATTR_INIT;
}
@@ -3121,29 +3124,6 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
int retval = -ENODEV;
buf = strstrip(buf);
-
- /*
- * CPU or memory hotunplug may leave @cs w/o any execution
- * resources, in which case the hotplug code asynchronously updates
- * configuration and transfers all tasks to the nearest ancestor
- * which can execute.
- *
- * As writes to "cpus" or "mems" may restore @cs's execution
- * resources, wait for the previously scheduled operations before
- * proceeding, so that we don't end up keep removing tasks added
- * after execution capability is restored.
- *
- * cpuset_handle_hotplug may call back into cgroup core asynchronously
- * via cgroup_transfer_tasks() and waiting for it from a cgroupfs
- * operation like this one can lead to a deadlock through kernfs
- * active_ref protection. Let's break the protection. Losing the
- * protection is okay as we check whether @cs is online after
- * grabbing cpuset_mutex anyway. This only happens on the legacy
- * hierarchies.
- */
- css_get(&cs->css);
- kernfs_break_active_protection(of->kn);
-
cpus_read_lock();
mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs))
@@ -3176,8 +3156,6 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
out_unlock:
mutex_unlock(&cpuset_mutex);
cpus_read_unlock();
- kernfs_unbreak_active_protection(of->kn);
- css_put(&cs->css);
flush_workqueue(cpuset_migrate_mm_wq);
return retval ?: nbytes;
}
diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c
new file mode 100644
index 0000000000000..10b63433f0573
--- /dev/null
+++ b/kernel/cgroup/dmem.c
@@ -0,0 +1,829 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2023-2024 Intel Corporation (Maarten Lankhorst <dev@lankhorst.se>)
+ * Copyright 2024 Red Hat (Maxime Ripard <mripard@kernel.org>)
+ * Partially based on the rdma and misc controllers, which bear the following copyrights:
+ *
+ * Copyright 2020 Google LLC
+ * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
+ */
+
+#include <linux/cgroup.h>
+#include <linux/cgroup_dmem.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/page_counter.h>
+#include <linux/parser.h>
+#include <linux/slab.h>
+
+struct dmem_cgroup_region {
+ /**
+ * @ref: References keeping the region alive.
+ * Keeps the region reference alive after a succesful RCU lookup.
+ */
+ struct kref ref;
+
+ /** @rcu: RCU head for freeing */
+ struct rcu_head rcu;
+
+ /**
+ * @region_node: Linked into &dmem_cgroup_regions list.
+ * Protected by RCU and global spinlock.
+ */
+ struct list_head region_node;
+
+ /**
+ * @pools: List of pools linked to this region.
+ * Protected by global spinlock only
+ */
+ struct list_head pools;
+
+ /** @size: Size of region, in bytes */
+ u64 size;
+
+ /** @name: Name describing the node, set by dmem_cgroup_register_region */
+ char *name;
+
+ /**
+ * @unregistered: Whether the region is unregistered by its caller.
+ * No new pools should be added to the region afterwards.
+ */
+ bool unregistered;
+};
+
+struct dmemcg_state {
+ struct cgroup_subsys_state css;
+
+ struct list_head pools;
+};
+
+struct dmem_cgroup_pool_state {
+ struct dmem_cgroup_region *region;
+ struct dmemcg_state *cs;
+
+ /* css node, RCU protected against region teardown */
+ struct list_head css_node;
+
+ /* dev node, no RCU protection required */
+ struct list_head region_node;
+
+ struct rcu_head rcu;
+
+ struct page_counter cnt;
+
+ bool inited;
+};
+
+/*
+ * 3 operations require locking protection:
+ * - Registering and unregistering region to/from list, requires global lock.
+ * - Adding a dmem_cgroup_pool_state to a CSS, removing when CSS is freed.
+ * - Adding a dmem_cgroup_pool_state to a region list.
+ *
+ * Since RCU provides enough protection for the most common operations,
+ * more granular locking does not seem worthwhile. Most protection is
+ * offered by RCU together with the lockless page_counter.
+ */
+static DEFINE_SPINLOCK(dmemcg_lock);
+static LIST_HEAD(dmem_cgroup_regions);
+
+static inline struct dmemcg_state *
+css_to_dmemcs(struct cgroup_subsys_state *css)
+{
+ return container_of(css, struct dmemcg_state, css);
+}
+
+static inline struct dmemcg_state *get_current_dmemcs(void)
+{
+ return css_to_dmemcs(task_get_css(current, dmem_cgrp_id));
+}
+
+static struct dmemcg_state *parent_dmemcs(struct dmemcg_state *cg)
+{
+ return cg->css.parent ? css_to_dmemcs(cg->css.parent) : NULL;
+}
+
+static void free_cg_pool(struct dmem_cgroup_pool_state *pool)
+{
+ list_del(&pool->region_node);
+ kfree(pool);
+}
+
+static void
+set_resource_min(struct dmem_cgroup_pool_state *pool, u64 val)
+{
+ page_counter_set_min(&pool->cnt, val);
+}
+
+static void
+set_resource_low(struct dmem_cgroup_pool_state *pool, u64 val)
+{
+ page_counter_set_low(&pool->cnt, val);
+}
+
+static void
+set_resource_max(struct dmem_cgroup_pool_state *pool, u64 val)
+{
+ page_counter_set_max(&pool->cnt, val);
+}
+
+static u64 get_resource_low(struct dmem_cgroup_pool_state *pool)
+{
+ return pool ? READ_ONCE(pool->cnt.low) : 0;
+}
+
+static u64 get_resource_min(struct dmem_cgroup_pool_state *pool)
+{
+ return pool ? READ_ONCE(pool->cnt.min) : 0;
+}
+
+static u64 get_resource_max(struct dmem_cgroup_pool_state *pool)
+{
+ return pool ? READ_ONCE(pool->cnt.max) : PAGE_COUNTER_MAX;
+}
+
+static u64 get_resource_current(struct dmem_cgroup_pool_state *pool)
+{
+ return pool ? page_counter_read(&pool->cnt) : 0;
+}
+
+static void reset_all_resource_limits(struct dmem_cgroup_pool_state *rpool)
+{
+ set_resource_min(rpool, 0);
+ set_resource_low(rpool, 0);
+ set_resource_max(rpool, PAGE_COUNTER_MAX);
+}
+
+static void dmemcs_offline(struct cgroup_subsys_state *css)
+{
+ struct dmemcg_state *dmemcs = css_to_dmemcs(css);
+ struct dmem_cgroup_pool_state *pool;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(pool, &dmemcs->pools, css_node)
+ reset_all_resource_limits(pool);
+ rcu_read_unlock();
+}
+
+static void dmemcs_free(struct cgroup_subsys_state *css)
+{
+ struct dmemcg_state *dmemcs = css_to_dmemcs(css);
+ struct dmem_cgroup_pool_state *pool, *next;
+
+ spin_lock(&dmemcg_lock);
+ list_for_each_entry_safe(pool, next, &dmemcs->pools, css_node) {
+ /*
+ * The pool is dead and all references are 0,
+ * no need for RCU protection with list_del_rcu or freeing.
+ */
+ list_del(&pool->css_node);
+ free_cg_pool(pool);
+ }
+ spin_unlock(&dmemcg_lock);
+
+ kfree(dmemcs);
+}
+
+static struct cgroup_subsys_state *
+dmemcs_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct dmemcg_state *dmemcs = kzalloc(sizeof(*dmemcs), GFP_KERNEL);
+ if (!dmemcs)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&dmemcs->pools);
+ return &dmemcs->css;
+}
+
+static struct dmem_cgroup_pool_state *
+find_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region)
+{
+ struct dmem_cgroup_pool_state *pool;
+
+ list_for_each_entry_rcu(pool, &dmemcs->pools, css_node, spin_is_locked(&dmemcg_lock))
+ if (pool->region == region)
+ return pool;
+
+ return NULL;
+}
+
+static struct dmem_cgroup_pool_state *pool_parent(struct dmem_cgroup_pool_state *pool)
+{
+ if (!pool->cnt.parent)
+ return NULL;
+
+ return container_of(pool->cnt.parent, typeof(*pool), cnt);
+}
+
+static void
+dmem_cgroup_calculate_protection(struct dmem_cgroup_pool_state *limit_pool,
+ struct dmem_cgroup_pool_state *test_pool)
+{
+ struct page_counter *climit;
+ struct cgroup_subsys_state *css;
+ struct dmemcg_state *dmemcg_iter;
+ struct dmem_cgroup_pool_state *pool, *found_pool;
+
+ climit = &limit_pool->cnt;
+
+ rcu_read_lock();
+
+ css_for_each_descendant_pre(css, &limit_pool->cs->css) {
+ dmemcg_iter = container_of(css, struct dmemcg_state, css);
+ found_pool = NULL;
+
+ list_for_each_entry_rcu(pool, &dmemcg_iter->pools, css_node) {
+ if (pool->region == limit_pool->region) {
+ found_pool = pool;
+ break;
+ }
+ }
+ if (!found_pool)
+ continue;
+
+ page_counter_calculate_protection(
+ climit, &found_pool->cnt, true);
+
+ if (found_pool == test_pool)
+ break;
+ }
+ rcu_read_unlock();
+}
+
+/**
+ * dmem_cgroup_state_evict_valuable() - Check if we should evict from test_pool
+ * @limit_pool: The pool for which we hit limits
+ * @test_pool: The pool to test for eviction.
+ * @ignore_low: Whether to ignore low watermarks.
+ * @ret_hit_low: Set to true when a retry with @ignore_low set makes sense.
+ *
+ * This function returns true if we can evict from @test_pool, false if not.
+ * When returning false and @ignore_low is false, @ret_hit_low may
+ * be set to true to indicate this function can be retried with @ignore_low
+ * set to true.
+ *
+ * Return: true if eviction from @test_pool is allowed, false otherwise.
+ */
+bool dmem_cgroup_state_evict_valuable(struct dmem_cgroup_pool_state *limit_pool,
+ struct dmem_cgroup_pool_state *test_pool,
+ bool ignore_low, bool *ret_hit_low)
+{
+ struct dmem_cgroup_pool_state *pool = test_pool;
+ struct page_counter *ctest;
+ u64 used, min, low;
+
+ /* Can always evict from current pool, despite limits */
+ if (limit_pool == test_pool)
+ return true;
+
+ if (limit_pool) {
+ if (!parent_dmemcs(limit_pool->cs))
+ return true;
+
+ for (pool = test_pool; pool && limit_pool != pool; pool = pool_parent(pool))
+ {}
+
+ if (!pool)
+ return false;
+ } else {
+ /*
+ * If there is no cgroup limiting memory usage, use the root
+ * cgroup instead for limit calculations.
+ */
+ for (limit_pool = test_pool; pool_parent(limit_pool); limit_pool = pool_parent(limit_pool))
+ {}
+ }
+
+ ctest = &test_pool->cnt;
+
+ dmem_cgroup_calculate_protection(limit_pool, test_pool);
+
+ used = page_counter_read(ctest);
+ min = READ_ONCE(ctest->emin);
+
+ if (used <= min)
+ return false;
+
+ if (!ignore_low) {
+ low = READ_ONCE(ctest->elow);
+ if (used > low)
+ return true;
+
+ *ret_hit_low = true;
+ return false;
+ }
+ return true;
+}
+EXPORT_SYMBOL_GPL(dmem_cgroup_state_evict_valuable);
+
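+/*
+ * Illustrative sketch, not part of this patch: a driver that failed to
+ * charge and received @limit_pool back could drive eviction with the
+ * two-pass pattern the ignore_low/ret_hit_low contract allows; bo, lru and
+ * evict_one_buffer() are hypothetical driver-side names:
+ *
+ *        bool ignore_low = false, hit_low = false;
+ *
+ *retry:
+ *        list_for_each_entry(bo, &lru, lru_node) {
+ *                if (!dmem_cgroup_state_evict_valuable(limit_pool, bo->pool,
+ *                                                      ignore_low, &hit_low))
+ *                        continue;
+ *                evict_one_buffer(bo);
+ *        }
+ *        if (!ignore_low && hit_low) {
+ *                ignore_low = true;
+ *                goto retry;
+ *        }
+ */
+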
+static struct dmem_cgroup_pool_state *
+alloc_pool_single(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region,
+ struct dmem_cgroup_pool_state **allocpool)
+{
+ struct dmemcg_state *parent = parent_dmemcs(dmemcs);
+ struct dmem_cgroup_pool_state *pool, *ppool = NULL;
+
+ if (!*allocpool) {
+ pool = kzalloc(sizeof(*pool), GFP_NOWAIT);
+ if (!pool)
+ return ERR_PTR(-ENOMEM);
+ } else {
+ pool = *allocpool;
+ *allocpool = NULL;
+ }
+
+ pool->region = region;
+ pool->cs = dmemcs;
+
+ if (parent)
+ ppool = find_cg_pool_locked(parent, region);
+
+ page_counter_init(&pool->cnt,
+ ppool ? &ppool->cnt : NULL, true);
+ reset_all_resource_limits(pool);
+
+ list_add_tail_rcu(&pool->css_node, &dmemcs->pools);
+ list_add_tail(&pool->region_node, &region->pools);
+
+ if (!parent)
+ pool->inited = true;
+ else
+ pool->inited = ppool ? ppool->inited : false;
+ return pool;
+}
+
+static struct dmem_cgroup_pool_state *
+get_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region,
+ struct dmem_cgroup_pool_state **allocpool)
+{
+ struct dmem_cgroup_pool_state *pool, *ppool, *retpool;
+ struct dmemcg_state *p, *pp;
+
+ /*
+ * Recursively create pool, we may not initialize yet on
+ * recursion, this is done as a separate step.
+ */
+ for (p = dmemcs; p; p = parent_dmemcs(p)) {
+ pool = find_cg_pool_locked(p, region);
+ if (!pool)
+ pool = alloc_pool_single(p, region, allocpool);
+
+ if (IS_ERR(pool))
+ return pool;
+
+ if (p == dmemcs && pool->inited)
+ return pool;
+
+ if (pool->inited)
+ break;
+ }
+
+ retpool = pool = find_cg_pool_locked(dmemcs, region);
+ for (p = dmemcs, pp = parent_dmemcs(dmemcs); pp; p = pp, pp = parent_dmemcs(p)) {
+ if (pool->inited)
+ break;
+
+ /* ppool was created if it didn't exist by above loop. */
+ ppool = find_cg_pool_locked(pp, region);
+
+ /* Fix up parent links, mark as inited. */
+ pool->cnt.parent = &ppool->cnt;
+ pool->inited = true;
+
+ pool = ppool;
+ }
+
+ return retpool;
+}
+
+static void dmemcg_free_rcu(struct rcu_head *rcu)
+{
+ struct dmem_cgroup_region *region = container_of(rcu, typeof(*region), rcu);
+ struct dmem_cgroup_pool_state *pool, *next;
+
+ list_for_each_entry_safe(pool, next, &region->pools, region_node)
+ free_cg_pool(pool);
+ kfree(region->name);
+ kfree(region);
+}
+
+static void dmemcg_free_region(struct kref *ref)
+{
+ struct dmem_cgroup_region *cgregion = container_of(ref, typeof(*cgregion), ref);
+
+ call_rcu(&cgregion->rcu, dmemcg_free_rcu);
+}
+
+/**
+ * dmem_cgroup_unregister_region() - Unregister a previously registered region.
+ * @region: The region to unregister.
+ *
+ * This function undoes dmem_cgroup_register_region.
+ */
+void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region)
+{
+ struct list_head *entry;
+
+ if (!region)
+ return;
+
+ spin_lock(&dmemcg_lock);
+
+ /* Remove from global region list */
+ list_del_rcu(&region->region_node);
+
+ list_for_each_rcu(entry, &region->pools) {
+ struct dmem_cgroup_pool_state *pool =
+ container_of(entry, typeof(*pool), region_node);
+
+ list_del_rcu(&pool->css_node);
+ }
+
+ /*
+ * Ensure any RCU based lookups fail. Additionally,
+ * no new pools should be added to the dead region
+ * by get_cg_pool_unlocked.
+ */
+ region->unregistered = true;
+ spin_unlock(&dmemcg_lock);
+
+ kref_put(&region->ref, dmemcg_free_region);
+}
+EXPORT_SYMBOL_GPL(dmem_cgroup_unregister_region);
+
+/**
+ * dmem_cgroup_register_region() - Register a region for the dmem cgroup.
+ * @size: Size of the region to register, in bytes.
+ * @fmt: printf-style format used to construct the region name.
+ *
+ * This function registers a node in the dmem cgroup with the
+ * given name. After calling this function, the region can be
+ * used for allocations.
+ *
+ * Return: NULL if @size is 0, a valid region pointer on success,
+ * or an ERR_PTR() on failure.
+ */
+struct dmem_cgroup_region *dmem_cgroup_register_region(u64 size, const char *fmt, ...)
+{
+ struct dmem_cgroup_region *ret;
+ char *region_name;
+ va_list ap;
+
+ if (!size)
+ return NULL;
+
+ va_start(ap, fmt);
+ region_name = kvasprintf(GFP_KERNEL, fmt, ap);
+ va_end(ap);
+ if (!region_name)
+ return ERR_PTR(-ENOMEM);
+
+ ret = kzalloc(sizeof(*ret), GFP_KERNEL);
+ if (!ret) {
+ kfree(region_name);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ INIT_LIST_HEAD(&ret->pools);
+ ret->name = region_name;
+ ret->size = size;
+ kref_init(&ret->ref);
+
+ spin_lock(&dmemcg_lock);
+ list_add_tail_rcu(&ret->region_node, &dmem_cgroup_regions);
+ spin_unlock(&dmemcg_lock);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dmem_cgroup_register_region);
+
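+/*
+ * Illustrative only: a GPU driver exposing 8 GiB of device-local memory
+ * might register it along these lines (the region name format here is
+ * made up for the example):
+ *
+ *        region = dmem_cgroup_register_region(8ULL << 30,
+ *                                             "drm/%s/vram0", dev_name(dev));
+ *        if (IS_ERR(region))
+ *                return PTR_ERR(region);
+ *
+ * and later tear it down again with dmem_cgroup_unregister_region(region).
+ */
+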
+static struct dmem_cgroup_region *dmemcg_get_region_by_name(const char *name)
+{
+ struct dmem_cgroup_region *region;
+
+ list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node, spin_is_locked(&dmemcg_lock))
+ if (!strcmp(name, region->name) &&
+ kref_get_unless_zero(&region->ref))
+ return region;
+
+ return NULL;
+}
+
+/**
+ * dmem_cgroup_pool_state_put() - Drop a reference to a dmem_cgroup_pool_state
+ * @pool: &dmem_cgroup_pool_state
+ *
+ * Called to drop a reference to the limiting pool returned by
+ * dmem_cgroup_try_charge().
+ */
+void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state *pool)
+{
+ if (pool)
+ css_put(&pool->cs->css);
+}
+EXPORT_SYMBOL_GPL(dmem_cgroup_pool_state_put);
+
+static struct dmem_cgroup_pool_state *
+get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region)
+{
+ struct dmem_cgroup_pool_state *pool, *allocpool = NULL;
+
+ /* fastpath lookup? */
+ rcu_read_lock();
+ pool = find_cg_pool_locked(cg, region);
+ if (pool && !READ_ONCE(pool->inited))
+ pool = NULL;
+ rcu_read_unlock();
+
+ while (!pool) {
+ spin_lock(&dmemcg_lock);
+ if (!region->unregistered)
+ pool = get_cg_pool_locked(cg, region, &allocpool);
+ else
+ pool = ERR_PTR(-ENODEV);
+ spin_unlock(&dmemcg_lock);
+
+ if (pool == ERR_PTR(-ENOMEM)) {
+ pool = NULL;
+ if (WARN_ON(allocpool))
+ continue;
+
+ allocpool = kzalloc(sizeof(*allocpool), GFP_KERNEL);
+ if (allocpool) {
+ pool = NULL;
+ continue;
+ }
+ }
+ }
+
+ kfree(allocpool);
+ return pool;
+}
+
+/**
+ * dmem_cgroup_uncharge() - Uncharge a pool.
+ * @pool: Pool to uncharge.
+ * @size: Size to uncharge.
+ *
+ * Undoes the effects of dmem_cgroup_try_charge.
+ * Must be called with the returned pool as argument,
+ * and the same @size.
+ */
+void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size)
+{
+ if (!pool)
+ return;
+
+ page_counter_uncharge(&pool->cnt, size);
+ css_put(&pool->cs->css);
+}
+EXPORT_SYMBOL_GPL(dmem_cgroup_uncharge);
+
+/**
+ * dmem_cgroup_try_charge() - Try charging a new allocation to a region.
+ * @region: dmem region to charge
+ * @size: Size (in bytes) to charge.
+ * @ret_pool: On successful allocation, the pool that is charged.
+ * @ret_limit_pool: On a failed allocation, the limiting pool.
+ *
+ * This function charges the @region region for a size of @size bytes.
+ *
+ * If the function succeeds, @ret_pool is set, which must be passed to
+ * dmem_cgroup_uncharge() when undoing the allocation.
+ *
+ * When this function fails with -EAGAIN and @ret_limit_pool is non-NULL, it
+ * will be set to the pool for which the limit is hit. This can be passed to
+ * dmem_cgroup_state_evict_valuable() as the eviction argument. The reference
+ * must be dropped with dmem_cgroup_pool_state_put().
+ *
+ * Return: 0 on success, -EAGAIN on hitting a limit, or a negative errno on failure.
+ */
+int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size,
+ struct dmem_cgroup_pool_state **ret_pool,
+ struct dmem_cgroup_pool_state **ret_limit_pool)
+{
+ struct dmemcg_state *cg;
+ struct dmem_cgroup_pool_state *pool;
+ struct page_counter *fail;
+ int ret;
+
+ *ret_pool = NULL;
+ if (ret_limit_pool)
+ *ret_limit_pool = NULL;
+
+ /*
+ * hold on to css, as cgroup can be removed but resource
+ * accounting happens on css.
+ */
+ cg = get_current_dmemcs();
+
+ pool = get_cg_pool_unlocked(cg, region);
+ if (IS_ERR(pool)) {
+ ret = PTR_ERR(pool);
+ goto err;
+ }
+
+ if (!page_counter_try_charge(&pool->cnt, size, &fail)) {
+ if (ret_limit_pool) {
+ *ret_limit_pool = container_of(fail, struct dmem_cgroup_pool_state, cnt);
+ css_get(&(*ret_limit_pool)->cs->css);
+ }
+ ret = -EAGAIN;
+ goto err;
+ }
+
+ /* On success, reference from get_current_dmemcs is transferred to *ret_pool */
+ *ret_pool = pool;
+ return 0;
+
+err:
+ css_put(&cg->css);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(dmem_cgroup_try_charge);
+
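+/*
+ * Illustrative sketch of the charge/uncharge pairing described above, not
+ * part of this patch; allocate_from() and try_to_evict_from() stand in for
+ * a driver's real allocator and eviction path:
+ *
+ *        ret = dmem_cgroup_try_charge(region, size, &pool, &limit_pool);
+ *        if (ret == -EAGAIN) {
+ *                try_to_evict_from(limit_pool);
+ *                dmem_cgroup_pool_state_put(limit_pool);
+ *        }
+ *        if (ret)
+ *                return ret;
+ *
+ *        obj = allocate_from(region, size);
+ *        if (!obj)
+ *                dmem_cgroup_uncharge(pool, size);
+ */
+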
+static int dmem_cgroup_region_capacity_show(struct seq_file *sf, void *v)
+{
+ struct dmem_cgroup_region *region;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) {
+ seq_puts(sf, region->name);
+ seq_printf(sf, " %llu\n", region->size);
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+static int dmemcg_parse_limit(char *options, struct dmem_cgroup_region *region,
+ u64 *new_limit)
+{
+ char *end;
+
+ if (!strcmp(options, "max")) {
+ *new_limit = PAGE_COUNTER_MAX;
+ return 0;
+ }
+
+ *new_limit = memparse(options, &end);
+ if (*end != '\0')
+ return -EINVAL;
+
+ return 0;
+}
+
+static ssize_t dmemcg_limit_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off,
+ void (*apply)(struct dmem_cgroup_pool_state *, u64))
+{
+ struct dmemcg_state *dmemcs = css_to_dmemcs(of_css(of));
+ int err = 0;
+
+ while (buf && !err) {
+ struct dmem_cgroup_pool_state *pool = NULL;
+ char *options, *region_name;
+ struct dmem_cgroup_region *region;
+ u64 new_limit;
+
+ options = buf;
+ buf = strchr(buf, '\n');
+ if (buf)
+ *buf++ = '\0';
+
+ options = strstrip(options);
+
+ /* eat empty lines */
+ if (!options[0])
+ continue;
+
+ region_name = strsep(&options, " \t");
+ if (!region_name[0])
+ continue;
+
+ rcu_read_lock();
+ region = dmemcg_get_region_by_name(region_name);
+ rcu_read_unlock();
+
+ if (!region)
+ return -EINVAL;
+
+ err = dmemcg_parse_limit(options, region, &new_limit);
+ if (err < 0)
+ goto out_put;
+
+ pool = get_cg_pool_unlocked(dmemcs, region);
+ if (IS_ERR(pool)) {
+ err = PTR_ERR(pool);
+ goto out_put;
+ }
+
+ /* And commit */
+ apply(pool, new_limit);
+
+out_put:
+ kref_put(&region->ref, dmemcg_free_region);
+ }
+
+ return err ?: nbytes;
+}
+
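+/*
+ * Example of the interface accepted above (the region name is illustrative):
+ *
+ *        echo "drm/card0/vram0 900M" > dmem.max        # cap the region
+ *        echo "drm/card0/vram0 max"  > dmem.max        # remove the cap
+ *
+ * Several "<region> <limit>" lines may be written in a single call.
+ */
+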
+static int dmemcg_limit_show(struct seq_file *sf, void *v,
+ u64 (*fn)(struct dmem_cgroup_pool_state *))
+{
+ struct dmemcg_state *dmemcs = css_to_dmemcs(seq_css(sf));
+ struct dmem_cgroup_region *region;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) {
+ struct dmem_cgroup_pool_state *pool = find_cg_pool_locked(dmemcs, region);
+ u64 val;
+
+ seq_puts(sf, region->name);
+
+ val = fn(pool);
+ if (val < PAGE_COUNTER_MAX)
+ seq_printf(sf, " %lld\n", val);
+ else
+ seq_puts(sf, " max\n");
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static int dmem_cgroup_region_current_show(struct seq_file *sf, void *v)
+{
+ return dmemcg_limit_show(sf, v, get_resource_current);
+}
+
+static int dmem_cgroup_region_min_show(struct seq_file *sf, void *v)
+{
+ return dmemcg_limit_show(sf, v, get_resource_min);
+}
+
+static ssize_t dmem_cgroup_region_min_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return dmemcg_limit_write(of, buf, nbytes, off, set_resource_min);
+}
+
+static int dmem_cgroup_region_low_show(struct seq_file *sf, void *v)
+{
+ return dmemcg_limit_show(sf, v, get_resource_low);
+}
+
+static ssize_t dmem_cgroup_region_low_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return dmemcg_limit_write(of, buf, nbytes, off, set_resource_low);
+}
+
+static int dmem_cgroup_region_max_show(struct seq_file *sf, void *v)
+{
+ return dmemcg_limit_show(sf, v, get_resource_max);
+}
+
+static ssize_t dmem_cgroup_region_max_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return dmemcg_limit_write(of, buf, nbytes, off, set_resource_max);
+}
+
+static struct cftype files[] = {
+ {
+ .name = "capacity",
+ .seq_show = dmem_cgroup_region_capacity_show,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+ {
+ .name = "current",
+ .seq_show = dmem_cgroup_region_current_show,
+ },
+ {
+ .name = "min",
+ .write = dmem_cgroup_region_min_write,
+ .seq_show = dmem_cgroup_region_min_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "low",
+ .write = dmem_cgroup_region_low_write,
+ .seq_show = dmem_cgroup_region_low_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "max",
+ .write = dmem_cgroup_region_max_write,
+ .seq_show = dmem_cgroup_region_max_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ { } /* Zero entry terminates. */
+};
+
+struct cgroup_subsys dmem_cgrp_subsys = {
+ .css_alloc = dmemcs_alloc,
+ .css_free = dmemcs_free,
+ .css_offline = dmemcs_offline,
+ .legacy_cftypes = files,
+ .dfl_cftypes = files,
+};
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 5877974ece92c..aac91466279f1 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -590,7 +590,6 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
cputime->sum_exec_runtime += user;
cputime->sum_exec_runtime += sys;
- cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
#ifdef CONFIG_SCHED_CORE
bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
diff --git a/kernel/cpu.c b/kernel/cpu.c
index b605334f8ee6a..07455d25329c9 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -905,12 +905,13 @@ static int finish_cpu(unsigned int cpu)
struct mm_struct *mm = idle->active_mm;
/*
- * idle_task_exit() will have switched to &init_mm, now
- * clean up any remaining active_mm state.
+ * sched_force_init_mm() ensured the use of &init_mm,
+ * drop that refcount now that the CPU has stopped.
*/
- if (mm != &init_mm)
- idle->active_mm = &init_mm;
+ WARN_ON(mm != &init_mm);
+ idle->active_mm = NULL;
mmdrop_lazy_tlb(mm);
+
return 0;
}
@@ -2179,7 +2180,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
},
[CPUHP_AP_HRTIMERS_DYING] = {
.name = "hrtimers:dying",
- .startup.single = NULL,
+ .startup.single = hrtimers_cpu_starting,
.teardown.single = hrtimers_cpu_dying,
},
[CPUHP_AP_TICK_DYING] = {
@@ -3128,11 +3129,6 @@ void init_cpu_possible(const struct cpumask *src)
cpumask_copy(&__cpu_possible_mask, src);
}
-void init_cpu_online(const struct cpumask *src)
-{
- cpumask_copy(&__cpu_online_mask, src);
-}
-
void set_cpu_online(unsigned int cpu, bool online)
{
/*
diff --git a/kernel/cred.c b/kernel/cred.c
index da7da250f7c8b..9676965c0981a 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -477,56 +477,6 @@ void abort_creds(struct cred *new)
EXPORT_SYMBOL(abort_creds);
/**
- * override_creds - Override the current process's subjective credentials
- * @new: The credentials to be assigned
- *
- * Install a set of temporary override subjective credentials on the current
- * process, returning the old set for later reversion.
- */
-const struct cred *override_creds(const struct cred *new)
-{
- const struct cred *old;
-
- kdebug("override_creds(%p{%ld})", new,
- atomic_long_read(&new->usage));
-
- /*
- * NOTE! This uses 'get_new_cred()' rather than 'get_cred()'.
- *
- * That means that we do not clear the 'non_rcu' flag, since
- * we are only installing the cred into the thread-synchronous
- * '->cred' pointer, not the '->real_cred' pointer that is
- * visible to other threads under RCU.
- */
- get_new_cred((struct cred *)new);
- old = override_creds_light(new);
-
- kdebug("override_creds() = %p{%ld}", old,
- atomic_long_read(&old->usage));
- return old;
-}
-EXPORT_SYMBOL(override_creds);
-
-/**
- * revert_creds - Revert a temporary subjective credentials override
- * @old: The credentials to be restored
- *
- * Revert a temporary set of override subjective credentials to an old set,
- * discarding the override set.
- */
-void revert_creds(const struct cred *old)
-{
- const struct cred *override = current->cred;
-
- kdebug("revert_creds(%p{%ld})", old,
- atomic_long_read(&old->usage));
-
- revert_creds_light(old);
- put_cred(override);
-}
-EXPORT_SYMBOL(revert_creds);
-
-/**
* cred_fscmp - Compare two credentials with respect to filesystem access.
* @a: The first credential
* @b: The second credential
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 0a39497140bfb..05b137e7dcb95 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -305,7 +305,7 @@ int kdb_putarea_size(unsigned long addr, void *res, size_t size)
/*
* kdb_getphys - Read data from a physical address. Validate the
- * address is in range, use kmap_atomic() to get data
+ * address is in range, use kmap_local_page() to get data
* similar to kdb_getarea() - but for phys addresses
* Inputs:
* res Pointer to the word to receive the result
@@ -324,9 +324,9 @@ static int kdb_getphys(void *res, unsigned long addr, size_t size)
if (!pfn_valid(pfn))
return 1;
page = pfn_to_page(pfn);
- vaddr = kmap_atomic(page);
+ vaddr = kmap_local_page(page);
memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size);
- kunmap_atomic(vaddr);
+ kunmap_local(vaddr);
return 0;
}
@@ -536,21 +536,3 @@ bool kdb_task_state(const struct task_struct *p, const char *mask)
return strchr(mask, state);
}
-
-/* Maintain a small stack of kdb_flags to allow recursion without disturbing
- * the global kdb state.
- */
-
-static int kdb_flags_stack[4], kdb_flags_index;
-
-void kdb_save_flags(void)
-{
- BUG_ON(kdb_flags_index >= ARRAY_SIZE(kdb_flags_stack));
- kdb_flags_stack[kdb_flags_index++] = kdb_flags;
-}
-
-void kdb_restore_flags(void)
-{
- BUG_ON(kdb_flags_index <= 0);
- kdb_flags = kdb_flags_stack[--kdb_flags_index];
-}
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index dead51de8eb5d..eb63a021ac041 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -64,7 +64,7 @@ static int sysctl_delayacct(const struct ctl_table *table, int write, void *buff
return err;
}
-static struct ctl_table kern_delayacct_table[] = {
+static const struct ctl_table kern_delayacct_table[] = {
{
.procname = "task_delayacct",
.data = NULL,
@@ -93,9 +93,9 @@ void __delayacct_tsk_init(struct task_struct *tsk)
/*
* Finish delay accounting for a statistic using its timestamps (@start),
- * accumalator (@total) and @count
+ * accumulator (@total) and @count
*/
-static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count)
+static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *count, u64 *max, u64 *min)
{
s64 ns = local_clock() - *start;
unsigned long flags;
@@ -104,6 +104,10 @@ static void delayacct_end(raw_spinlock_t *lock, u64 *start, u64 *total, u32 *cou
raw_spin_lock_irqsave(lock, flags);
*total += ns;
(*count)++;
+ if (ns > *max)
+ *max = ns;
+ if (*min == 0 || ns < *min)
+ *min = ns;
raw_spin_unlock_irqrestore(lock, flags);
}
}
@@ -122,7 +126,9 @@ void __delayacct_blkio_end(struct task_struct *p)
delayacct_end(&p->delays->lock,
&p->delays->blkio_start,
&p->delays->blkio_delay,
- &p->delays->blkio_count);
+ &p->delays->blkio_count,
+ &p->delays->blkio_delay_max,
+ &p->delays->blkio_delay_min);
}
int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
@@ -153,10 +159,12 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
d->cpu_count += t1;
+ d->cpu_delay_max = tsk->sched_info.max_run_delay;
+ d->cpu_delay_min = tsk->sched_info.min_run_delay;
tmp = (s64)d->cpu_delay_total + t2;
d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
-
tmp = (s64)d->cpu_run_virtual_total + t3;
+
d->cpu_run_virtual_total =
(tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp;
@@ -164,20 +172,33 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
return 0;
/* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
-
raw_spin_lock_irqsave(&tsk->delays->lock, flags);
+ d->blkio_delay_max = tsk->delays->blkio_delay_max;
+ d->blkio_delay_min = tsk->delays->blkio_delay_min;
tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
+ d->swapin_delay_max = tsk->delays->swapin_delay_max;
+ d->swapin_delay_min = tsk->delays->swapin_delay_min;
tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
+ d->freepages_delay_max = tsk->delays->freepages_delay_max;
+ d->freepages_delay_min = tsk->delays->freepages_delay_min;
tmp = d->freepages_delay_total + tsk->delays->freepages_delay;
d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp;
+ d->thrashing_delay_max = tsk->delays->thrashing_delay_max;
+ d->thrashing_delay_min = tsk->delays->thrashing_delay_min;
tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay;
d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp;
+ d->compact_delay_max = tsk->delays->compact_delay_max;
+ d->compact_delay_min = tsk->delays->compact_delay_min;
tmp = d->compact_delay_total + tsk->delays->compact_delay;
d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp;
+ d->wpcopy_delay_max = tsk->delays->wpcopy_delay_max;
+ d->wpcopy_delay_min = tsk->delays->wpcopy_delay_min;
tmp = d->wpcopy_delay_total + tsk->delays->wpcopy_delay;
d->wpcopy_delay_total = (tmp < d->wpcopy_delay_total) ? 0 : tmp;
+ d->irq_delay_max = tsk->delays->irq_delay_max;
+ d->irq_delay_min = tsk->delays->irq_delay_min;
tmp = d->irq_delay_total + tsk->delays->irq_delay;
d->irq_delay_total = (tmp < d->irq_delay_total) ? 0 : tmp;
d->blkio_count += tsk->delays->blkio_count;
@@ -213,7 +234,9 @@ void __delayacct_freepages_end(void)
delayacct_end(&current->delays->lock,
&current->delays->freepages_start,
&current->delays->freepages_delay,
- &current->delays->freepages_count);
+ &current->delays->freepages_count,
+ &current->delays->freepages_delay_max,
+ &current->delays->freepages_delay_min);
}
void __delayacct_thrashing_start(bool *in_thrashing)
@@ -235,7 +258,9 @@ void __delayacct_thrashing_end(bool *in_thrashing)
delayacct_end(&current->delays->lock,
&current->delays->thrashing_start,
&current->delays->thrashing_delay,
- &current->delays->thrashing_count);
+ &current->delays->thrashing_count,
+ &current->delays->thrashing_delay_max,
+ &current->delays->thrashing_delay_min);
}
void __delayacct_swapin_start(void)
@@ -248,7 +273,9 @@ void __delayacct_swapin_end(void)
delayacct_end(&current->delays->lock,
&current->delays->swapin_start,
&current->delays->swapin_delay,
- &current->delays->swapin_count);
+ &current->delays->swapin_count,
+ &current->delays->swapin_delay_max,
+ &current->delays->swapin_delay_min);
}
void __delayacct_compact_start(void)
@@ -261,7 +288,9 @@ void __delayacct_compact_end(void)
delayacct_end(&current->delays->lock,
&current->delays->compact_start,
&current->delays->compact_delay,
- &current->delays->compact_count);
+ &current->delays->compact_count,
+ &current->delays->compact_delay_max,
+ &current->delays->compact_delay_min);
}
void __delayacct_wpcopy_start(void)
@@ -274,7 +303,9 @@ void __delayacct_wpcopy_end(void)
delayacct_end(&current->delays->lock,
&current->delays->wpcopy_start,
&current->delays->wpcopy_delay,
- &current->delays->wpcopy_count);
+ &current->delays->wpcopy_count,
+ &current->delays->wpcopy_delay_max,
+ &current->delays->wpcopy_delay_min);
}
void __delayacct_irq(struct task_struct *task, u32 delta)
@@ -284,6 +315,10 @@ void __delayacct_irq(struct task_struct *task, u32 delta)
raw_spin_lock_irqsave(&task->delays->lock, flags);
task->delays->irq_delay += delta;
task->delays->irq_count++;
+ if (delta > task->delays->irq_delay_max)
+ task->delays->irq_delay_max = delta;
+ if (delta && (!task->delays->irq_delay_min || delta < task->delays->irq_delay_min))
+ task->delays->irq_delay_min = delta;
raw_spin_unlock_irqrestore(&task->delays->lock, flags);
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 065f9188b44a0..bcb09e011e9e1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6277,41 +6277,6 @@ unlock:
}
EXPORT_SYMBOL_GPL(perf_event_update_userpage);
-static vm_fault_t perf_mmap_fault(struct vm_fault *vmf)
-{
- struct perf_event *event = vmf->vma->vm_file->private_data;
- struct perf_buffer *rb;
- vm_fault_t ret = VM_FAULT_SIGBUS;
-
- if (vmf->flags & FAULT_FLAG_MKWRITE) {
- if (vmf->pgoff == 0)
- ret = 0;
- return ret;
- }
-
- rcu_read_lock();
- rb = rcu_dereference(event->rb);
- if (!rb)
- goto unlock;
-
- if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
- goto unlock;
-
- vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
- if (!vmf->page)
- goto unlock;
-
- get_page(vmf->page);
- vmf->page->mapping = vmf->vma->vm_file->f_mapping;
- vmf->page->index = vmf->pgoff;
-
- ret = 0;
-unlock:
- rcu_read_unlock();
-
- return ret;
-}
-
static void ring_buffer_attach(struct perf_event *event,
struct perf_buffer *rb)
{
@@ -6551,13 +6516,87 @@ out_put:
ring_buffer_put(rb); /* could be last */
}
+static vm_fault_t perf_mmap_pfn_mkwrite(struct vm_fault *vmf)
+{
+ /* The first page is the user control page, others are read-only. */
+ return vmf->pgoff == 0 ? 0 : VM_FAULT_SIGBUS;
+}
+
static const struct vm_operations_struct perf_mmap_vmops = {
.open = perf_mmap_open,
.close = perf_mmap_close, /* non mergeable */
- .fault = perf_mmap_fault,
- .page_mkwrite = perf_mmap_fault,
+ .pfn_mkwrite = perf_mmap_pfn_mkwrite,
};
+static int map_range(struct perf_buffer *rb, struct vm_area_struct *vma)
+{
+ unsigned long nr_pages = vma_pages(vma);
+ int err = 0;
+ unsigned long pagenum;
+
+ /*
+ * We map this as a VM_PFNMAP VMA.
+ *
+ * This is not ideal as this is designed broadly for mappings of PFNs
+ * referencing memory-mapped I/O ranges or non-system RAM i.e. for which
+ * !pfn_valid(pfn).
+ *
+ * We are mapping kernel-allocated memory (memory we manage ourselves)
+ * which would more ideally be mapped using vm_insert_page() or a
+ * similar mechanism, that is as a VM_MIXEDMAP mapping.
+ *
+ * However this won't work here, because:
+ *
+ * 1. It uses vma->vm_page_prot, but this field has not been completely
+ * set up at the point of the f_op->mmap() hook, so we are unable to
+ * indicate that this should be mapped CoW in order that the
+ * mkwrite() hook can be invoked to make the first page R/W and the
+ * rest R/O as desired.
+ *
+ * 2. Anything other than a VM_PFNMAP of valid PFNs will result in
+ * vm_normal_page() returning a struct page * pointer, which means
+ * vm_ops->page_mkwrite() will be invoked rather than
+ * vm_ops->pfn_mkwrite(), and this means we have to set page->mapping
+ * to work around retry logic in the fault handler, however this
+ * field is no longer allowed to be used within struct page.
+ *
+ * 3. Having a struct page * made available in the fault logic also
+ * means that the page gets put on the rmap and becomes
+ * inappropriately accessible and subject to map and ref counting.
+ *
+ * Ideally we would have a mechanism that could explicitly express our
+ * desires, but this is not currently the case, so we instead use
+ * VM_PFNMAP.
+ *
+ * We manage the lifetime of these mappings with internal refcounts (see
+ * perf_mmap_open() and perf_mmap_close()) so we ensure the lifetime of
+ * this mapping is maintained correctly.
+ */
+ for (pagenum = 0; pagenum < nr_pages; pagenum++) {
+ unsigned long va = vma->vm_start + PAGE_SIZE * pagenum;
+ struct page *page = perf_mmap_to_page(rb, vma->vm_pgoff + pagenum);
+
+ if (page == NULL) {
+ err = -EINVAL;
+ break;
+ }
+
+ /* Map readonly, perf_mmap_pfn_mkwrite() called on write fault. */
+ err = remap_pfn_range(vma, va, page_to_pfn(page), PAGE_SIZE,
+ vm_get_page_prot(vma->vm_flags & ~VM_SHARED));
+ if (err)
+ break;
+ }
+
+#ifdef CONFIG_MMU
+ /* Clear any partial mappings on error. */
+ if (err)
+ zap_page_range_single(vma, vma->vm_start, nr_pages * PAGE_SIZE, NULL);
+#endif
+
+ return err;
+}
+
static int perf_mmap(struct file *file, struct vm_area_struct *vma)
{
struct perf_event *event = file->private_data;
@@ -6682,6 +6721,8 @@ again:
goto again;
}
+ /* We need the rb to map pages. */
+ rb = event->rb;
goto unlock;
}
@@ -6776,6 +6817,9 @@ aux_unlock:
vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP);
vma->vm_ops = &perf_mmap_vmops;
+ if (!ret)
+ ret = map_range(rb, vma);
+
if (event->pmu->event_mapped)
event->pmu->event_mapped(event, vma->vm_mm);
@@ -10039,8 +10083,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
perf_swevent_overflow(event, 0, data, regs);
}
-static int perf_exclude_event(struct perf_event *event,
- struct pt_regs *regs)
+int perf_exclude_event(struct perf_event *event, struct pt_regs *regs)
{
if (event->hw.state & PERF_HES_STOPPED)
return 1;
@@ -10425,9 +10468,9 @@ static struct pmu perf_tracepoint = {
};
static int perf_tp_filter_match(struct perf_event *event,
- struct perf_sample_data *data)
+ struct perf_raw_record *raw)
{
- void *record = data->raw->frag.data;
+ void *record = raw->frag.data;
/* only top level events have filters set */
if (event->parent)
@@ -10439,7 +10482,7 @@ static int perf_tp_filter_match(struct perf_event *event,
}
static int perf_tp_event_match(struct perf_event *event,
- struct perf_sample_data *data,
+ struct perf_raw_record *raw,
struct pt_regs *regs)
{
if (event->hw.state & PERF_HES_STOPPED)
@@ -10450,7 +10493,7 @@ static int perf_tp_event_match(struct perf_event *event,
if (event->attr.exclude_kernel && !user_mode(regs))
return 0;
- if (!perf_tp_filter_match(event, data))
+ if (!perf_tp_filter_match(event, raw))
return 0;
return 1;
@@ -10476,6 +10519,7 @@ EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
static void __perf_tp_event_target_task(u64 count, void *record,
struct pt_regs *regs,
struct perf_sample_data *data,
+ struct perf_raw_record *raw,
struct perf_event *event)
{
struct trace_entry *entry = record;
@@ -10485,13 +10529,17 @@ static void __perf_tp_event_target_task(u64 count, void *record,
/* Cannot deliver synchronous signal to other task. */
if (event->attr.sigtrap)
return;
- if (perf_tp_event_match(event, data, regs))
+ if (perf_tp_event_match(event, raw, regs)) {
+ perf_sample_data_init(data, 0, 0);
+ perf_sample_save_raw_data(data, event, raw);
perf_swevent_event(event, count, data, regs);
+ }
}
static void perf_tp_event_target_task(u64 count, void *record,
struct pt_regs *regs,
struct perf_sample_data *data,
+ struct perf_raw_record *raw,
struct perf_event_context *ctx)
{
unsigned int cpu = smp_processor_id();
@@ -10499,15 +10547,15 @@ static void perf_tp_event_target_task(u64 count, void *record,
struct perf_event *event, *sibling;
perf_event_groups_for_cpu_pmu(event, &ctx->pinned_groups, cpu, pmu) {
- __perf_tp_event_target_task(count, record, regs, data, event);
+ __perf_tp_event_target_task(count, record, regs, data, raw, event);
for_each_sibling_event(sibling, event)
- __perf_tp_event_target_task(count, record, regs, data, sibling);
+ __perf_tp_event_target_task(count, record, regs, data, raw, sibling);
}
perf_event_groups_for_cpu_pmu(event, &ctx->flexible_groups, cpu, pmu) {
- __perf_tp_event_target_task(count, record, regs, data, event);
+ __perf_tp_event_target_task(count, record, regs, data, raw, event);
for_each_sibling_event(sibling, event)
- __perf_tp_event_target_task(count, record, regs, data, sibling);
+ __perf_tp_event_target_task(count, record, regs, data, raw, sibling);
}
}
@@ -10525,15 +10573,10 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
},
};
- perf_sample_data_init(&data, 0, 0);
- perf_sample_save_raw_data(&data, &raw);
-
perf_trace_buf_update(record, event_type);
hlist_for_each_entry_rcu(event, head, hlist_entry) {
- if (perf_tp_event_match(event, &data, regs)) {
- perf_swevent_event(event, count, &data, regs);
-
+ if (perf_tp_event_match(event, &raw, regs)) {
/*
* Here use the same on-stack perf_sample_data,
* some members in data are event-specific and
@@ -10543,7 +10586,8 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
* because data->sample_flags is set.
*/
perf_sample_data_init(&data, 0, 0);
- perf_sample_save_raw_data(&data, &raw);
+ perf_sample_save_raw_data(&data, event, &raw);
+ perf_swevent_event(event, count, &data, regs);
}
}
@@ -10560,7 +10604,7 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
goto unlock;
raw_spin_lock(&ctx->lock);
- perf_tp_event_target_task(count, record, regs, &data, ctx);
+ perf_tp_event_target_task(count, record, regs, &data, &raw, ctx);
raw_spin_unlock(&ctx->lock);
unlock:
rcu_read_unlock();
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 4f46f688d0d49..180509132d4b6 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -643,7 +643,6 @@ static void rb_free_aux_page(struct perf_buffer *rb, int idx)
struct page *page = virt_to_page(rb->aux_pages[idx]);
ClearPagePrivate(page);
- page->mapping = NULL;
__free_page(page);
}
@@ -819,7 +818,6 @@ static void perf_mmap_free_page(void *addr)
{
struct page *page = virt_to_page(addr);
- page->mapping = NULL;
__free_page(page);
}
@@ -890,28 +888,13 @@ __perf_mmap_to_page(struct perf_buffer *rb, unsigned long pgoff)
return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
}
-static void perf_mmap_unmark_page(void *addr)
-{
- struct page *page = vmalloc_to_page(addr);
-
- page->mapping = NULL;
-}
-
static void rb_free_work(struct work_struct *work)
{
struct perf_buffer *rb;
- void *base;
- int i, nr;
rb = container_of(work, struct perf_buffer, work);
- nr = data_page_nr(rb);
-
- base = rb->user_page;
- /* The '<=' counts in the user page. */
- for (i = 0; i <= nr; i++)
- perf_mmap_unmark_page(base + (i * PAGE_SIZE));
- vfree(base);
+ vfree(rb->user_page);
kfree(rb);
}
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index fa04b14a7d723..bf2a87a0a3787 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -28,6 +28,7 @@
#include <linux/rcupdate_trace.h>
#include <linux/workqueue.h>
#include <linux/srcu.h>
+#include <linux/oom.h> /* check_stable_address_space */
#include <linux/uprobes.h>
@@ -416,7 +417,7 @@ static void update_ref_ctr_warn(struct uprobe *uprobe,
struct mm_struct *mm, short d)
{
pr_warn("ref_ctr %s failed for inode: 0x%lx offset: "
- "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%pK\n",
+ "0x%llx ref_ctr_offset: 0x%llx of mm: 0x%p\n",
d > 0 ? "increment" : "decrement", uprobe->inode->i_ino,
(unsigned long long) uprobe->offset,
(unsigned long long) uprobe->ref_ctr_offset, mm);
@@ -1260,6 +1261,9 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
* returns NULL in find_active_uprobe_rcu().
*/
mmap_write_lock(mm);
+ if (check_stable_address_space(mm))
+ goto unlock;
+
vma = find_vma(mm, info->vaddr);
if (!vma || !valid_vma(vma, is_register) ||
file_inode(vma->vm_file) != uprobe->inode)
@@ -1888,9 +1892,33 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
return instruction_pointer(regs);
}
-static struct return_instance *free_ret_instance(struct return_instance *ri, bool cleanup_hprobe)
+static void ri_pool_push(struct uprobe_task *utask, struct return_instance *ri)
+{
+ ri->cons_cnt = 0;
+ ri->next = utask->ri_pool;
+ utask->ri_pool = ri;
+}
+
+static struct return_instance *ri_pool_pop(struct uprobe_task *utask)
{
- struct return_instance *next = ri->next;
+ struct return_instance *ri = utask->ri_pool;
+
+ if (likely(ri))
+ utask->ri_pool = ri->next;
+
+ return ri;
+}
+
+static void ri_free(struct return_instance *ri)
+{
+ kfree(ri->extra_consumers);
+ kfree_rcu(ri, rcu);
+}
+
+static void free_ret_instance(struct uprobe_task *utask,
+ struct return_instance *ri, bool cleanup_hprobe)
+{
+ unsigned seq;
if (cleanup_hprobe) {
enum hprobe_state hstate;
@@ -1899,8 +1927,22 @@ static struct return_instance *free_ret_instance(struct return_instance *ri, boo
hprobe_finalize(&ri->hprobe, hstate);
}
- kfree_rcu(ri, rcu);
- return next;
+ /*
+ * At this point return_instance is unlinked from utask's
+ * return_instances list and this has become visible to ri_timer().
+ * If seqcount now indicates that ri_timer's return instance
+ * processing loop isn't active, we can return ri into the pool of
+ * to-be-reused return instances for future uretprobes. If ri_timer()
+ * happens to be running right now, though, we fall back to safety and
+ * just perform RCU-delayed freeing of ri.
+ */
+ if (raw_seqcount_try_begin(&utask->ri_seqcount, seq)) {
+ /* immediate reuse of ri without RCU GP is OK */
+ ri_pool_push(utask, ri);
+ } else {
+ /* we might be racing with ri_timer(), so play it safe */
+ ri_free(ri);
+ }
}
/*
@@ -1910,21 +1952,32 @@ static struct return_instance *free_ret_instance(struct return_instance *ri, boo
void uprobe_free_utask(struct task_struct *t)
{
struct uprobe_task *utask = t->utask;
- struct return_instance *ri;
+ struct return_instance *ri, *ri_next;
if (!utask)
return;
+ t->utask = NULL;
WARN_ON_ONCE(utask->active_uprobe || utask->xol_vaddr);
timer_delete_sync(&utask->ri_timer);
ri = utask->return_instances;
- while (ri)
- ri = free_ret_instance(ri, true /* cleanup_hprobe */);
+ while (ri) {
+ ri_next = ri->next;
+ free_ret_instance(utask, ri, true /* cleanup_hprobe */);
+ ri = ri_next;
+ }
+
+ /* free_ret_instance() above might add to ri_pool, so this loop should come last */
+ ri = utask->ri_pool;
+ while (ri) {
+ ri_next = ri->next;
+ ri_free(ri);
+ ri = ri_next;
+ }
kfree(utask);
- t->utask = NULL;
}
#define RI_TIMER_PERIOD (HZ / 10) /* 100 ms */
@@ -1942,8 +1995,12 @@ static void ri_timer(struct timer_list *timer)
/* RCU protects return_instance from freeing. */
guard(rcu)();
+ write_seqcount_begin(&utask->ri_seqcount);
+
for_each_ret_instance_rcu(ri, utask->return_instances)
hprobe_expire(&ri->hprobe, false);
+
+ write_seqcount_end(&utask->ri_seqcount);
}
static struct uprobe_task *alloc_utask(void)
@@ -1955,6 +2012,7 @@ static struct uprobe_task *alloc_utask(void)
return NULL;
timer_setup(&utask->ri_timer, ri_timer, 0);
+ seqcount_init(&utask->ri_seqcount);
return utask;
}
@@ -1974,32 +2032,40 @@ static struct uprobe_task *get_utask(void)
return current->utask;
}
-static size_t ri_size(int consumers_cnt)
+static struct return_instance *alloc_return_instance(struct uprobe_task *utask)
{
struct return_instance *ri;
- return sizeof(*ri) + sizeof(ri->consumers[0]) * consumers_cnt;
-}
-
-#define DEF_CNT 4
-
-static struct return_instance *alloc_return_instance(void)
-{
- struct return_instance *ri;
+ ri = ri_pool_pop(utask);
+ if (ri)
+ return ri;
- ri = kzalloc(ri_size(DEF_CNT), GFP_KERNEL);
+ ri = kzalloc(sizeof(*ri), GFP_KERNEL);
if (!ri)
return ZERO_SIZE_PTR;
- ri->consumers_cnt = DEF_CNT;
return ri;
}
static struct return_instance *dup_return_instance(struct return_instance *old)
{
- size_t size = ri_size(old->consumers_cnt);
+ struct return_instance *ri;
+
+ ri = kmemdup(old, sizeof(*ri), GFP_KERNEL);
+ if (!ri)
+ return NULL;
+
+ if (unlikely(old->cons_cnt > 1)) {
+ ri->extra_consumers = kmemdup(old->extra_consumers,
+ sizeof(ri->extra_consumers[0]) * (old->cons_cnt - 1),
+ GFP_KERNEL);
+ if (!ri->extra_consumers) {
+ kfree(ri);
+ return NULL;
+ }
+ }
- return kmemdup(old, size, GFP_KERNEL);
+ return ri;
}
static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
@@ -2108,14 +2174,17 @@ unsigned long uprobe_get_trampoline_vaddr(void)
static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
struct pt_regs *regs)
{
- struct return_instance *ri = utask->return_instances;
+ struct return_instance *ri = utask->return_instances, *ri_next;
enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL;
while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) {
- ri = free_ret_instance(ri, true /* cleanup_hprobe */);
+ ri_next = ri->next;
+ rcu_assign_pointer(utask->return_instances, ri_next);
utask->depth--;
+
+ free_ret_instance(utask, ri, true /* cleanup_hprobe */);
+ ri = ri_next;
}
- rcu_assign_pointer(utask->return_instances, ri);
}
static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs,
@@ -2180,7 +2249,7 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs,
return;
free:
- kfree(ri);
+ ri_free(ri);
}
/* Prepare to single-step probed instruction out of line. */
@@ -2294,6 +2363,47 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
return is_trap_insn(&opcode);
}
+static struct uprobe *find_active_uprobe_speculative(unsigned long bp_vaddr)
+{
+ struct mm_struct *mm = current->mm;
+ struct uprobe *uprobe = NULL;
+ struct vm_area_struct *vma;
+ struct file *vm_file;
+ loff_t offset;
+ unsigned int seq;
+
+ guard(rcu)();
+
+ if (!mmap_lock_speculate_try_begin(mm, &seq))
+ return NULL;
+
+ vma = vma_lookup(mm, bp_vaddr);
+ if (!vma)
+ return NULL;
+
+ /*
+ * vm_file memory can be reused for another instance of struct file,
+ * but can't be freed from under us, so it's safe to read fields from
+ * it, even if the values are some garbage values; ultimately
+ * find_uprobe_rcu() + mmap_lock_speculate_retry() check will ensure
+ * that whatever we speculatively found is correct
+ */
+ vm_file = READ_ONCE(vma->vm_file);
+ if (!vm_file)
+ return NULL;
+
+ offset = (loff_t)(vma->vm_pgoff << PAGE_SHIFT) + (bp_vaddr - vma->vm_start);
+ uprobe = find_uprobe_rcu(vm_file->f_inode, offset);
+ if (!uprobe)
+ return NULL;
+
+ /* now double check that nothing about MM changed */
+ if (mmap_lock_speculate_retry(mm, seq))
+ return NULL;
+
+ return uprobe;
+}
+
/* assumes being inside RCU protected region */
static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swbp)
{
@@ -2301,10 +2411,14 @@ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb
struct uprobe *uprobe = NULL;
struct vm_area_struct *vma;
+ uprobe = find_active_uprobe_speculative(bp_vaddr);
+ if (uprobe)
+ return uprobe;
+
mmap_read_lock(mm);
vma = vma_lookup(mm, bp_vaddr);
if (vma) {
- if (valid_vma(vma, false)) {
+ if (vma->vm_file) {
struct inode *inode = file_inode(vma->vm_file);
loff_t offset = vaddr_to_offset(vma, bp_vaddr);
@@ -2324,25 +2438,27 @@ static struct uprobe *find_active_uprobe_rcu(unsigned long bp_vaddr, int *is_swb
return uprobe;
}
-static struct return_instance*
-push_consumer(struct return_instance *ri, int idx, __u64 id, __u64 cookie)
+static struct return_instance *push_consumer(struct return_instance *ri, __u64 id, __u64 cookie)
{
+ struct return_consumer *ric;
+
if (unlikely(ri == ZERO_SIZE_PTR))
return ri;
- if (unlikely(idx >= ri->consumers_cnt)) {
- struct return_instance *old_ri = ri;
-
- ri->consumers_cnt += DEF_CNT;
- ri = krealloc(old_ri, ri_size(old_ri->consumers_cnt), GFP_KERNEL);
- if (!ri) {
- kfree(old_ri);
+ if (unlikely(ri->cons_cnt > 0)) {
+ ric = krealloc(ri->extra_consumers, sizeof(*ric) * ri->cons_cnt, GFP_KERNEL);
+ if (!ric) {
+ ri_free(ri);
return ZERO_SIZE_PTR;
}
+ ri->extra_consumers = ric;
}
- ri->consumers[idx].id = id;
- ri->consumers[idx].cookie = cookie;
+ ric = likely(ri->cons_cnt == 0) ? &ri->consumer : &ri->extra_consumers[ri->cons_cnt - 1];
+ ric->id = id;
+ ric->cookie = cookie;
+
+ ri->cons_cnt++;
return ri;
}
@@ -2350,14 +2466,17 @@ static struct return_consumer *
return_consumer_find(struct return_instance *ri, int *iter, int id)
{
struct return_consumer *ric;
- int idx = *iter;
+ int idx;
- for (ric = &ri->consumers[idx]; idx < ri->consumers_cnt; idx++, ric++) {
+ for (idx = *iter; idx < ri->cons_cnt; idx++) {
+ ric = likely(idx == 0) ? &ri->consumer : &ri->extra_consumers[idx - 1];
if (ric->id == id) {
*iter = idx + 1;
return ric;
}
}
+
return NULL;
}
@@ -2371,9 +2490,9 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
struct uprobe_consumer *uc;
bool has_consumers = false, remove = true;
struct return_instance *ri = NULL;
- int push_idx = 0;
+ struct uprobe_task *utask = current->utask;
- current->utask->auprobe = &uprobe->arch;
+ utask->auprobe = &uprobe->arch;
list_for_each_entry_rcu(uc, &uprobe->consumers, cons_node, rcu_read_lock_trace_held()) {
bool session = uc->handler && uc->ret_handler;
@@ -2393,21 +2512,15 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
continue;
if (!ri)
- ri = alloc_return_instance();
+ ri = alloc_return_instance(utask);
if (session)
- ri = push_consumer(ri, push_idx++, uc->id, cookie);
+ ri = push_consumer(ri, uc->id, cookie);
}
- current->utask->auprobe = NULL;
+ utask->auprobe = NULL;
- if (!ZERO_OR_NULL_PTR(ri)) {
- /*
- * The push_idx value has the final number of return consumers,
- * and ri->consumers_cnt has number of allocated consumers.
- */
- ri->consumers_cnt = push_idx;
+ if (!ZERO_OR_NULL_PTR(ri))
prepare_uretprobe(uprobe, regs, ri);
- }
if (remove && has_consumers) {
down_read(&uprobe->register_rwsem);
@@ -2461,7 +2574,7 @@ static struct return_instance *find_next_ret_chain(struct return_instance *ri)
void uprobe_handle_trampoline(struct pt_regs *regs)
{
struct uprobe_task *utask;
- struct return_instance *ri, *next;
+ struct return_instance *ri, *ri_next, *next_chain;
struct uprobe *uprobe;
enum hprobe_state hstate;
bool valid;
@@ -2481,8 +2594,8 @@ void uprobe_handle_trampoline(struct pt_regs *regs)
* or NULL; the latter case means that nobody but ri->func
* could hit this trampoline on return. TODO: sigaltstack().
*/
- next = find_next_ret_chain(ri);
- valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs);
+ next_chain = find_next_ret_chain(ri);
+ valid = !next_chain || arch_uretprobe_is_alive(next_chain, RP_CHECK_RET, regs);
instruction_pointer_set(regs, ri->orig_ret_vaddr);
do {
@@ -2494,7 +2607,9 @@ void uprobe_handle_trampoline(struct pt_regs *regs)
* trampoline addresses on the stack are replaced with correct
* original return addresses
*/
- rcu_assign_pointer(utask->return_instances, ri->next);
+ ri_next = ri->next;
+ rcu_assign_pointer(utask->return_instances, ri_next);
+ utask->depth--;
uprobe = hprobe_consume(&ri->hprobe, &hstate);
if (valid)
@@ -2502,9 +2617,9 @@ void uprobe_handle_trampoline(struct pt_regs *regs)
hprobe_finalize(&ri->hprobe, hstate);
/* We already took care of hprobe, no need to waste more time on that. */
- ri = free_ret_instance(ri, false /* !cleanup_hprobe */);
- utask->depth--;
- } while (ri != next);
+ free_ret_instance(utask, ri, false /* !cleanup_hprobe */);
+ ri = ri_next;
+ } while (ri != next_chain);
} while (!valid);
return;
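
The pool reuse in free_ret_instance() above relies on a small seqcount handshake with ri_timer(), which brackets its RCU walk with write_seqcount_begin()/write_seqcount_end(). The sketch below distills just that recycle-or-defer decision; it is illustrative only and not part of the patch, and the obj/obj_cache names are made up.

/*
 * Illustrative sketch (not from the patch): the recycle-or-defer decision
 * used by free_ret_instance() above.  The walker (ri_timer() in the patch)
 * wraps its RCU loop in write_seqcount_begin()/write_seqcount_end() on the
 * same seqcount.
 */
#include <linux/seqlock.h>
#include <linux/slab.h>

struct obj { struct obj *next; struct rcu_head rcu; };
struct obj_cache { seqcount_t seq; struct obj *pool; };

/* called after 'o' has been unlinked from the RCU-visible list */
static void obj_recycle(struct obj_cache *c, struct obj *o)
{
	unsigned int seq;

	if (raw_seqcount_try_begin(&c->seq, seq)) {
		/* walker is not inside its write section: reuse immediately */
		o->next = c->pool;
		c->pool = o;
	} else {
		/* walker may still see 'o': defer past an RCU grace period */
		kfree_rcu(o, rcu);
	}
}
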
diff --git a/kernel/exit.c b/kernel/exit.c
index 1dcddfe537ee3..3485e5fc499e4 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -85,7 +85,7 @@
static unsigned int oops_limit = 10000;
#ifdef CONFIG_SYSCTL
-static struct ctl_table kern_exit_table[] = {
+static const struct ctl_table kern_exit_table[] = {
{
.procname = "oops_limit",
.data = &oops_limit,
diff --git a/kernel/fork.c b/kernel/fork.c
index 1450b461d196a..735405a9c5f32 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -448,7 +448,7 @@ static bool vma_lock_alloc(struct vm_area_struct *vma)
return false;
init_rwsem(&vma->vm_lock->lock);
- vma->vm_lock_seq = -1;
+ vma->vm_lock_seq = UINT_MAX;
return true;
}
@@ -625,8 +625,8 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
* We depend on the oldmm having properly denied write access to the
* exe_file already.
*/
- if (exe_file && deny_write_access(exe_file))
- pr_warn_once("deny_write_access() failed in %s\n", __func__);
+ if (exe_file && exe_file_deny_write_access(exe_file))
+ pr_warn_once("exe_file_deny_write_access() failed in %s\n", __func__);
}
#ifdef CONFIG_MMU
@@ -639,11 +639,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
LIST_HEAD(uf);
VMA_ITERATOR(vmi, mm, 0);
- uprobe_start_dup_mmap();
- if (mmap_write_lock_killable(oldmm)) {
- retval = -EINTR;
- goto fail_uprobe_end;
- }
+ if (mmap_write_lock_killable(oldmm))
+ return -EINTR;
flush_cache_dup_mm(oldmm);
uprobe_dup_mmap(oldmm, mm);
/*
@@ -763,7 +760,8 @@ loop_out:
mt_set_in_rcu(vmi.mas.tree);
ksm_fork(mm, oldmm);
khugepaged_fork(mm, oldmm);
- } else if (mpnt) {
+ } else {
+
/*
* The entire maple tree has already been duplicated. If the
* mmap duplication fails, mark the failure point with
@@ -771,8 +769,18 @@ loop_out:
* stop releasing VMAs that have not been duplicated after this
* point.
*/
- mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
- mas_store(&vmi.mas, XA_ZERO_ENTRY);
+ if (mpnt) {
+ mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1);
+ mas_store(&vmi.mas, XA_ZERO_ENTRY);
+ /* Avoid OOM iterating a broken tree */
+ set_bit(MMF_OOM_SKIP, &mm->flags);
+ }
+ /*
+ * The mm_struct is going to exit, but the locks will be dropped
+ * first. Setting the mm_struct as unstable is advisable as it is
+ * not fully initialised.
+ */
+ set_bit(MMF_UNSTABLE, &mm->flags);
}
out:
mmap_write_unlock(mm);
@@ -782,8 +790,6 @@ out:
dup_userfaultfd_complete(&uf);
else
dup_userfaultfd_fail(&uf);
-fail_uprobe_end:
- uprobe_end_dup_mmap();
return retval;
fail_nomem_anon_vma_fork:
@@ -1267,9 +1273,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
seqcount_init(&mm->write_protect_seq);
mmap_init_lock(mm);
INIT_LIST_HEAD(&mm->mmlist);
-#ifdef CONFIG_PER_VMA_LOCK
- mm->mm_lock_seq = 0;
-#endif
mm_pgtables_bytes_init(mm);
mm->map_count = 0;
mm->locked_vm = 0;
@@ -1424,13 +1427,13 @@ int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
* We expect the caller (i.e., sys_execve) to have already denied
* write access, so this is unlikely to fail.
*/
- if (unlikely(deny_write_access(new_exe_file)))
+ if (unlikely(exe_file_deny_write_access(new_exe_file)))
return -EACCES;
get_file(new_exe_file);
}
rcu_assign_pointer(mm->exe_file, new_exe_file);
if (old_exe_file) {
- allow_write_access(old_exe_file);
+ exe_file_allow_write_access(old_exe_file);
fput(old_exe_file);
}
return 0;
@@ -1471,7 +1474,7 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
return ret;
}
- ret = deny_write_access(new_exe_file);
+ ret = exe_file_deny_write_access(new_exe_file);
if (ret)
return -EACCES;
get_file(new_exe_file);
@@ -1483,7 +1486,7 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
mmap_write_unlock(mm);
if (old_exe_file) {
- allow_write_access(old_exe_file);
+ exe_file_allow_write_access(old_exe_file);
fput(old_exe_file);
}
return 0;
@@ -1519,12 +1522,13 @@ struct file *get_task_exe_file(struct task_struct *task)
struct file *exe_file = NULL;
struct mm_struct *mm;
+ if (task->flags & PF_KTHREAD)
+ return NULL;
+
task_lock(task);
mm = task->mm;
- if (mm) {
- if (!(task->flags & PF_KTHREAD))
- exe_file = get_mm_exe_file(mm);
- }
+ if (mm)
+ exe_file = get_mm_exe_file(mm);
task_unlock(task);
return exe_file;
}
@@ -1692,9 +1696,11 @@ static struct mm_struct *dup_mm(struct task_struct *tsk,
if (!mm_init(mm, tsk, mm->user_ns))
goto fail_nomem;
+ uprobe_start_dup_mmap();
err = dup_mmap(mm, oldmm);
if (err)
goto free_pt;
+ uprobe_end_dup_mmap();
mm->hiwater_rss = get_mm_rss(mm);
mm->hiwater_vm = mm->total_vm;
@@ -1709,6 +1715,8 @@ free_pt:
mm->binfmt = NULL;
mm_init_owner(mm, NULL);
mmput(mm);
+ if (err)
+ uprobe_end_dup_mmap();
fail_nomem:
return NULL;
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index ebdd76b4ecbba..3db8567f5a44e 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -532,7 +532,8 @@ void futex_q_unlock(struct futex_hash_bucket *hb)
futex_hb_waiters_dec(hb);
}
-void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb)
+void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb,
+ struct task_struct *task)
{
int prio;
@@ -548,7 +549,7 @@ void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb)
plist_node_init(&q->list, prio);
plist_add(&q->list, &hb->chain);
- q->task = current;
+ q->task = task;
}
/**
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index 99b32e728c4ad..6b2f4c7eb720f 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -285,13 +285,15 @@ static inline int futex_get_value_locked(u32 *dest, u32 __user *from)
}
extern void __futex_unqueue(struct futex_q *q);
-extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb);
+extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb,
+ struct task_struct *task);
extern int futex_unqueue(struct futex_q *q);
/**
* futex_queue() - Enqueue the futex_q on the futex_hash_bucket
* @q: The futex_q to enqueue
* @hb: The destination hash bucket
+ * @task: Task queueing this futex
*
* The hb->lock must be held by the caller, and is released here. A call to
* futex_queue() is typically paired with exactly one call to futex_unqueue(). The
@@ -299,11 +301,14 @@ extern int futex_unqueue(struct futex_q *q);
* or nothing if the unqueue is done as part of the wake process and the unqueue
* state is implicit in the state of woken task (see futex_wait_requeue_pi() for
* an example).
+ *
+ * Note that @task may be NULL, for async usage of futexes.
*/
-static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb)
+static inline void futex_queue(struct futex_q *q, struct futex_hash_bucket *hb,
+ struct task_struct *task)
__releases(&hb->lock)
{
- __futex_queue(q, hb);
+ __futex_queue(q, hb, task);
spin_unlock(&hb->lock);
}
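
For clarity on the new @task argument: every caller converted below passes current, while NULL is reserved for future async submitters as the kernel-doc above notes. A minimal, hypothetical waiter would look roughly like this (sketch only; error handling and the futex_q_lock() setup are elided):

static void example_wait(struct futex_q *q, struct futex_hash_bucket *hb)
{
	/* caller already holds hb->lock here; futex_queue() drops it (__releases) */
	set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
	futex_queue(q, hb, current);	/* queue on behalf of the calling task */
	/* an async user would pass NULL instead of current */
	schedule();
}
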
diff --git a/kernel/futex/pi.c b/kernel/futex/pi.c
index d62cca5ed8f4c..7a941845f7eee 100644
--- a/kernel/futex/pi.c
+++ b/kernel/futex/pi.c
@@ -982,7 +982,7 @@ retry_private:
/*
* Only actually queue now that the atomic ops are done:
*/
- __futex_queue(&q, hb);
+ __futex_queue(&q, hb, current);
if (trylock) {
ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
@@ -1020,10 +1020,7 @@ retry_private:
* it sees the futex_q::pi_state.
*/
ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current, &wake_q);
- preempt_disable();
- raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
- wake_up_q(&wake_q);
- preempt_enable();
+ raw_spin_unlock_irq_wake(&q.pi_state->pi_mutex.wait_lock, &wake_q);
if (ret) {
if (ret == 1)
diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c
index 3a10375d95218..25877d4f2f8f3 100644
--- a/kernel/futex/waitwake.c
+++ b/kernel/futex/waitwake.c
@@ -210,13 +210,12 @@ static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) {
if (oparg < 0 || oparg > 31) {
- char comm[sizeof(current->comm)];
/*
* kill this print and return -EINVAL when userspace
* is sane again
*/
pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n",
- get_task_comm(comm, current), oparg);
+ current->comm, oparg);
oparg &= 31;
}
oparg = 1 << oparg;
@@ -350,7 +349,7 @@ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q,
* access to the hash list and forcing another memory barrier.
*/
set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
- futex_queue(q, hb);
+ futex_queue(q, hb, current);
/* Arm the timer */
if (timeout)
@@ -461,7 +460,7 @@ retry:
* next futex. Queue each futex at this moment so hb can
* be unlocked.
*/
- futex_queue(q, hb);
+ futex_queue(q, hb, current);
continue;
}
diff --git a/kernel/gcov/clang.c b/kernel/gcov/clang.c
index 7670a811a5657..8b888a6193ccf 100644
--- a/kernel/gcov/clang.c
+++ b/kernel/gcov/clang.c
@@ -264,10 +264,10 @@ int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2)
/**
* gcov_info_add - add up profiling data
- * @dest: profiling data set to which data is added
- * @source: profiling data set which is added
+ * @dst: profiling data set to which data is added
+ * @src: profiling data set which is added
*
- * Adds profiling counts of @source to @dest.
+ * Adds profiling counts of @src to @dst.
*/
void gcov_info_add(struct gcov_info *dst, struct gcov_info *src)
{
diff --git a/kernel/gen_kheaders.sh b/kernel/gen_kheaders.sh
index 383fd43ac6122..00529c81cc401 100755
--- a/kernel/gen_kheaders.sh
+++ b/kernel/gen_kheaders.sh
@@ -7,20 +7,13 @@ set -e
sfile="$(readlink -f "$0")"
outdir="$(pwd)"
tarfile=$1
-cpio_dir=$outdir/${tarfile%/*}/.tmp_cpio_dir
+tmpdir=$outdir/${tarfile%/*}/.tmp_dir
dir_list="
include/
arch/$SRCARCH/include/
"
-if ! command -v cpio >/dev/null; then
- echo >&2 "***"
- echo >&2 "*** 'cpio' could not be found."
- echo >&2 "***"
- exit 1
-fi
-
# Support incremental builds by skipping archive generation
# if timestamps of files being archived are not changed.
@@ -48,9 +41,9 @@ all_dirs="$all_dirs $dir_list"
# check include/generated/autoconf.h explicitly.
#
# Ignore them for md5 calculation to avoid pointless regeneration.
-headers_md5="$(find $all_dirs -name "*.h" |
- grep -v "include/generated/utsversion.h" |
- grep -v "include/generated/autoconf.h" |
+headers_md5="$(find $all_dirs -name "*.h" -a \
+ ! -path include/generated/utsversion.h -a \
+ ! -path include/generated/autoconf.h |
xargs ls -l | md5sum | cut -d ' ' -f1)"
# Any changes to this script will also cause a rebuild of the archive.
@@ -65,35 +58,43 @@ fi
echo " GEN $tarfile"
-rm -rf $cpio_dir
-mkdir $cpio_dir
+rm -rf "${tmpdir}"
+mkdir "${tmpdir}"
if [ "$building_out_of_srctree" ]; then
(
cd $srctree
for f in $dir_list
do find "$f" -name "*.h";
- done | cpio --quiet -pd $cpio_dir
+ done | tar -c -f - -T - | tar -xf - -C "${tmpdir}"
)
fi
-# The second CPIO can complain if files already exist which can happen with out
-# of tree builds having stale headers in srctree. Just silence CPIO for now.
for f in $dir_list;
do find "$f" -name "*.h";
-done | cpio --quiet -pdu $cpio_dir >/dev/null 2>&1
+done | tar -c -f - -T - | tar -xf - -C "${tmpdir}"
+
+# Always exclude include/generated/utsversion.h
+# Otherwise, the contents of the tarball may vary depending on the build steps.
+rm -f "${tmpdir}/include/generated/utsversion.h"
# Remove comments except SPDX lines
-find $cpio_dir -type f -print0 |
- xargs -0 -P8 -n1 perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;'
+# Use a temporary file to store directory contents to prevent find/xargs from
+# seeing temporary files created by perl.
+find "${tmpdir}" -type f -print0 > "${tmpdir}.contents.txt"
+xargs -0 -P8 -n1 \
+ perl -pi -e 'BEGIN {undef $/;}; s/\/\*((?!SPDX).)*?\*\///smg;' \
+ < "${tmpdir}.contents.txt"
+rm -f "${tmpdir}.contents.txt"
# Create archive and try to normalize metadata for reproducibility.
tar "${KBUILD_BUILD_TIMESTAMP:+--mtime=$KBUILD_BUILD_TIMESTAMP}" \
+ --exclude=".__afs*" --exclude=".nfs*" \
--owner=0 --group=0 --sort=name --numeric-owner --mode=u=rw,go=r,a+X \
- -I $XZ -cf $tarfile -C $cpio_dir/ . > /dev/null
+ -I $XZ -cf $tarfile -C "${tmpdir}/" . > /dev/null
echo $headers_md5 > kernel/kheaders.md5
echo "$this_file_md5" >> kernel/kheaders.md5
echo "$(md5sum $tarfile | cut -d ' ' -f1)" >> kernel/kheaders.md5
-rm -rf $cpio_dir
+rm -rf "${tmpdir}"
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index c18717189f322..04efa7a6e69bf 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -147,6 +147,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
print_tainted(), init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
+ if (t->flags & PF_POSTCOREDUMP)
+ pr_err(" Blocked by coredump.\n");
pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
" disables this message.\n");
sched_show_task(t);
@@ -272,7 +274,7 @@ static int proc_dohung_task_timeout_secs(const struct ctl_table *table, int writ
* and hung_task_check_interval_secs
*/
static const unsigned long hung_task_timeout_max = (LONG_MAX / HZ);
-static struct ctl_table hung_task_sysctls[] = {
+static const struct ctl_table hung_task_sysctls[] = {
#ifdef CONFIG_SMP
{
.procname = "hung_task_all_cpu_backtrace",
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 529adb1f58593..875f25ed6f710 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -141,6 +141,12 @@ config GENERIC_IRQ_DEBUGFS
If you don't know what to do here, say N.
+# Clear forwarded VM interrupts during kexec.
+# This option ensures the kernel clears active states for interrupts
+# forwarded to virtual machines (VMs) during a machine kexec.
+config GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD
+ bool
+
endmenu
config GENERIC_IRQ_MULTI_HANDLER
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index f19d3080bf11a..c0f44c06d69df 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,6 +1,6 @@
# SPDX-License-Identifier: GPL-2.0
-obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
+obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o kexec.o
obj-$(CONFIG_IRQ_TIMINGS) += timings.o
ifeq ($(CONFIG_TEST_IRQ_TIMINGS),y)
CFLAGS_timings.o += -DDEBUG
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 271e9139de77f..c901436ebd9f4 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -1114,13 +1114,11 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
trigger = irqd_get_trigger_type(&desc->irq_data);
irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
- IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT);
+ IRQD_TRIGGER_MASK | IRQD_LEVEL);
if (irq_settings_has_no_balance_set(desc))
irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
if (irq_settings_is_per_cpu(desc))
irqd_set(&desc->irq_data, IRQD_PER_CPU);
- if (irq_settings_can_move_pcntxt(desc))
- irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT);
if (irq_settings_is_level(desc))
irqd_set(&desc->irq_data, IRQD_LEVEL);
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index c6ffb97966bed..ca142b9a4db3d 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -53,6 +53,7 @@ static const struct irq_bit_descr irqchip_flags[] = {
BIT_MASK_DESCR(IRQCHIP_SUPPORTS_NMI),
BIT_MASK_DESCR(IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND),
BIT_MASK_DESCR(IRQCHIP_IMMUTABLE),
+ BIT_MASK_DESCR(IRQCHIP_MOVE_DEFERRED),
};
static void
@@ -108,7 +109,6 @@ static const struct irq_bit_descr irqdata_states[] = {
BIT_MASK_DESCR(IRQD_NO_BALANCING),
BIT_MASK_DESCR(IRQD_SINGLE_TARGET),
- BIT_MASK_DESCR(IRQD_MOVE_PCNTXT),
BIT_MASK_DESCR(IRQD_AFFINITY_SET),
BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING),
BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED),
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 32ffcbb87fa12..c4a8bca5f2b0f 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -162,6 +162,7 @@ void irq_gc_mask_disable_and_ack_set(struct irq_data *d)
irq_reg_writel(gc, mask, ct->regs.ack);
irq_gc_unlock(gc);
}
+EXPORT_SYMBOL_GPL(irq_gc_mask_disable_and_ack_set);
/**
* irq_gc_eoi - EOI interrupt
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index fe0272cd84a51..a979523640d0a 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -421,7 +421,7 @@ irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
#ifdef CONFIG_GENERIC_PENDING_IRQ
static inline bool irq_can_move_pcntxt(struct irq_data *data)
{
- return irqd_can_move_in_process_context(data);
+ return !(data->chip->flags & IRQCHIP_MOVE_DEFERRED);
}
static inline bool irq_move_pending(struct irq_data *data)
{
@@ -441,10 +441,6 @@ static inline struct cpumask *irq_desc_get_pending_mask(struct irq_desc *desc)
{
return desc->pending_mask;
}
-static inline bool handle_enforce_irqctx(struct irq_data *data)
-{
- return irqd_is_handle_enforce_irqctx(data);
-}
bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear);
#else /* CONFIG_GENERIC_PENDING_IRQ */
static inline bool irq_can_move_pcntxt(struct irq_data *data)
@@ -471,10 +467,6 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear)
{
return false;
}
-static inline bool handle_enforce_irqctx(struct irq_data *data)
-{
- return false;
-}
#endif /* !CONFIG_GENERIC_PENDING_IRQ */
#if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY)
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 0253e77fcd9a6..2878307397833 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -708,7 +708,7 @@ int handle_irq_desc(struct irq_desc *desc)
return -EINVAL;
data = irq_desc_get_irq_data(desc);
- if (WARN_ON_ONCE(!in_hardirq() && handle_enforce_irqctx(data)))
+ if (WARN_ON_ONCE(!in_hardirq() && irqd_is_handle_enforce_irqctx(data)))
return -EPERM;
generic_handle_irq_desc(desc);
diff --git a/kernel/irq/kexec.c b/kernel/irq/kexec.c
new file mode 100644
index 0000000000000..1a3deffe6b5b6
--- /dev/null
+++ b/kernel/irq/kexec.c
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/irqdesc.h>
+#include <linux/irqnr.h>
+
+#include "internals.h"
+
+void machine_kexec_mask_interrupts(void)
+{
+ struct irq_desc *desc;
+ unsigned int i;
+
+ for_each_irq_desc(i, desc) {
+ struct irq_chip *chip;
+ int check_eoi = 1;
+
+ chip = irq_desc_get_chip(desc);
+ if (!chip || !irqd_is_started(&desc->irq_data))
+ continue;
+
+ if (IS_ENABLED(CONFIG_GENERIC_IRQ_KEXEC_CLEAR_VM_FORWARD)) {
+ /*
+ * First try to remove the active state from an interrupt which is forwarded
+ * to a VM. If the interrupt is not forwarded, try to EOI the interrupt.
+ */
+ check_eoi = irq_set_irqchip_state(i, IRQCHIP_STATE_ACTIVE, false);
+ }
+
+ if (check_eoi && chip->irq_eoi && irqd_irq_inprogress(&desc->irq_data))
+ chip->irq_eoi(&desc->irq_data);
+
+ irq_shutdown(desc);
+ }
+}
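
The new helper above centralizes logic that architectures previously open-coded. A hypothetical arch-side caller would invoke it roughly as follows (illustrative only; the hook name and surrounding calls are assumptions, not taken from this patch):

/* hypothetical architecture hook, for illustration only */
void machine_crash_shutdown(struct pt_regs *regs)
{
	local_irq_disable();
	crash_save_cpu(regs, smp_processor_id());	/* snapshot this CPU's registers */
	machine_kexec_mask_interrupts();		/* EOI/deactivate and shut down started IRQs */
}
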
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index f0803d6bd2969..f300bb6be3bd4 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1182,45 +1182,38 @@ out_unlock:
}
/*
- * Interrupts which are not explicitly requested as threaded
- * interrupts rely on the implicit bh/preempt disable of the hard irq
- * context. So we need to disable bh here to avoid deadlocks and other
- * side effects.
+ * Interrupts explicitly requested as threaded interrupts want to be
+ * preemptible - many of them need to sleep and wait for slow busses to
+ * complete.
*/
-static irqreturn_t
-irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
+static irqreturn_t irq_thread_fn(struct irq_desc *desc, struct irqaction *action)
{
- irqreturn_t ret;
+ irqreturn_t ret = action->thread_fn(action->irq, action->dev_id);
- local_bh_disable();
- if (!IS_ENABLED(CONFIG_PREEMPT_RT))
- local_irq_disable();
- ret = action->thread_fn(action->irq, action->dev_id);
if (ret == IRQ_HANDLED)
atomic_inc(&desc->threads_handled);
irq_finalize_oneshot(desc, action);
- if (!IS_ENABLED(CONFIG_PREEMPT_RT))
- local_irq_enable();
- local_bh_enable();
return ret;
}
/*
- * Interrupts explicitly requested as threaded interrupts want to be
- * preemptible - many of them need to sleep and wait for slow busses to
- * complete.
+ * Interrupts which are not explicitly requested as threaded
+ * interrupts rely on the implicit bh/preempt disable of the hard irq
+ * context. So we need to disable bh here to avoid deadlocks and other
+ * side effects.
*/
-static irqreturn_t irq_thread_fn(struct irq_desc *desc,
- struct irqaction *action)
+static irqreturn_t irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
{
irqreturn_t ret;
- ret = action->thread_fn(action->irq, action->dev_id);
- if (ret == IRQ_HANDLED)
- atomic_inc(&desc->threads_handled);
-
- irq_finalize_oneshot(desc, action);
+ local_bh_disable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_disable();
+ ret = irq_thread_fn(desc, action);
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+ local_irq_enable();
+ local_bh_enable();
return ret;
}
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index b07a2d732ffbc..1b7fa72968bd6 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -53,7 +53,7 @@ static int irq_sw_resend(struct irq_desc *desc)
* Validate whether this interrupt can be safely injected from
* non interrupt context
*/
- if (handle_enforce_irqctx(&desc->irq_data))
+ if (irqd_is_handle_enforce_irqctx(&desc->irq_data))
return -EINVAL;
/*
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index 7b7efb1a114bd..00b3bd127692c 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -11,7 +11,6 @@ enum {
_IRQ_NOREQUEST = IRQ_NOREQUEST,
_IRQ_NOTHREAD = IRQ_NOTHREAD,
_IRQ_NOAUTOEN = IRQ_NOAUTOEN,
- _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT,
_IRQ_NO_BALANCING = IRQ_NO_BALANCING,
_IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
_IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
@@ -142,11 +141,6 @@ static inline void irq_settings_set_noprobe(struct irq_desc *desc)
desc->status_use_accessors |= _IRQ_NOPROBE;
}
-static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc)
-{
- return desc->status_use_accessors & _IRQ_MOVE_PCNTXT;
-}
-
static inline bool irq_settings_can_autoenable(struct irq_desc *desc)
{
return !(desc->status_use_accessors & _IRQ_NOAUTOEN);
diff --git a/kernel/irq/timings.c b/kernel/irq/timings.c
index c43e2ac2f8def..4b7315e99bd66 100644
--- a/kernel/irq/timings.c
+++ b/kernel/irq/timings.c
@@ -509,6 +509,7 @@ static inline void irq_timings_store(int irq, struct irqt_stat *irqs, u64 ts)
/**
* irq_timings_next_event - Return when the next event is supposed to arrive
+ * @now: current time
*
* During the last busy cycle, the number of interrupts is incremented
* and stored in the irq_timings structure. This information is
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index 2f4fb336dda17..73f7e1fd4ab4d 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -147,7 +147,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
if (!irq_work_claim(work))
return false;
- kasan_record_aux_stack_noalloc(work);
+ kasan_record_aux_stack(work);
preempt_disable();
if (cpu != smp_processor_id()) {
diff --git a/kernel/kallsyms_selftest.c b/kernel/kallsyms_selftest.c
index 873f7c445488c..cf4af5728307b 100644
--- a/kernel/kallsyms_selftest.c
+++ b/kernel/kallsyms_selftest.c
@@ -435,13 +435,11 @@ static int __init kallsyms_test_init(void)
{
struct task_struct *t;
- t = kthread_create(test_entry, NULL, "kallsyms_test");
+ t = kthread_run_on_cpu(test_entry, NULL, 0, "kallsyms_test");
if (IS_ERR(t)) {
pr_info("Create kallsyms selftest task failed\n");
return PTR_ERR(t);
}
- kthread_bind(t, 0);
- wake_up_process(t);
return 0;
}
diff --git a/kernel/kcov.c b/kernel/kcov.c
index 28a6be6e64fdd..187ba1b80bda1 100644
--- a/kernel/kcov.c
+++ b/kernel/kcov.c
@@ -166,7 +166,7 @@ static void kcov_remote_area_put(struct kcov_remote_area *area,
* Unlike in_serving_softirq(), this function returns false when called during
* a hardirq or an NMI that happened in the softirq context.
*/
-static inline bool in_softirq_really(void)
+static __always_inline bool in_softirq_really(void)
{
return in_serving_softirq() && !in_hardirq() && !in_nmi();
}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index c0caa14880c3b..c0bdc1686154d 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -925,7 +925,7 @@ static int kexec_limit_handler(const struct ctl_table *table, int write,
return proc_dointvec(&tmp, write, buffer, lenp, ppos);
}
-static struct ctl_table kexec_core_sysctls[] = {
+static const struct ctl_table kexec_core_sysctls[] = {
{
.procname = "kexec_load_disabled",
.data = &kexec_load_disabled,
@@ -1001,6 +1001,12 @@ int kernel_kexec(void)
#ifdef CONFIG_KEXEC_JUMP
if (kexec_image->preserve_context) {
+ /*
+ * This flow is analogous to hibernation flows that occur
+ * before creating an image and before jumping from the
+ * restore kernel to the image one, so it uses the same
+ * device callbacks as those two flows.
+ */
pm_prepare_console();
error = freeze_processes();
if (error) {
@@ -1011,12 +1017,10 @@ int kernel_kexec(void)
error = dpm_suspend_start(PMSG_FREEZE);
if (error)
goto Resume_console;
- /* At this point, dpm_suspend_start() has been called,
- * but *not* dpm_suspend_end(). We *must* call
- * dpm_suspend_end() now. Otherwise, drivers for
- * some devices (e.g. interrupt controllers) become
- * desynchronized with the actual state of the
- * hardware at resume time, and evil weirdness ensues.
+ /*
+ * dpm_suspend_end() must be called after dpm_suspend_start()
+ * to complete the transition, like in the hibernation flows
+ * mentioned above.
*/
error = dpm_suspend_end(PMSG_FREEZE);
if (error)
@@ -1052,6 +1056,13 @@ int kernel_kexec(void)
#ifdef CONFIG_KEXEC_JUMP
if (kexec_image->preserve_context) {
+ /*
+ * This flow is analogous to hibernation flows that occur after
+ * creating an image and after the image kernel has got control
+ * back, and in case the devices have been reset or otherwise
+ * manipulated in the meantime, it uses the device callbacks
+ * used by the latter.
+ */
syscore_resume();
Enable_irqs:
local_irq_enable();
diff --git a/kernel/kheaders.c b/kernel/kheaders.c
index 42163c9e94e55..378088b07f46d 100644
--- a/kernel/kheaders.c
+++ b/kernel/kheaders.c
@@ -29,25 +29,12 @@ asm (
extern char kernel_headers_data[];
extern char kernel_headers_data_end[];
-static ssize_t
-ikheaders_read(struct file *file, struct kobject *kobj,
- struct bin_attribute *bin_attr,
- char *buf, loff_t off, size_t len)
-{
- memcpy(buf, &kernel_headers_data[off], len);
- return len;
-}
-
-static struct bin_attribute kheaders_attr __ro_after_init = {
- .attr = {
- .name = "kheaders.tar.xz",
- .mode = 0444,
- },
- .read = &ikheaders_read,
-};
+static struct bin_attribute kheaders_attr __ro_after_init =
+ __BIN_ATTR_SIMPLE_RO(kheaders.tar.xz, 0444);
static int __init ikheaders_init(void)
{
+ kheaders_attr.private = kernel_headers_data;
kheaders_attr.size = (kernel_headers_data_end -
kernel_headers_data);
return sysfs_create_bin_file(kernel_kobj, &kheaders_attr);
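
Most of the kprobes.c changes below replace open-coded lock/unlock pairs and goto-based cleanup with the scope-based guards from <linux/cleanup.h> (guard(), scoped_guard()), which the patch includes. A hypothetical example of the resulting shape, not part of the patch (demo_update() and its data are made up):

#include <linux/cleanup.h>
#include <linux/cpu.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(demo_mutex);
static int demo_count;

static int demo_update(int delta)
{
	guard(mutex)(&demo_mutex);	/* mutex_unlock() runs automatically on every return path */

	if (delta == 0)
		return -EINVAL;		/* early return: no explicit unlock or goto needed */

	scoped_guard(cpus_read_lock) {	/* cpus_read_unlock() at the closing brace */
		demo_count += delta;
	}
	return 0;
}
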
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b027a4030976a..88aeac84e4c05 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -39,6 +39,7 @@
#include <linux/static_call.h>
#include <linux/perf_event.h>
#include <linux/execmem.h>
+#include <linux/cleanup.h>
#include <asm/sections.h>
#include <asm/cacheflush.h>
@@ -140,45 +141,39 @@ static int collect_garbage_slots(struct kprobe_insn_cache *c);
kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
{
struct kprobe_insn_page *kip;
- kprobe_opcode_t *slot = NULL;
/* Since the slot array is not protected by rcu, we need a mutex */
- mutex_lock(&c->mutex);
- retry:
- rcu_read_lock();
- list_for_each_entry_rcu(kip, &c->pages, list) {
- if (kip->nused < slots_per_page(c)) {
- int i;
-
- for (i = 0; i < slots_per_page(c); i++) {
- if (kip->slot_used[i] == SLOT_CLEAN) {
- kip->slot_used[i] = SLOT_USED;
- kip->nused++;
- slot = kip->insns + (i * c->insn_size);
- rcu_read_unlock();
- goto out;
+ guard(mutex)(&c->mutex);
+ do {
+ guard(rcu)();
+ list_for_each_entry_rcu(kip, &c->pages, list) {
+ if (kip->nused < slots_per_page(c)) {
+ int i;
+
+ for (i = 0; i < slots_per_page(c); i++) {
+ if (kip->slot_used[i] == SLOT_CLEAN) {
+ kip->slot_used[i] = SLOT_USED;
+ kip->nused++;
+ return kip->insns + (i * c->insn_size);
+ }
}
+ /* kip->nused is broken. Fix it. */
+ kip->nused = slots_per_page(c);
+ WARN_ON(1);
}
- /* kip->nused is broken. Fix it. */
- kip->nused = slots_per_page(c);
- WARN_ON(1);
}
- }
- rcu_read_unlock();
-
/* If there are any garbage slots, collect it and try again. */
- if (c->nr_garbage && collect_garbage_slots(c) == 0)
- goto retry;
+ } while (c->nr_garbage && collect_garbage_slots(c) == 0);
/* All out of space. Need to allocate a new page. */
kip = kmalloc(struct_size(kip, slot_used, slots_per_page(c)), GFP_KERNEL);
if (!kip)
- goto out;
+ return NULL;
kip->insns = c->alloc();
if (!kip->insns) {
kfree(kip);
- goto out;
+ return NULL;
}
INIT_LIST_HEAD(&kip->list);
memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c));
@@ -187,14 +182,12 @@ kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
kip->ngarbage = 0;
kip->cache = c;
list_add_rcu(&kip->list, &c->pages);
- slot = kip->insns;
/* Record the perf ksymbol register event after adding the page */
perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_OOL, (unsigned long)kip->insns,
PAGE_SIZE, false, c->sym);
-out:
- mutex_unlock(&c->mutex);
- return slot;
+
+ return kip->insns;
}
/* Return true if all garbages are collected, otherwise false. */
@@ -249,25 +242,35 @@ static int collect_garbage_slots(struct kprobe_insn_cache *c)
return 0;
}
-void __free_insn_slot(struct kprobe_insn_cache *c,
- kprobe_opcode_t *slot, int dirty)
+static long __find_insn_page(struct kprobe_insn_cache *c,
+ kprobe_opcode_t *slot, struct kprobe_insn_page **pkip)
{
- struct kprobe_insn_page *kip;
+ struct kprobe_insn_page *kip = NULL;
long idx;
- mutex_lock(&c->mutex);
- rcu_read_lock();
+ guard(rcu)();
list_for_each_entry_rcu(kip, &c->pages, list) {
idx = ((long)slot - (long)kip->insns) /
(c->insn_size * sizeof(kprobe_opcode_t));
- if (idx >= 0 && idx < slots_per_page(c))
- goto out;
+ if (idx >= 0 && idx < slots_per_page(c)) {
+ *pkip = kip;
+ return idx;
+ }
}
/* Could not find this slot. */
WARN_ON(1);
- kip = NULL;
-out:
- rcu_read_unlock();
+ *pkip = NULL;
+ return -1;
+}
+
+void __free_insn_slot(struct kprobe_insn_cache *c,
+ kprobe_opcode_t *slot, int dirty)
+{
+ struct kprobe_insn_page *kip = NULL;
+ long idx;
+
+ guard(mutex)(&c->mutex);
+ idx = __find_insn_page(c, slot, &kip);
/* Mark and sweep: this may sleep */
if (kip) {
/* Check double free */
@@ -281,7 +284,6 @@ out:
collect_one_slot(kip, idx);
}
}
- mutex_unlock(&c->mutex);
}
/*
@@ -600,47 +602,43 @@ static void kick_kprobe_optimizer(void)
/* Kprobe jump optimizer */
static void kprobe_optimizer(struct work_struct *work)
{
- mutex_lock(&kprobe_mutex);
- cpus_read_lock();
- mutex_lock(&text_mutex);
+ guard(mutex)(&kprobe_mutex);
- /*
- * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
- * kprobes before waiting for quiesence period.
- */
- do_unoptimize_kprobes();
+ scoped_guard(cpus_read_lock) {
+ guard(mutex)(&text_mutex);
- /*
- * Step 2: Wait for quiesence period to ensure all potentially
- * preempted tasks to have normally scheduled. Because optprobe
- * may modify multiple instructions, there is a chance that Nth
- * instruction is preempted. In that case, such tasks can return
- * to 2nd-Nth byte of jump instruction. This wait is for avoiding it.
- * Note that on non-preemptive kernel, this is transparently converted
- * to synchronoze_sched() to wait for all interrupts to have completed.
- */
- synchronize_rcu_tasks();
+ /*
+ * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed)
+ * kprobes before waiting for quiescence period.
+ */
+ do_unoptimize_kprobes();
- /* Step 3: Optimize kprobes after quiesence period */
- do_optimize_kprobes();
+ /*
+ * Step 2: Wait for quiescence period to ensure that all potentially
+ * preempted tasks have been normally scheduled. Because optprobe
+ * may modify multiple instructions, there is a chance that Nth
+ * instruction is preempted. In that case, such tasks can return
+ * to 2nd-Nth byte of jump instruction. This wait is for avoiding it.
+ * Note that on non-preemptive kernel, this is transparently converted
+ * to synchronize_sched() to wait for all interrupts to have completed.
+ */
+ synchronize_rcu_tasks();
- /* Step 4: Free cleaned kprobes after quiesence period */
- do_free_cleaned_kprobes();
+ /* Step 3: Optimize kprobes after quiescence period */
+ do_optimize_kprobes();
- mutex_unlock(&text_mutex);
- cpus_read_unlock();
+ /* Step 4: Free cleaned kprobes after quiescence period */
+ do_free_cleaned_kprobes();
+ }
/* Step 5: Kick optimizer again if needed */
if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
kick_kprobe_optimizer();
-
- mutex_unlock(&kprobe_mutex);
}
-/* Wait for completing optimization and unoptimization */
-void wait_for_kprobe_optimizer(void)
+static void wait_for_kprobe_optimizer_locked(void)
{
- mutex_lock(&kprobe_mutex);
+ lockdep_assert_held(&kprobe_mutex);
while (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) {
mutex_unlock(&kprobe_mutex);
@@ -652,8 +650,14 @@ void wait_for_kprobe_optimizer(void)
mutex_lock(&kprobe_mutex);
}
+}
- mutex_unlock(&kprobe_mutex);
+/* Wait for completing optimization and unoptimization */
+void wait_for_kprobe_optimizer(void)
+{
+ guard(mutex)(&kprobe_mutex);
+
+ wait_for_kprobe_optimizer_locked();
}
bool optprobe_queued_unopt(struct optimized_kprobe *op)
@@ -852,29 +856,24 @@ static void try_to_optimize_kprobe(struct kprobe *p)
return;
/* For preparing optimization, jump_label_text_reserved() is called. */
- cpus_read_lock();
- jump_label_lock();
- mutex_lock(&text_mutex);
+ guard(cpus_read_lock)();
+ guard(jump_label_lock)();
+ guard(mutex)(&text_mutex);
ap = alloc_aggr_kprobe(p);
if (!ap)
- goto out;
+ return;
op = container_of(ap, struct optimized_kprobe, kp);
if (!arch_prepared_optinsn(&op->optinsn)) {
/* If failed to setup optimizing, fallback to kprobe. */
arch_remove_optimized_kprobe(op);
kfree(op);
- goto out;
+ return;
}
init_aggr_kprobe(ap, p);
optimize_kprobe(ap); /* This just kicks optimizer thread. */
-
-out:
- mutex_unlock(&text_mutex);
- jump_label_unlock();
- cpus_read_unlock();
}
static void optimize_all_kprobes(void)
@@ -883,10 +882,10 @@ static void optimize_all_kprobes(void)
struct kprobe *p;
unsigned int i;
- mutex_lock(&kprobe_mutex);
+ guard(mutex)(&kprobe_mutex);
/* If optimization is already allowed, just return. */
if (kprobes_allow_optimization)
- goto out;
+ return;
cpus_read_lock();
kprobes_allow_optimization = true;
@@ -898,8 +897,6 @@ static void optimize_all_kprobes(void)
}
cpus_read_unlock();
pr_info("kprobe jump-optimization is enabled. All kprobes are optimized if possible.\n");
-out:
- mutex_unlock(&kprobe_mutex);
}
#ifdef CONFIG_SYSCTL
@@ -909,12 +906,10 @@ static void unoptimize_all_kprobes(void)
struct kprobe *p;
unsigned int i;
- mutex_lock(&kprobe_mutex);
+ guard(mutex)(&kprobe_mutex);
/* If optimization is already prohibited, just return. */
- if (!kprobes_allow_optimization) {
- mutex_unlock(&kprobe_mutex);
+ if (!kprobes_allow_optimization)
return;
- }
cpus_read_lock();
kprobes_allow_optimization = false;
@@ -926,10 +921,8 @@ static void unoptimize_all_kprobes(void)
}
}
cpus_read_unlock();
- mutex_unlock(&kprobe_mutex);
-
/* Wait for unoptimizing completion. */
- wait_for_kprobe_optimizer();
+ wait_for_kprobe_optimizer_locked();
pr_info("kprobe jump-optimization is disabled. All kprobes are based on software breakpoint.\n");
}
@@ -941,7 +934,7 @@ static int proc_kprobes_optimization_handler(const struct ctl_table *table,
{
int ret;
- mutex_lock(&kprobe_sysctl_mutex);
+ guard(mutex)(&kprobe_sysctl_mutex);
sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
@@ -949,12 +942,11 @@ static int proc_kprobes_optimization_handler(const struct ctl_table *table,
optimize_all_kprobes();
else
unoptimize_all_kprobes();
- mutex_unlock(&kprobe_sysctl_mutex);
return ret;
}
-static struct ctl_table kprobe_sysctls[] = {
+static const struct ctl_table kprobe_sysctls[] = {
{
.procname = "kprobes-optimization",
.data = &sysctl_kprobes_optimization,
@@ -1024,7 +1016,8 @@ static void __disarm_kprobe(struct kprobe *p, bool reopt)
#define __arm_kprobe(p) arch_arm_kprobe(p)
#define __disarm_kprobe(p, o) arch_disarm_kprobe(p)
#define kprobe_disarmed(p) kprobe_disabled(p)
-#define wait_for_kprobe_optimizer() do {} while (0)
+#define wait_for_kprobe_optimizer_locked() \
+ lockdep_assert_held(&kprobe_mutex)
static int reuse_unused_kprobe(struct kprobe *ap)
{
@@ -1078,20 +1071,18 @@ static int __arm_kprobe_ftrace(struct kprobe *p, struct ftrace_ops *ops,
if (*cnt == 0) {
ret = register_ftrace_function(ops);
- if (WARN(ret < 0, "Failed to register kprobe-ftrace (error %d)\n", ret))
- goto err_ftrace;
+ if (WARN(ret < 0, "Failed to register kprobe-ftrace (error %d)\n", ret)) {
+ /*
+ * At this point, since ops is not registered, we should be safe from
+ * registering an empty filter.
+ */
+ ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0);
+ return ret;
+ }
}
(*cnt)++;
return ret;
-
-err_ftrace:
- /*
- * At this point, sinec ops is not registered, we should be sefe from
- * registering empty filter.
- */
- ftrace_set_filter_ip(ops, (unsigned long)p->addr, 1, 0);
- return ret;
}
static int arm_kprobe_ftrace(struct kprobe *p)
@@ -1163,12 +1154,9 @@ static int arm_kprobe(struct kprobe *kp)
if (unlikely(kprobe_ftrace(kp)))
return arm_kprobe_ftrace(kp);
- cpus_read_lock();
- mutex_lock(&text_mutex);
+ guard(cpus_read_lock)();
+ guard(mutex)(&text_mutex);
__arm_kprobe(kp);
- mutex_unlock(&text_mutex);
- cpus_read_unlock();
-
return 0;
}
@@ -1177,12 +1165,9 @@ static int disarm_kprobe(struct kprobe *kp, bool reopt)
if (unlikely(kprobe_ftrace(kp)))
return disarm_kprobe_ftrace(kp);
- cpus_read_lock();
- mutex_lock(&text_mutex);
+ guard(cpus_read_lock)();
+ guard(mutex)(&text_mutex);
__disarm_kprobe(kp, reopt);
- mutex_unlock(&text_mutex);
- cpus_read_unlock();
-
return 0;
}
@@ -1299,62 +1284,55 @@ static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
int ret = 0;
struct kprobe *ap = orig_p;
- cpus_read_lock();
-
- /* For preparing optimization, jump_label_text_reserved() is called */
- jump_label_lock();
- mutex_lock(&text_mutex);
-
- if (!kprobe_aggrprobe(orig_p)) {
- /* If 'orig_p' is not an 'aggr_kprobe', create new one. */
- ap = alloc_aggr_kprobe(orig_p);
- if (!ap) {
- ret = -ENOMEM;
- goto out;
+ scoped_guard(cpus_read_lock) {
+ /* For preparing optimization, jump_label_text_reserved() is called */
+ guard(jump_label_lock)();
+ guard(mutex)(&text_mutex);
+
+ if (!kprobe_aggrprobe(orig_p)) {
+ /* If 'orig_p' is not an 'aggr_kprobe', create new one. */
+ ap = alloc_aggr_kprobe(orig_p);
+ if (!ap)
+ return -ENOMEM;
+ init_aggr_kprobe(ap, orig_p);
+ } else if (kprobe_unused(ap)) {
+ /* This probe is going to die. Rescue it */
+ ret = reuse_unused_kprobe(ap);
+ if (ret)
+ return ret;
}
- init_aggr_kprobe(ap, orig_p);
- } else if (kprobe_unused(ap)) {
- /* This probe is going to die. Rescue it */
- ret = reuse_unused_kprobe(ap);
- if (ret)
- goto out;
- }
- if (kprobe_gone(ap)) {
- /*
- * Attempting to insert new probe at the same location that
- * had a probe in the module vaddr area which already
- * freed. So, the instruction slot has already been
- * released. We need a new slot for the new probe.
- */
- ret = arch_prepare_kprobe(ap);
- if (ret)
+ if (kprobe_gone(ap)) {
/*
- * Even if fail to allocate new slot, don't need to
- * free the 'ap'. It will be used next time, or
- * freed by unregister_kprobe().
+ * Attempting to insert new probe at the same location that
+ * had a probe in the module vaddr area which already
+ * freed. So, the instruction slot has already been
+ * released. We need a new slot for the new probe.
*/
- goto out;
-
- /* Prepare optimized instructions if possible. */
- prepare_optimized_kprobe(ap);
+ ret = arch_prepare_kprobe(ap);
+ if (ret)
+ /*
+ * Even if fail to allocate new slot, don't need to
+ * free the 'ap'. It will be used next time, or
+ * freed by unregister_kprobe().
+ */
+ return ret;
- /*
- * Clear gone flag to prevent allocating new slot again, and
- * set disabled flag because it is not armed yet.
- */
- ap->flags = (ap->flags & ~KPROBE_FLAG_GONE)
- | KPROBE_FLAG_DISABLED;
- }
+ /* Prepare optimized instructions if possible. */
+ prepare_optimized_kprobe(ap);
- /* Copy the insn slot of 'p' to 'ap'. */
- copy_kprobe(ap, p);
- ret = add_new_kprobe(ap, p);
+ /*
+ * Clear gone flag to prevent allocating new slot again, and
+ * set disabled flag because it is not armed yet.
+ */
+ ap->flags = (ap->flags & ~KPROBE_FLAG_GONE)
+ | KPROBE_FLAG_DISABLED;
+ }
-out:
- mutex_unlock(&text_mutex);
- jump_label_unlock();
- cpus_read_unlock();
+ /* Copy the insn slot of 'p' to 'ap'. */
+ copy_kprobe(ap, p);
+ ret = add_new_kprobe(ap, p);
+ }
if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) {
ap->flags &= ~KPROBE_FLAG_DISABLED;
@@ -1448,7 +1426,7 @@ _kprobe_addr(kprobe_opcode_t *addr, const char *symbol_name,
unsigned long offset, bool *on_func_entry)
{
if ((symbol_name && addr) || (!symbol_name && !addr))
- goto invalid;
+ return ERR_PTR(-EINVAL);
if (symbol_name) {
/*
@@ -1478,16 +1456,16 @@ _kprobe_addr(kprobe_opcode_t *addr, const char *symbol_name,
* at the start of the function.
*/
addr = arch_adjust_kprobe_addr((unsigned long)addr, offset, on_func_entry);
- if (addr)
- return addr;
+ if (!addr)
+ return ERR_PTR(-EINVAL);
-invalid:
- return ERR_PTR(-EINVAL);
+ return addr;
}
static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
{
bool on_func_entry;
+
return _kprobe_addr(p->addr, p->symbol_name, p->offset, &on_func_entry);
}
@@ -1505,15 +1483,15 @@ static struct kprobe *__get_valid_kprobe(struct kprobe *p)
if (unlikely(!ap))
return NULL;
- if (p != ap) {
- list_for_each_entry(list_p, &ap->list, list)
- if (list_p == p)
- /* kprobe p is a valid probe */
- goto valid;
- return NULL;
- }
-valid:
- return ap;
+ if (p == ap)
+ return ap;
+
+ list_for_each_entry(list_p, &ap->list, list)
+ if (list_p == p)
+ /* kprobe p is a valid probe */
+ return ap;
+
+ return NULL;
}
/*
@@ -1522,14 +1500,12 @@ valid:
*/
static inline int warn_kprobe_rereg(struct kprobe *p)
{
- int ret = 0;
+ guard(mutex)(&kprobe_mutex);
- mutex_lock(&kprobe_mutex);
if (WARN_ON_ONCE(__get_valid_kprobe(p)))
- ret = -EINVAL;
- mutex_unlock(&kprobe_mutex);
+ return -EINVAL;
- return ret;
+ return 0;
}
static int check_ftrace_location(struct kprobe *p)
@@ -1565,17 +1541,23 @@ static int check_kprobe_address_safe(struct kprobe *p,
ret = check_ftrace_location(p);
if (ret)
return ret;
- jump_label_lock();
- preempt_disable();
+
+ guard(jump_label_lock)();
/* Ensure the address is in a text area, and find a module if exists. */
*probed_mod = NULL;
if (!core_kernel_text((unsigned long) p->addr)) {
+ guard(preempt)();
*probed_mod = __module_text_address((unsigned long) p->addr);
- if (!(*probed_mod)) {
- ret = -EINVAL;
- goto out;
- }
+ if (!(*probed_mod))
+ return -EINVAL;
+
+ /*
+ * We must hold a refcount of the probed module while updating
+ * its code to prohibit unexpected unloading.
+ */
+ if (unlikely(!try_module_get(*probed_mod)))
+ return -ENOENT;
}
/* Ensure it is not in reserved area. */
if (in_gate_area_no_mm((unsigned long) p->addr) ||
@@ -1584,49 +1566,71 @@ static int check_kprobe_address_safe(struct kprobe *p,
static_call_text_reserved(p->addr, p->addr) ||
find_bug((unsigned long)p->addr) ||
is_cfi_preamble_symbol((unsigned long)p->addr)) {
- ret = -EINVAL;
- goto out;
+ module_put(*probed_mod);
+ return -EINVAL;
}
/* Get module refcount and reject __init functions for loaded modules. */
if (IS_ENABLED(CONFIG_MODULES) && *probed_mod) {
/*
- * We must hold a refcount of the probed module while updating
- * its code to prohibit unexpected unloading.
- */
- if (unlikely(!try_module_get(*probed_mod))) {
- ret = -ENOENT;
- goto out;
- }
-
- /*
* If the module freed '.init.text', we couldn't insert
* kprobes in there.
*/
if (within_module_init((unsigned long)p->addr, *probed_mod) &&
!module_is_coming(*probed_mod)) {
module_put(*probed_mod);
- *probed_mod = NULL;
- ret = -ENOENT;
+ return -ENOENT;
}
}
-out:
- preempt_enable();
- jump_label_unlock();
+ return 0;
+}
- return ret;
+static int __register_kprobe(struct kprobe *p)
+{
+ int ret;
+ struct kprobe *old_p;
+
+ guard(mutex)(&kprobe_mutex);
+
+ old_p = get_kprobe(p->addr);
+ if (old_p)
+ /* Since this may unoptimize 'old_p', locking 'text_mutex'. */
+ return register_aggr_kprobe(old_p, p);
+
+ scoped_guard(cpus_read_lock) {
+ /* Prevent text modification */
+ guard(mutex)(&text_mutex);
+ ret = prepare_kprobe(p);
+ if (ret)
+ return ret;
+ }
+
+ INIT_HLIST_NODE(&p->hlist);
+ hlist_add_head_rcu(&p->hlist,
+ &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
+
+ if (!kprobes_all_disarmed && !kprobe_disabled(p)) {
+ ret = arm_kprobe(p);
+ if (ret) {
+ hlist_del_rcu(&p->hlist);
+ synchronize_rcu();
+ }
+ }
+
+ /* Try to optimize kprobe */
+ try_to_optimize_kprobe(p);
+ return 0;
}
int register_kprobe(struct kprobe *p)
{
int ret;
- struct kprobe *old_p;
struct module *probed_mod;
kprobe_opcode_t *addr;
bool on_func_entry;
- /* Adjust probe address from symbol */
+ /* Canonicalize probe address from symbol */
addr = _kprobe_addr(p->addr, p->symbol_name, p->offset, &on_func_entry);
if (IS_ERR(addr))
return PTR_ERR(addr);
@@ -1638,6 +1642,8 @@ int register_kprobe(struct kprobe *p)
/* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
p->flags &= KPROBE_FLAG_DISABLED;
+ if (on_func_entry)
+ p->flags |= KPROBE_FLAG_ON_FUNC_ENTRY;
p->nmissed = 0;
INIT_LIST_HEAD(&p->list);
@@ -1645,44 +1651,7 @@ int register_kprobe(struct kprobe *p)
if (ret)
return ret;
- mutex_lock(&kprobe_mutex);
-
- if (on_func_entry)
- p->flags |= KPROBE_FLAG_ON_FUNC_ENTRY;
-
- old_p = get_kprobe(p->addr);
- if (old_p) {
- /* Since this may unoptimize 'old_p', locking 'text_mutex'. */
- ret = register_aggr_kprobe(old_p, p);
- goto out;
- }
-
- cpus_read_lock();
- /* Prevent text modification */
- mutex_lock(&text_mutex);
- ret = prepare_kprobe(p);
- mutex_unlock(&text_mutex);
- cpus_read_unlock();
- if (ret)
- goto out;
-
- INIT_HLIST_NODE(&p->hlist);
- hlist_add_head_rcu(&p->hlist,
- &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
-
- if (!kprobes_all_disarmed && !kprobe_disabled(p)) {
- ret = arm_kprobe(p);
- if (ret) {
- hlist_del_rcu(&p->hlist);
- synchronize_rcu();
- goto out;
- }
- }
-
- /* Try to optimize kprobe */
- try_to_optimize_kprobe(p);
-out:
- mutex_unlock(&kprobe_mutex);
+ ret = __register_kprobe(p);
if (probed_mod)
module_put(probed_mod);
@@ -1761,29 +1730,31 @@ static int __unregister_kprobe_top(struct kprobe *p)
if (IS_ERR(ap))
return PTR_ERR(ap);
- if (ap == p)
- /*
- * This probe is an independent(and non-optimized) kprobe
- * (not an aggrprobe). Remove from the hash list.
- */
- goto disarmed;
+ WARN_ON(ap != p && !kprobe_aggrprobe(ap));
- /* Following process expects this probe is an aggrprobe */
- WARN_ON(!kprobe_aggrprobe(ap));
-
- if (list_is_singular(&ap->list) && kprobe_disarmed(ap))
+ /*
+ * If the probe is an independent (and non-optimized) kprobe
+ * (not an aggrprobe), the last kprobe on the aggrprobe, or the
+ * kprobe is already disarmed, just remove it from the hash list.
+ */
+ if (ap == p ||
+ (list_is_singular(&ap->list) && kprobe_disarmed(ap))) {
/*
* !disarmed could happen if the probe is under delayed
* unoptimizing.
*/
- goto disarmed;
- else {
- /* If disabling probe has special handlers, update aggrprobe */
- if (p->post_handler && !kprobe_gone(p)) {
- list_for_each_entry(list_p, &ap->list, list) {
- if ((list_p != p) && (list_p->post_handler))
- goto noclean;
- }
+ hlist_del_rcu(&ap->hlist);
+ return 0;
+ }
+
+ /* If disabling probe has special handlers, update aggrprobe */
+ if (p->post_handler && !kprobe_gone(p)) {
+ list_for_each_entry(list_p, &ap->list, list) {
+ if ((list_p != p) && (list_p->post_handler))
+ break;
+ }
+ /* No other probe has post_handler */
+ if (list_entry_is_head(list_p, &ap->list, list)) {
/*
* For the kprobe-on-ftrace case, we keep the
* post_handler setting to identify this aggrprobe
@@ -1792,24 +1763,21 @@ static int __unregister_kprobe_top(struct kprobe *p)
if (!kprobe_ftrace(ap))
ap->post_handler = NULL;
}
-noclean:
+ }
+
+ /*
+ * Remove from the aggrprobe: this path will do nothing in
+ * __unregister_kprobe_bottom().
+ */
+ list_del_rcu(&p->list);
+ if (!kprobe_disabled(ap) && !kprobes_all_disarmed)
/*
- * Remove from the aggrprobe: this path will do nothing in
- * __unregister_kprobe_bottom().
+ * Try to optimize this probe again, because post
+ * handler may have been changed.
*/
- list_del_rcu(&p->list);
- if (!kprobe_disabled(ap) && !kprobes_all_disarmed)
- /*
- * Try to optimize this probe again, because post
- * handler may have been changed.
- */
- optimize_kprobe(ap);
- }
+ optimize_kprobe(ap);
return 0;
-disarmed:
- hlist_del_rcu(&ap->hlist);
- return 0;
}
static void __unregister_kprobe_bottom(struct kprobe *p)
@@ -1858,12 +1826,11 @@ void unregister_kprobes(struct kprobe **kps, int num)
if (num <= 0)
return;
- mutex_lock(&kprobe_mutex);
- for (i = 0; i < num; i++)
- if (__unregister_kprobe_top(kps[i]) < 0)
- kps[i]->addr = NULL;
- mutex_unlock(&kprobe_mutex);
-
+ scoped_guard(mutex, &kprobe_mutex) {
+ for (i = 0; i < num; i++)
+ if (__unregister_kprobe_top(kps[i]) < 0)
+ kps[i]->addr = NULL;
+ }
synchronize_rcu();
for (i = 0; i < num; i++)
if (kps[i]->addr)
@@ -2302,8 +2269,9 @@ void unregister_kretprobes(struct kretprobe **rps, int num)
if (num <= 0)
return;
- mutex_lock(&kprobe_mutex);
for (i = 0; i < num; i++) {
+ guard(mutex)(&kprobe_mutex);
+
if (__unregister_kprobe_top(&rps[i]->kp) < 0)
rps[i]->kp.addr = NULL;
#ifdef CONFIG_KRETPROBE_ON_RETHOOK
@@ -2312,7 +2280,6 @@ void unregister_kretprobes(struct kretprobe **rps, int num)
rcu_assign_pointer(rps[i]->rph->rp, NULL);
#endif
}
- mutex_unlock(&kprobe_mutex);
synchronize_rcu();
for (i = 0; i < num; i++) {
@@ -2393,18 +2360,14 @@ static void kill_kprobe(struct kprobe *p)
/* Disable one kprobe */
int disable_kprobe(struct kprobe *kp)
{
- int ret = 0;
struct kprobe *p;
- mutex_lock(&kprobe_mutex);
+ guard(mutex)(&kprobe_mutex);
/* Disable this kprobe */
p = __disable_kprobe(kp);
- if (IS_ERR(p))
- ret = PTR_ERR(p);
- mutex_unlock(&kprobe_mutex);
- return ret;
+ return IS_ERR(p) ? PTR_ERR(p) : 0;
}
EXPORT_SYMBOL_GPL(disable_kprobe);
@@ -2414,20 +2377,16 @@ int enable_kprobe(struct kprobe *kp)
int ret = 0;
struct kprobe *p;
- mutex_lock(&kprobe_mutex);
+ guard(mutex)(&kprobe_mutex);
/* Check whether specified probe is valid. */
p = __get_valid_kprobe(kp);
- if (unlikely(p == NULL)) {
- ret = -EINVAL;
- goto out;
- }
+ if (unlikely(p == NULL))
+ return -EINVAL;
- if (kprobe_gone(kp)) {
+ if (kprobe_gone(kp))
/* This kprobe has gone, we couldn't enable it. */
- ret = -EINVAL;
- goto out;
- }
+ return -EINVAL;
if (p != kp)
kp->flags &= ~KPROBE_FLAG_DISABLED;
@@ -2441,8 +2400,6 @@ int enable_kprobe(struct kprobe *kp)
kp->flags |= KPROBE_FLAG_DISABLED;
}
}
-out:
- mutex_unlock(&kprobe_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(enable_kprobe);
@@ -2630,11 +2587,11 @@ static int kprobes_module_callback(struct notifier_block *nb,
unsigned int i;
int checkcore = (val == MODULE_STATE_GOING);
- if (val == MODULE_STATE_COMING) {
- mutex_lock(&kprobe_mutex);
+ guard(mutex)(&kprobe_mutex);
+
+ if (val == MODULE_STATE_COMING)
add_module_kprobe_blacklist(mod);
- mutex_unlock(&kprobe_mutex);
- }
+
if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE)
return NOTIFY_DONE;
@@ -2644,7 +2601,6 @@ static int kprobes_module_callback(struct notifier_block *nb,
* notified, only '.init.text' section would be freed. We need to
* disable kprobes which have been inserted in the sections.
*/
- mutex_lock(&kprobe_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
hlist_for_each_entry(p, head, hlist)
@@ -2667,7 +2623,6 @@ static int kprobes_module_callback(struct notifier_block *nb,
}
if (val == MODULE_STATE_GOING)
remove_module_kprobe_blacklist(mod);
- mutex_unlock(&kprobe_mutex);
return NOTIFY_DONE;
}
@@ -2695,7 +2650,7 @@ void kprobe_free_init_mem(void)
struct kprobe *p;
int i;
- mutex_lock(&kprobe_mutex);
+ guard(mutex)(&kprobe_mutex);
/* Kill all kprobes on initmem because the target code has been freed. */
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
@@ -2705,8 +2660,6 @@ void kprobe_free_init_mem(void)
kill_kprobe(p);
}
}
-
- mutex_unlock(&kprobe_mutex);
}
static int __init init_kprobes(void)
@@ -2902,11 +2855,11 @@ static int arm_all_kprobes(void)
unsigned int i, total = 0, errors = 0;
int err, ret = 0;
- mutex_lock(&kprobe_mutex);
+ guard(mutex)(&kprobe_mutex);
/* If kprobes are armed, just return */
if (!kprobes_all_disarmed)
- goto already_enabled;
+ return 0;
/*
* optimize_kprobe() called by arm_kprobe() checks
@@ -2936,8 +2889,6 @@ static int arm_all_kprobes(void)
else
pr_info("Kprobes globally enabled\n");
-already_enabled:
- mutex_unlock(&kprobe_mutex);
return ret;
}
@@ -2948,13 +2899,11 @@ static int disarm_all_kprobes(void)
unsigned int i, total = 0, errors = 0;
int err, ret = 0;
- mutex_lock(&kprobe_mutex);
+ guard(mutex)(&kprobe_mutex);
/* If kprobes are already disarmed, just return */
- if (kprobes_all_disarmed) {
- mutex_unlock(&kprobe_mutex);
+ if (kprobes_all_disarmed)
return 0;
- }
kprobes_all_disarmed = true;
@@ -2979,11 +2928,8 @@ static int disarm_all_kprobes(void)
else
pr_info("Kprobes globally disabled\n");
- mutex_unlock(&kprobe_mutex);
-
/* Wait for disarming all kprobes by optimizer */
- wait_for_kprobe_optimizer();
-
+ wait_for_kprobe_optimizer_locked();
return ret;
}
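
The kprobes conversions above rely on the lock guards from <linux/cleanup.h>. A minimal sketch of the pattern, with illustrative function and mutex names that are not part of this patch:

    #include <linux/cleanup.h>
    #include <linux/mutex.h>

    static DEFINE_MUTEX(demo_mutex);

    /* guard(): the mutex is dropped automatically on every return path. */
    static int demo_guard(int arg)
    {
            guard(mutex)(&demo_mutex);

            if (arg < 0)
                    return -EINVAL;         /* unlocked here as well */
            return 0;
    }

    /* scoped_guard(): the mutex is held only for the braced block. */
    static void demo_scoped_guard(void)
    {
            scoped_guard(mutex, &demo_mutex) {
                    /* critical section */
            }
            /* demo_mutex is released at this point */
    }

This is why disable_kprobe()/enable_kprobe() above can simply return from inside the locked region without an explicit mutex_unlock().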
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 1bab21b4718ff..eefb67d9883c2 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -239,21 +239,7 @@ extern const void __start_notes;
extern const void __stop_notes;
#define notes_size (&__stop_notes - &__start_notes)
-static ssize_t notes_read(struct file *filp, struct kobject *kobj,
- struct bin_attribute *bin_attr,
- char *buf, loff_t off, size_t count)
-{
- memcpy(buf, &__start_notes + off, count);
- return count;
-}
-
-static struct bin_attribute notes_attr __ro_after_init = {
- .attr = {
- .name = "notes",
- .mode = S_IRUGO,
- },
- .read = &notes_read,
-};
+static __ro_after_init BIN_ATTR_SIMPLE_RO(notes);
struct kobject *kernel_kobj;
EXPORT_SYMBOL_GPL(kernel_kobj);
@@ -307,8 +293,9 @@ static int __init ksysfs_init(void)
goto kset_exit;
if (notes_size > 0) {
- notes_attr.size = notes_size;
- error = sysfs_create_bin_file(kernel_kobj, &notes_attr);
+ bin_attr_notes.private = (void *)&__start_notes;
+ bin_attr_notes.size = notes_size;
+ error = sysfs_create_bin_file(kernel_kobj, &bin_attr_notes);
if (error)
goto group_exit;
}
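
For reference, BIN_ATTR_SIMPLE_RO() pairs the attribute with sysfs_bin_attr_simple_read(), which serves the buffer pointed to by ->private. A simplified sketch of what that read helper boils down to (not part of this patch; the real helper lives in sysfs core, and its exact signature may differ):

    static ssize_t simple_read_sketch(struct file *file, struct kobject *kobj,
                                      const struct bin_attribute *battr,
                                      char *buf, loff_t off, size_t count)
    {
            /* sysfs has already clamped off/count against battr->size */
            memcpy(buf, battr->private + off, count);
            return count;
    }

which is exactly what the removed open-coded notes_read() did with &__start_notes.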
diff --git a/kernel/kthread.c b/kernel/kthread.c
index a5ac612b16092..5dc5b0d7238e8 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -35,6 +35,9 @@ static DEFINE_SPINLOCK(kthread_create_lock);
static LIST_HEAD(kthread_create_list);
struct task_struct *kthreadd_task;
+static LIST_HEAD(kthreads_hotplug);
+static DEFINE_MUTEX(kthreads_hotplug_lock);
+
struct kthread_create_info
{
/* Information passed to kthread() from kthreadd. */
@@ -53,6 +56,8 @@ struct kthread_create_info
struct kthread {
unsigned long flags;
unsigned int cpu;
+ unsigned int node;
+ int started;
int result;
int (*threadfn)(void *);
void *data;
@@ -63,6 +68,9 @@ struct kthread {
#endif
/* To store the full name if task comm is truncated. */
char *full_name;
+ struct task_struct *task;
+ struct list_head hotplug_node;
+ struct cpumask *preferred_affinity;
};
enum KTHREAD_BITS {
@@ -121,8 +129,11 @@ bool set_kthread_struct(struct task_struct *p)
init_completion(&kthread->exited);
init_completion(&kthread->parked);
+ INIT_LIST_HEAD(&kthread->hotplug_node);
p->vfork_done = &kthread->exited;
+ kthread->task = p;
+ kthread->node = tsk_fork_get_node(current);
p->worker_private = kthread;
return true;
}
@@ -313,6 +324,16 @@ void __noreturn kthread_exit(long result)
{
struct kthread *kthread = to_kthread(current);
kthread->result = result;
+ if (!list_empty(&kthread->hotplug_node)) {
+ mutex_lock(&kthreads_hotplug_lock);
+ list_del(&kthread->hotplug_node);
+ mutex_unlock(&kthreads_hotplug_lock);
+
+ if (kthread->preferred_affinity) {
+ kfree(kthread->preferred_affinity);
+ kthread->preferred_affinity = NULL;
+ }
+ }
do_exit(0);
}
EXPORT_SYMBOL(kthread_exit);
@@ -338,6 +359,56 @@ void __noreturn kthread_complete_and_exit(struct completion *comp, long code)
}
EXPORT_SYMBOL(kthread_complete_and_exit);
+static void kthread_fetch_affinity(struct kthread *kthread, struct cpumask *cpumask)
+{
+ const struct cpumask *pref;
+
+ if (kthread->preferred_affinity) {
+ pref = kthread->preferred_affinity;
+ } else {
+ if (WARN_ON_ONCE(kthread->node == NUMA_NO_NODE))
+ return;
+ pref = cpumask_of_node(kthread->node);
+ }
+
+ cpumask_and(cpumask, pref, housekeeping_cpumask(HK_TYPE_KTHREAD));
+ if (cpumask_empty(cpumask))
+ cpumask_copy(cpumask, housekeeping_cpumask(HK_TYPE_KTHREAD));
+}
+
+static void kthread_affine_node(void)
+{
+ struct kthread *kthread = to_kthread(current);
+ cpumask_var_t affinity;
+
+ WARN_ON_ONCE(kthread_is_per_cpu(current));
+
+ if (kthread->node == NUMA_NO_NODE) {
+ housekeeping_affine(current, HK_TYPE_KTHREAD);
+ } else {
+ if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) {
+ WARN_ON_ONCE(1);
+ return;
+ }
+
+ mutex_lock(&kthreads_hotplug_lock);
+ WARN_ON_ONCE(!list_empty(&kthread->hotplug_node));
+ list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
+ /*
+ * The node cpumask is racy when read from kthread() but:
+ * - a racing CPU going down will either fail on the subsequent
+ * call to set_cpus_allowed_ptr() or be migrated to housekeepers
+ * afterwards by the scheduler.
+ * - a racing CPU going up will be handled by kthreads_online_cpu()
+ */
+ kthread_fetch_affinity(kthread, affinity);
+ set_cpus_allowed_ptr(current, affinity);
+ mutex_unlock(&kthreads_hotplug_lock);
+
+ free_cpumask_var(affinity);
+ }
+}
+
static int kthread(void *_create)
{
static const struct sched_param param = { .sched_priority = 0 };
@@ -368,7 +439,6 @@ static int kthread(void *_create)
* back to default in case they have been changed.
*/
sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);
- set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD));
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_UNINTERRUPTIBLE);
@@ -382,6 +452,11 @@ static int kthread(void *_create)
schedule_preempt_disabled();
preempt_enable();
+ self->started = 1;
+
+ if (!(current->flags & PF_NO_SETAFFINITY) && !self->preferred_affinity)
+ kthread_affine_node();
+
ret = -EINTR;
if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) {
cgroup_kthread_ready();
@@ -540,7 +615,9 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int
void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
{
+ struct kthread *kthread = to_kthread(p);
__kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE);
+ WARN_ON_ONCE(kthread->started);
}
/**
@@ -554,7 +631,9 @@ void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
*/
void kthread_bind(struct task_struct *p, unsigned int cpu)
{
+ struct kthread *kthread = to_kthread(p);
__kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE);
+ WARN_ON_ONCE(kthread->started);
}
EXPORT_SYMBOL(kthread_bind);
@@ -738,10 +817,11 @@ EXPORT_SYMBOL(kthread_stop_put);
int kthreadd(void *unused)
{
+ static const char comm[TASK_COMM_LEN] = "kthreadd";
struct task_struct *tsk = current;
/* Setup a clean context for our children to inherit. */
- set_task_comm(tsk, "kthreadd");
+ set_task_comm(tsk, comm);
ignore_signals(tsk);
set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD));
set_mems_allowed(node_states[N_MEMORY]);
@@ -774,6 +854,92 @@ int kthreadd(void *unused)
return 0;
}
+int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask)
+{
+ struct kthread *kthread = to_kthread(p);
+ cpumask_var_t affinity;
+ unsigned long flags;
+ int ret = 0;
+
+ if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ WARN_ON_ONCE(kthread->preferred_affinity);
+
+ if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
+ return -ENOMEM;
+
+ kthread->preferred_affinity = kzalloc(sizeof(struct cpumask), GFP_KERNEL);
+ if (!kthread->preferred_affinity) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ mutex_lock(&kthreads_hotplug_lock);
+ cpumask_copy(kthread->preferred_affinity, mask);
+ WARN_ON_ONCE(!list_empty(&kthread->hotplug_node));
+ list_add_tail(&kthread->hotplug_node, &kthreads_hotplug);
+ kthread_fetch_affinity(kthread, affinity);
+
+ /* It's safe because the task is inactive. */
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ do_set_cpus_allowed(p, affinity);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+ mutex_unlock(&kthreads_hotplug_lock);
+out:
+ free_cpumask_var(affinity);
+
+ return ret;
+}
+
+/*
+ * Re-affine kthreads according to their preferences
+ * and the newly online CPU. The CPU down part is handled
+ * by select_fallback_rq() which by default re-affines to
+ * housekeepers from other nodes in case the preferred
+ * affinity doesn't apply anymore.
+ */
+static int kthreads_online_cpu(unsigned int cpu)
+{
+ cpumask_var_t affinity;
+ struct kthread *k;
+ int ret;
+
+ guard(mutex)(&kthreads_hotplug_lock);
+
+ if (list_empty(&kthreads_hotplug))
+ return 0;
+
+ if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
+ return -ENOMEM;
+
+ ret = 0;
+
+ list_for_each_entry(k, &kthreads_hotplug, hotplug_node) {
+ if (WARN_ON_ONCE((k->task->flags & PF_NO_SETAFFINITY) ||
+ kthread_is_per_cpu(k->task))) {
+ ret = -EINVAL;
+ continue;
+ }
+ kthread_fetch_affinity(k, affinity);
+ set_cpus_allowed_ptr(k->task, affinity);
+ }
+
+ free_cpumask_var(affinity);
+
+ return ret;
+}
+
+static int kthreads_init(void)
+{
+ return cpuhp_setup_state(CPUHP_AP_KTHREADS_ONLINE, "kthreads:online",
+ kthreads_online_cpu, NULL);
+}
+early_initcall(kthreads_init);
+
void __kthread_init_worker(struct kthread_worker *worker,
const char *name,
struct lock_class_key *key)
@@ -865,12 +1031,11 @@ repeat:
EXPORT_SYMBOL_GPL(kthread_worker_fn);
static __printf(3, 0) struct kthread_worker *
-__kthread_create_worker(int cpu, unsigned int flags,
- const char namefmt[], va_list args)
+__kthread_create_worker_on_node(unsigned int flags, int node,
+ const char namefmt[], va_list args)
{
struct kthread_worker *worker;
struct task_struct *task;
- int node = NUMA_NO_NODE;
worker = kzalloc(sizeof(*worker), GFP_KERNEL);
if (!worker)
@@ -878,20 +1043,14 @@ __kthread_create_worker(int cpu, unsigned int flags,
kthread_init_worker(worker);
- if (cpu >= 0)
- node = cpu_to_node(cpu);
-
task = __kthread_create_on_node(kthread_worker_fn, worker,
- node, namefmt, args);
+ node, namefmt, args);
if (IS_ERR(task))
goto fail_task;
- if (cpu >= 0)
- kthread_bind(task, cpu);
-
worker->flags = flags;
worker->task = task;
- wake_up_process(task);
+
return worker;
fail_task:
@@ -900,8 +1059,9 @@ fail_task:
}
/**
- * kthread_create_worker - create a kthread worker
+ * kthread_create_worker_on_node - create a kthread worker
* @flags: flags modifying the default behavior of the worker
+ * @node: NUMA node on which the task structure for the worker is allocated
* @namefmt: printf-style name for the kthread worker (task).
*
* Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM)
@@ -909,25 +1069,26 @@ fail_task:
* when the caller was killed by a fatal signal.
*/
struct kthread_worker *
-kthread_create_worker(unsigned int flags, const char namefmt[], ...)
+kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[], ...)
{
struct kthread_worker *worker;
va_list args;
va_start(args, namefmt);
- worker = __kthread_create_worker(-1, flags, namefmt, args);
+ worker = __kthread_create_worker_on_node(flags, node, namefmt, args);
va_end(args);
return worker;
}
-EXPORT_SYMBOL(kthread_create_worker);
+EXPORT_SYMBOL(kthread_create_worker_on_node);
/**
* kthread_create_worker_on_cpu - create a kthread worker and bind it
* to a given CPU and the associated NUMA node.
* @cpu: CPU number
* @flags: flags modifying the default behavior of the worker
- * @namefmt: printf-style name for the kthread worker (task).
+ * @namefmt: printf-style name for the thread. Format is restricted
+ * to "name.*%u". Code fills in cpu number.
*
* Use a valid CPU number if you want to bind the kthread worker
* to the given CPU and the associated NUMA node.
@@ -959,14 +1120,13 @@ EXPORT_SYMBOL(kthread_create_worker);
*/
struct kthread_worker *
kthread_create_worker_on_cpu(int cpu, unsigned int flags,
- const char namefmt[], ...)
+ const char namefmt[])
{
struct kthread_worker *worker;
- va_list args;
- va_start(args, namefmt);
- worker = __kthread_create_worker(cpu, flags, namefmt, args);
- va_end(args);
+ worker = kthread_create_worker_on_node(flags, cpu_to_node(cpu), namefmt, cpu);
+ if (!IS_ERR(worker))
+ kthread_bind(worker->task, cpu);
return worker;
}
@@ -1015,7 +1175,7 @@ static void kthread_insert_work(struct kthread_worker *worker,
* @work: kthread_work to queue
*
* Queue @work to work processor @task for async execution. @task
- * must have been created with kthread_worker_create(). Returns %true
+ * must have been created with kthread_create_worker(). Returns %true
* if @work was successfully queued, %false if it was already pending.
*
* Reinitialize the work if it needs to be used by another worker.
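
A sketch of how a caller would be expected to use the new kthread_affine_preferred() API: the preference must be set while the freshly created kthread is still inactive (before its first wakeup), matching the wait_task_inactive()/started check above. Everything except the kthread_*() calls is illustrative:

    #include <linux/kthread.h>
    #include <linux/cpumask.h>
    #include <linux/sched.h>

    static int demo_thread_fn(void *data)
    {
            while (!kthread_should_stop())
                    schedule_timeout_interruptible(HZ);
            return 0;
    }

    static struct task_struct *demo_start(const struct cpumask *preferred)
    {
            struct task_struct *t;

            t = kthread_create(demo_thread_fn, NULL, "demo_worker");
            if (IS_ERR(t))
                    return t;

            /* Must be done before the thread is first woken up. */
            if (kthread_affine_preferred(t, preferred))
                    pr_warn("demo_worker: keeping default affinity\n");

            wake_up_process(t);
            return t;
    }

From then on, kthreads_online_cpu() re-applies the preferred mask (intersected with the housekeeping CPUs) whenever a CPU comes online.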
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 7a75eab9c1799..d4281d1e13a63 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -77,7 +77,7 @@ static int sysctl_latencytop(const struct ctl_table *table, int write, void *buf
return err;
}
-static struct ctl_table latencytop_sysctl[] = {
+static const struct ctl_table latencytop_sysctl[] = {
{
.procname = "latencytop",
.data = &latencytop_enabled,
@@ -158,9 +158,9 @@ account_global_scheduler_latency(struct task_struct *tsk,
/**
* __account_scheduler_latency - record an occurred latency
- * @tsk - the task struct of the task hitting the latency
- * @usecs - the duration of the latency in microseconds
- * @inter - 1 if the sleep was interruptible, 0 if uninterruptible
+ * @tsk: the task struct of the task hitting the latency
+ * @usecs: the duration of the latency in microseconds
+ * @inter: 1 if the sleep was interruptible, 0 if uninterruptible
*
* This function is the main entry point for recording latency entries
* as called by the scheduler.
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index 3c21c31796db0..0cd39954d5a10 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -347,6 +347,7 @@ int klp_apply_section_relocs(struct module *pmod, Elf_Shdr *sechdrs,
* /sys/kernel/livepatch/<patch>/transition
* /sys/kernel/livepatch/<patch>/force
* /sys/kernel/livepatch/<patch>/replace
+ * /sys/kernel/livepatch/<patch>/stack_order
* /sys/kernel/livepatch/<patch>/<object>
* /sys/kernel/livepatch/<patch>/<object>/patched
* /sys/kernel/livepatch/<patch>/<object>/<function,sympos>
@@ -452,15 +453,38 @@ static ssize_t replace_show(struct kobject *kobj,
return sysfs_emit(buf, "%d\n", patch->replace);
}
+static ssize_t stack_order_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct klp_patch *patch, *this_patch;
+ int stack_order = 0;
+
+ this_patch = container_of(kobj, struct klp_patch, kobj);
+
+ mutex_lock(&klp_mutex);
+
+ klp_for_each_patch(patch) {
+ stack_order++;
+ if (patch == this_patch)
+ break;
+ }
+
+ mutex_unlock(&klp_mutex);
+
+ return sysfs_emit(buf, "%d\n", stack_order);
+}
+
static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled);
static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition);
static struct kobj_attribute force_kobj_attr = __ATTR_WO(force);
static struct kobj_attribute replace_kobj_attr = __ATTR_RO(replace);
+static struct kobj_attribute stack_order_kobj_attr = __ATTR_RO(stack_order);
static struct attribute *klp_patch_attrs[] = {
&enabled_kobj_attr.attr,
&transition_kobj_attr.attr,
&force_kobj_attr.attr,
&replace_kobj_attr.attr,
+ &stack_order_kobj_attr.attr,
NULL
};
ATTRIBUTE_GROUPS(klp_patch);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 2d8ec0351ef9b..4470680f02269 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -79,7 +79,7 @@ module_param(lock_stat, int, 0644);
#endif
#ifdef CONFIG_SYSCTL
-static struct ctl_table kern_lockdep_table[] = {
+static const struct ctl_table kern_lockdep_table[] = {
#ifdef CONFIG_PROVE_LOCKING
{
.procname = "prove_locking",
@@ -157,10 +157,12 @@ static inline void lockdep_unlock(void)
__this_cpu_dec(lockdep_recursion);
}
+#ifdef CONFIG_PROVE_LOCKING
static inline bool lockdep_assert_locked(void)
{
return DEBUG_LOCKS_WARN_ON(__owner != current);
}
+#endif
static struct task_struct *lockdep_selftest_task_struct;
@@ -430,7 +432,7 @@ static inline u16 hlock_id(struct held_lock *hlock)
return (hlock->class_idx | (hlock->read << MAX_LOCKDEP_KEYS_BITS));
}
-static inline unsigned int chain_hlock_class_idx(u16 hlock_id)
+static inline __maybe_unused unsigned int chain_hlock_class_idx(u16 hlock_id)
{
return hlock_id & (MAX_LOCKDEP_KEYS - 1);
}
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index bbe9000260d02..20f9ef58d3d06 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -119,7 +119,8 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ =
#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
-#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
+#define AVG_LOCKDEP_CHAIN_DEPTH 5
+#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS * AVG_LOCKDEP_CHAIN_DEPTH)
extern struct lock_chain lock_chains[];
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index de95ec07e4771..cc33470f4de97 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -106,7 +106,7 @@ static const struct kernel_param_ops lt_bind_ops = {
module_param_cb(bind_readers, &lt_bind_ops, &bind_readers, 0644);
module_param_cb(bind_writers, &lt_bind_ops, &bind_writers, 0644);
-long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask);
+long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask, bool dowarn);
static struct task_struct *stats_task;
static struct task_struct **writer_tasks;
@@ -1358,7 +1358,7 @@ static int __init lock_torture_init(void)
if (torture_init_error(firsterr))
goto unwind;
if (cpumask_nonempty(bind_writers))
- torture_sched_setaffinity(writer_tasks[i]->pid, bind_writers);
+ torture_sched_setaffinity(writer_tasks[i]->pid, bind_writers, true);
create_reader:
if (cxt.cur_ops->readlock == NULL || (j >= cxt.nrealreaders_stress))
@@ -1369,7 +1369,7 @@ static int __init lock_torture_init(void)
if (torture_init_error(firsterr))
goto unwind;
if (cpumask_nonempty(bind_readers))
- torture_sched_setaffinity(reader_tasks[j]->pid, bind_readers);
+ torture_sched_setaffinity(reader_tasks[j]->pid, bind_readers, true);
}
if (stat_interval > 0) {
firsterr = torture_create_kthread(lock_torture_stats, NULL,
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 3302e52f0c967..b36f23de48f1b 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -657,10 +657,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
goto err;
}
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- /* Make sure we do wakeups before calling schedule */
- wake_up_q(&wake_q);
- wake_q_init(&wake_q);
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
schedule_preempt_disabled();
@@ -710,8 +707,7 @@ skip_wait:
if (ww_ctx)
ww_mutex_lock_acquired(ww, ww_ctx);
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- wake_up_q(&wake_q);
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
preempt_enable();
return 0;
@@ -720,10 +716,9 @@ err:
__mutex_remove_waiter(lock, &waiter);
err_early_kill:
trace_contention_end(lock, ret);
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
debug_mutex_free_waiter(&waiter);
mutex_release(&lock->dep_map, ip);
- wake_up_q(&wake_q);
preempt_enable();
return ret;
}
@@ -935,10 +930,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne
if (owner & MUTEX_FLAG_HANDOFF)
__mutex_handoff(lock, next);
- preempt_disable();
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- wake_up_q(&wake_q);
- preempt_enable();
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
}
#ifndef CONFIG_DEBUG_LOCK_ALLOC
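
The hunks above (and the rtmutex.c ones below) replace the same open-coded unlock-then-wake sequence, so the new helper presumably looks roughly like the following. This is an assumption inferred from the removed code, not a definition taken from this patch:

    #include <linux/sched/wake_q.h>
    #include <linux/spinlock.h>

    static inline void raw_spin_unlock_irqrestore_wake(raw_spinlock_t *lock,
                                                       unsigned long flags,
                                                       struct wake_q_head *wake_q)
    {
            preempt_disable();
            raw_spin_unlock_irqrestore(lock, flags);
            wake_up_q(wake_q);
            /* re-init so callers that loop (e.g. __mutex_lock_common) can reuse it */
            wake_q_init(wake_q);
            preempt_enable();
    }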
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index e858de203eb6f..4a8df1800cbbd 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1292,7 +1292,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
*/
get_task_struct(owner);
- raw_spin_unlock_irq(&lock->wait_lock);
+ raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q);
res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
next_lock, waiter, task);
@@ -1596,6 +1596,7 @@ static void __sched remove_waiter(struct rt_mutex_base *lock,
* or TASK_UNINTERRUPTIBLE)
* @timeout: the pre-initialized and started timer, or NULL for none
* @waiter: the pre-initialized rt_mutex_waiter
+ * @wake_q: wake_q of tasks to wake when we drop the lock->wait_lock
*
* Must be called with lock->wait_lock held and interrupts disabled
*/
@@ -1603,7 +1604,8 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
struct ww_acquire_ctx *ww_ctx,
unsigned int state,
struct hrtimer_sleeper *timeout,
- struct rt_mutex_waiter *waiter)
+ struct rt_mutex_waiter *waiter,
+ struct wake_q_head *wake_q)
__releases(&lock->wait_lock) __acquires(&lock->wait_lock)
{
struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex);
@@ -1634,7 +1636,7 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock,
owner = rt_mutex_owner(lock);
else
owner = NULL;
- raw_spin_unlock_irq(&lock->wait_lock);
+ raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q);
if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
rt_mutex_schedule();
@@ -1708,7 +1710,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk, wake_q);
if (likely(!ret))
- ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter);
+ ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter, wake_q);
if (likely(!ret)) {
/* acquired the lock */
@@ -1785,10 +1787,7 @@ static int __sched rt_mutex_slowlock(struct rt_mutex_base *lock,
*/
raw_spin_lock_irqsave(&lock->wait_lock, flags);
ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state, &wake_q);
- preempt_disable();
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- wake_up_q(&wake_q);
- preempt_enable();
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
rt_mutex_post_schedule();
return ret;
@@ -1846,11 +1845,7 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock,
owner = rt_mutex_owner(lock);
else
owner = NULL;
- preempt_disable();
- raw_spin_unlock_irq(&lock->wait_lock);
- wake_up_q(wake_q);
- wake_q_init(wake_q);
- preempt_enable();
+ raw_spin_unlock_irq_wake(&lock->wait_lock, wake_q);
if (!owner || !rtmutex_spin_on_owner(lock, &waiter, owner))
schedule_rtlock();
@@ -1879,10 +1874,7 @@ static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock)
raw_spin_lock_irqsave(&lock->wait_lock, flags);
rtlock_slowlock_locked(lock, &wake_q);
- preempt_disable();
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- wake_up_q(&wake_q);
- preempt_enable();
+ raw_spin_unlock_irqrestore_wake(&lock->wait_lock, flags, &wake_q);
}
#endif /* RT_MUTEX_BUILD_SPINLOCKS */
diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c
index 33ea31d6a7b3b..191e4720e5466 100644
--- a/kernel/locking/rtmutex_api.c
+++ b/kernel/locking/rtmutex_api.c
@@ -383,7 +383,7 @@ int __sched rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock,
raw_spin_lock_irq(&lock->wait_lock);
/* sleep on the mutex */
set_current_state(TASK_INTERRUPTIBLE);
- ret = rt_mutex_slowlock_block(lock, NULL, TASK_INTERRUPTIBLE, to, waiter);
+ ret = rt_mutex_slowlock_block(lock, NULL, TASK_INTERRUPTIBLE, to, waiter, NULL);
/*
* try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
* have to fix that up.
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 5d58b2c0ef98b..bcb1b9fea5880 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -404,7 +404,7 @@ static inline u32 prandom_u32_below(u32 ceil)
static int *get_random_order(int count)
{
int *order;
- int n, r, tmp;
+ int n, r;
order = kmalloc_array(count, sizeof(*order), GFP_KERNEL);
if (!order)
@@ -415,11 +415,8 @@ static int *get_random_order(int count)
for (n = count - 1; n > 1; n--) {
r = prandom_u32_below(n + 1);
- if (r != n) {
- tmp = order[n];
- order[n] = order[r];
- order[r] = tmp;
- }
+ if (r != n)
+ swap(order[n], order[r]);
}
return order;
diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig
index 7b329057997ad..d7762ef5949a2 100644
--- a/kernel/module/Kconfig
+++ b/kernel/module/Kconfig
@@ -169,6 +169,36 @@ config MODVERSIONS
make them incompatible with the kernel you are running. If
unsure, say N.
+choice
+ prompt "Module versioning implementation"
+ depends on MODVERSIONS
+ help
+ Select the tool used to calculate symbol versions for modules.
+
+ If unsure, select GENKSYMS.
+
+config GENKSYMS
+ bool "genksyms (from source code)"
+ help
+ Calculate symbol versions from pre-processed source code using
+ genksyms.
+
+ If unsure, say Y.
+
+config GENDWARFKSYMS
+ bool "gendwarfksyms (from debugging information)"
+ depends on DEBUG_INFO
+ # Requires full debugging information, split DWARF not supported.
+ depends on !DEBUG_INFO_REDUCED && !DEBUG_INFO_SPLIT
+ # Requires ELF object files.
+ depends on !LTO
+ help
+ Calculate symbol versions from DWARF debugging information using
+ gendwarfksyms. Requires DEBUG_INFO to be enabled.
+
+ If unsure, say N.
+endchoice
+
config ASM_MODVERSIONS
bool
default HAVE_ASM_MODVERSIONS && MODVERSIONS
@@ -177,6 +207,31 @@ config ASM_MODVERSIONS
assembly. This can be enabled only when the target architecture
supports it.
+config EXTENDED_MODVERSIONS
+ bool "Extended Module Versioning Support"
+ depends on MODVERSIONS
+ help
+	  This enables extended MODVERSIONS support, allowing long symbol
+ names to be versioned.
+
+ The most likely reason you would enable this is to enable Rust
+ support. If unsure, say N.
+
+config BASIC_MODVERSIONS
+ bool "Basic Module Versioning Support"
+ depends on MODVERSIONS
+ default y
+ help
+ This enables basic MODVERSIONS support, allowing older tools or
+ kernels to potentially load modules.
+
+ Disabling this may cause older `modprobe` or `kmod` to be unable
+ to read MODVERSIONS information from built modules. With this
+	  disabled, older kernels may treat such modules as unversioned.
+
+ This is enabled by default when MODVERSIONS are enabled.
+ If unsure, say Y.
+
config MODULE_SRCVERSION_ALL
bool "Source checksum for all modules"
help
@@ -231,6 +286,7 @@ comment "Do not forget to sign required modules with scripts/sign-file"
choice
prompt "Hash algorithm to sign modules"
depends on MODULE_SIG || IMA_APPRAISE_MODSIG
+ default MODULE_SIG_SHA512
help
This determines which sort of hashing algorithm will be used during
signature generation. This algorithm _must_ be built into the kernel
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index daef2be839022..d09b46ef032f0 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -47,16 +47,16 @@ struct kernel_symbol {
extern struct mutex module_mutex;
extern struct list_head modules;
-extern struct module_attribute *modinfo_attrs[];
-extern size_t modinfo_attrs_count;
+extern const struct module_attribute *const modinfo_attrs[];
+extern const size_t modinfo_attrs_count;
/* Provided by the linker */
extern const struct kernel_symbol __start___ksymtab[];
extern const struct kernel_symbol __stop___ksymtab[];
extern const struct kernel_symbol __start___ksymtab_gpl[];
extern const struct kernel_symbol __stop___ksymtab_gpl[];
-extern const s32 __start___kcrctab[];
-extern const s32 __start___kcrctab_gpl[];
+extern const u32 __start___kcrctab[];
+extern const u32 __start___kcrctab_gpl[];
struct load_info {
const char *name;
@@ -86,6 +86,8 @@ struct load_info {
unsigned int vers;
unsigned int info;
unsigned int pcpu;
+ unsigned int vers_ext_crc;
+ unsigned int vers_ext_name;
} index;
};
@@ -102,7 +104,7 @@ struct find_symbol_arg {
/* Output */
struct module *owner;
- const s32 *crc;
+ const u32 *crc;
const struct kernel_symbol *sym;
enum mod_license license;
};
@@ -327,7 +329,8 @@ static inline struct module *mod_find(unsigned long addr, struct mod_tree_root *
}
#endif /* CONFIG_MODULES_TREE_LOOKUP */
-int module_enable_rodata_ro(const struct module *mod, bool after_init);
+int module_enable_rodata_ro(const struct module *mod);
+int module_enable_rodata_ro_after_init(const struct module *mod);
int module_enable_data_nx(const struct module *mod);
int module_enable_text_rox(const struct module *mod);
int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs,
@@ -384,16 +387,25 @@ static inline void init_param_lock(struct module *mod) { }
#ifdef CONFIG_MODVERSIONS
int check_version(const struct load_info *info,
- const char *symname, struct module *mod, const s32 *crc);
+ const char *symname, struct module *mod, const u32 *crc);
void module_layout(struct module *mod, struct modversion_info *ver, struct kernel_param *kp,
struct kernel_symbol *ks, struct tracepoint * const *tp);
int check_modstruct_version(const struct load_info *info, struct module *mod);
int same_magic(const char *amagic, const char *bmagic, bool has_crcs);
+struct modversion_info_ext {
+ size_t remaining;
+ const u32 *crc;
+ const char *name;
+};
+void modversion_ext_start(const struct load_info *info, struct modversion_info_ext *ver);
+void modversion_ext_advance(struct modversion_info_ext *ver);
+#define for_each_modversion_info_ext(ver, info) \
+ for (modversion_ext_start(info, &ver); ver.remaining > 0; modversion_ext_advance(&ver))
#else /* !CONFIG_MODVERSIONS */
static inline int check_version(const struct load_info *info,
const char *symname,
struct module *mod,
- const s32 *crc)
+ const u32 *crc)
{
return 1;
}
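
A short sketch of how the new extended-modversion iterator is meant to be consumed (mirroring its later use in check_version(); the symbol name here is illustrative):

    static bool demo_find_ext_crc(const struct load_info *info, u32 *crc_out)
    {
            struct modversion_info_ext ver;

            for_each_modversion_info_ext(ver, info) {
                    if (strcmp(ver.name, "demo_symbol") != 0)
                            continue;
                    *crc_out = *ver.crc;
                    return true;
            }
            return false;
    }

Each step advances a u32 CRC pointer and a NUL-separated name string in lockstep, which is why elf_validity_cache_index_versions() below insists that the two sections describe the same number of entries.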
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 5399c182b3cbe..1fb9ad289a6f8 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -86,7 +86,7 @@ struct mod_tree_root mod_tree __cacheline_aligned = {
struct symsearch {
const struct kernel_symbol *start, *stop;
- const s32 *crcs;
+ const u32 *crcs;
enum mod_license license;
};
@@ -538,7 +538,7 @@ static void setup_modinfo_##field(struct module *mod, const char *s) \
{ \
mod->field = kstrdup(s, GFP_KERNEL); \
} \
-static ssize_t show_modinfo_##field(struct module_attribute *mattr, \
+static ssize_t show_modinfo_##field(const struct module_attribute *mattr, \
struct module_kobject *mk, char *buffer) \
{ \
return scnprintf(buffer, PAGE_SIZE, "%s\n", mk->mod->field); \
@@ -552,7 +552,7 @@ static void free_modinfo_##field(struct module *mod) \
kfree(mod->field); \
mod->field = NULL; \
} \
-static struct module_attribute modinfo_##field = { \
+static const struct module_attribute modinfo_##field = { \
.attr = { .name = __stringify(field), .mode = 0444 }, \
.show = show_modinfo_##field, \
.setup = setup_modinfo_##field, \
@@ -842,13 +842,13 @@ void symbol_put_addr(void *addr)
}
EXPORT_SYMBOL_GPL(symbol_put_addr);
-static ssize_t show_refcnt(struct module_attribute *mattr,
+static ssize_t show_refcnt(const struct module_attribute *mattr,
struct module_kobject *mk, char *buffer)
{
return sprintf(buffer, "%i\n", module_refcount(mk->mod));
}
-static struct module_attribute modinfo_refcnt =
+static const struct module_attribute modinfo_refcnt =
__ATTR(refcnt, 0444, show_refcnt, NULL);
void __module_get(struct module *module)
@@ -917,7 +917,7 @@ size_t module_flags_taint(unsigned long taints, char *buf)
return l;
}
-static ssize_t show_initstate(struct module_attribute *mattr,
+static ssize_t show_initstate(const struct module_attribute *mattr,
struct module_kobject *mk, char *buffer)
{
const char *state = "unknown";
@@ -938,10 +938,10 @@ static ssize_t show_initstate(struct module_attribute *mattr,
return sprintf(buffer, "%s\n", state);
}
-static struct module_attribute modinfo_initstate =
+static const struct module_attribute modinfo_initstate =
__ATTR(initstate, 0444, show_initstate, NULL);
-static ssize_t store_uevent(struct module_attribute *mattr,
+static ssize_t store_uevent(const struct module_attribute *mattr,
struct module_kobject *mk,
const char *buffer, size_t count)
{
@@ -951,10 +951,10 @@ static ssize_t store_uevent(struct module_attribute *mattr,
return rc ? rc : count;
}
-struct module_attribute module_uevent =
+const struct module_attribute module_uevent =
__ATTR(uevent, 0200, NULL, store_uevent);
-static ssize_t show_coresize(struct module_attribute *mattr,
+static ssize_t show_coresize(const struct module_attribute *mattr,
struct module_kobject *mk, char *buffer)
{
unsigned int size = mk->mod->mem[MOD_TEXT].size;
@@ -966,11 +966,11 @@ static ssize_t show_coresize(struct module_attribute *mattr,
return sprintf(buffer, "%u\n", size);
}
-static struct module_attribute modinfo_coresize =
+static const struct module_attribute modinfo_coresize =
__ATTR(coresize, 0444, show_coresize, NULL);
#ifdef CONFIG_ARCH_WANTS_MODULES_DATA_IN_VMALLOC
-static ssize_t show_datasize(struct module_attribute *mattr,
+static ssize_t show_datasize(const struct module_attribute *mattr,
struct module_kobject *mk, char *buffer)
{
unsigned int size = 0;
@@ -980,11 +980,11 @@ static ssize_t show_datasize(struct module_attribute *mattr,
return sprintf(buffer, "%u\n", size);
}
-static struct module_attribute modinfo_datasize =
+static const struct module_attribute modinfo_datasize =
__ATTR(datasize, 0444, show_datasize, NULL);
#endif
-static ssize_t show_initsize(struct module_attribute *mattr,
+static ssize_t show_initsize(const struct module_attribute *mattr,
struct module_kobject *mk, char *buffer)
{
unsigned int size = 0;
@@ -994,10 +994,10 @@ static ssize_t show_initsize(struct module_attribute *mattr,
return sprintf(buffer, "%u\n", size);
}
-static struct module_attribute modinfo_initsize =
+static const struct module_attribute modinfo_initsize =
__ATTR(initsize, 0444, show_initsize, NULL);
-static ssize_t show_taint(struct module_attribute *mattr,
+static ssize_t show_taint(const struct module_attribute *mattr,
struct module_kobject *mk, char *buffer)
{
size_t l;
@@ -1007,10 +1007,10 @@ static ssize_t show_taint(struct module_attribute *mattr,
return l;
}
-static struct module_attribute modinfo_taint =
+static const struct module_attribute modinfo_taint =
__ATTR(taint, 0444, show_taint, NULL);
-struct module_attribute *modinfo_attrs[] = {
+const struct module_attribute *const modinfo_attrs[] = {
&module_uevent,
&modinfo_version,
&modinfo_srcversion,
@@ -1027,7 +1027,7 @@ struct module_attribute *modinfo_attrs[] = {
NULL,
};
-size_t modinfo_attrs_count = ARRAY_SIZE(modinfo_attrs);
+const size_t modinfo_attrs_count = ARRAY_SIZE(modinfo_attrs);
static const char vermagic[] = VERMAGIC_STRING;
@@ -1681,7 +1681,7 @@ static void module_license_taint_check(struct module *mod, const char *license)
static void setup_modinfo(struct module *mod, struct load_info *info)
{
- struct module_attribute *attr;
+ const struct module_attribute *attr;
int i;
for (i = 0; (attr = modinfo_attrs[i]); i++) {
@@ -1692,7 +1692,7 @@ static void setup_modinfo(struct module *mod, struct load_info *info)
static void free_modinfo(struct module *mod)
{
- struct module_attribute *attr;
+ const struct module_attribute *attr;
int i;
for (i = 0; (attr = modinfo_attrs[i]); i++) {
@@ -2074,6 +2074,82 @@ static int elf_validity_cache_index_str(struct load_info *info)
}
/**
+ * elf_validity_cache_index_versions() - Validate and cache version indices
+ * @info: Load info to cache version indices in.
+ * Must have &load_info->sechdrs and &load_info->secstrings populated.
+ * @flags: Load flags, relevant to suppress version loading, see
+ * uapi/linux/module.h
+ *
+ * If we're ignoring modversions based on @flags, zero all version indices
+ * and return validity. Othewrise check:
+ *
+ * * If "__version_ext_crcs" is present, "__version_ext_names" is present
+ * * There is a name present for every crc
+ *
+ * Then populate:
+ *
+ * * &load_info->index.vers
+ * * &load_info->index.vers_ext_crc
+ * * &load_info->index.vers_ext_name
+ *
+ * if present.
+ *
+ * Return: %0 if valid, %-ENOEXEC on failure.
+ */
+static int elf_validity_cache_index_versions(struct load_info *info, int flags)
+{
+ unsigned int vers_ext_crc;
+ unsigned int vers_ext_name;
+ size_t crc_count;
+ size_t remaining_len;
+ size_t name_size;
+ char *name;
+
+ /* If modversions were suppressed, pretend we didn't find any */
+ if (flags & MODULE_INIT_IGNORE_MODVERSIONS) {
+ info->index.vers = 0;
+ info->index.vers_ext_crc = 0;
+ info->index.vers_ext_name = 0;
+ return 0;
+ }
+
+ vers_ext_crc = find_sec(info, "__version_ext_crcs");
+ vers_ext_name = find_sec(info, "__version_ext_names");
+
+ /* If we have one field, we must have the other */
+ if (!!vers_ext_crc != !!vers_ext_name) {
+ pr_err("extended version crc+name presence does not match");
+ return -ENOEXEC;
+ }
+
+ /*
+ * If we have extended version information, we should have the same
+ * number of entries in every section.
+ */
+ if (vers_ext_crc) {
+ crc_count = info->sechdrs[vers_ext_crc].sh_size / sizeof(u32);
+ name = (void *)info->hdr +
+ info->sechdrs[vers_ext_name].sh_offset;
+ remaining_len = info->sechdrs[vers_ext_name].sh_size;
+
+ while (crc_count--) {
+ name_size = strnlen(name, remaining_len) + 1;
+ if (name_size > remaining_len) {
+ pr_err("more extended version crcs than names");
+ return -ENOEXEC;
+ }
+ remaining_len -= name_size;
+ name += name_size;
+ }
+ }
+
+ info->index.vers = find_sec(info, "__versions");
+ info->index.vers_ext_crc = vers_ext_crc;
+ info->index.vers_ext_name = vers_ext_name;
+ return 0;
+}
+
+/**
* elf_validity_cache_index() - Resolve, validate, cache section indices
* @info: Load info to read from and update.
* &load_info->sechdrs and &load_info->secstrings must be populated.
@@ -2087,9 +2163,7 @@ static int elf_validity_cache_index_str(struct load_info *info)
* * elf_validity_cache_index_mod()
* * elf_validity_cache_index_sym()
* * elf_validity_cache_index_str()
- *
- * If versioning is not suppressed via flags, load the version index from
- * a section called "__versions" with no validation.
+ * * elf_validity_cache_index_versions()
*
* If CONFIG_SMP is enabled, load the percpu section by name with no
* validation.
@@ -2112,11 +2186,9 @@ static int elf_validity_cache_index(struct load_info *info, int flags)
err = elf_validity_cache_index_str(info);
if (err < 0)
return err;
-
- if (flags & MODULE_INIT_IGNORE_MODVERSIONS)
- info->index.vers = 0; /* Pretend no __versions section! */
- else
- info->index.vers = find_sec(info, "__versions");
+ err = elf_validity_cache_index_versions(info, flags);
+ if (err < 0)
+ return err;
info->index.pcpu = find_pcpusec(info);
@@ -2327,16 +2399,29 @@ static int rewrite_section_headers(struct load_info *info, int flags)
/* Track but don't keep modinfo and version sections. */
info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
+ info->sechdrs[info->index.vers_ext_crc].sh_flags &=
+ ~(unsigned long)SHF_ALLOC;
+ info->sechdrs[info->index.vers_ext_name].sh_flags &=
+ ~(unsigned long)SHF_ALLOC;
info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
return 0;
}
+static const char *const module_license_offenders[] = {
+ /* driverloader was caught wrongly pretending to be under GPL */
+ "driverloader",
+
+ /* lve claims to be GPL but upstream won't provide source */
+ "lve",
+};
+
/*
 * These calls taint the kernel depending on certain module circumstances */
static void module_augment_kernel_taints(struct module *mod, struct load_info *info)
{
int prev_taint = test_taint(TAINT_PROPRIETARY_MODULE);
+ size_t i;
if (!get_modinfo(info, "intree")) {
if (!test_taint(TAINT_OOT_MODULE))
@@ -2385,15 +2470,11 @@ static void module_augment_kernel_taints(struct module *mod, struct load_info *i
if (strcmp(mod->name, "ndiswrapper") == 0)
add_taint(TAINT_PROPRIETARY_MODULE, LOCKDEP_NOW_UNRELIABLE);
- /* driverloader was caught wrongly pretending to be under GPL */
- if (strcmp(mod->name, "driverloader") == 0)
- add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
- LOCKDEP_NOW_UNRELIABLE);
-
- /* lve claims to be GPL but upstream won't provide source */
- if (strcmp(mod->name, "lve") == 0)
- add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
- LOCKDEP_NOW_UNRELIABLE);
+ for (i = 0; i < ARRAY_SIZE(module_license_offenders); ++i) {
+ if (strcmp(mod->name, module_license_offenders[i]) == 0)
+ add_taint_module(mod, TAINT_PROPRIETARY_MODULE,
+ LOCKDEP_NOW_UNRELIABLE);
+ }
if (!prev_taint && test_taint(TAINT_PROPRIETARY_MODULE))
pr_warn("%s: module license taints kernel.\n", mod->name);
@@ -2948,9 +3029,12 @@ static noinline int do_init_module(struct module *mod)
/* Switch to core kallsyms now init is done: kallsyms may be walking! */
rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);
#endif
- ret = module_enable_rodata_ro(mod, true);
+ ret = module_enable_rodata_ro_after_init(mod);
if (ret)
- goto fail_mutex_unlock;
+ pr_warn("%s: module_enable_rodata_ro_after_init() returned %d, "
+ "ro_after_init data might still be writable\n",
+ mod->name, ret);
+
mod_tree_remove_init(mod);
module_arch_freeing_init(mod);
for_class_mod_mem_type(type, init) {
@@ -2989,8 +3073,6 @@ static noinline int do_init_module(struct module *mod)
return 0;
-fail_mutex_unlock:
- mutex_unlock(&module_mutex);
fail_free_freeinit:
kfree(freeinit);
fail:
@@ -3118,7 +3200,7 @@ static int complete_formation(struct module *mod, struct load_info *info)
module_bug_finalize(info->hdr, info->sechdrs, mod);
module_cfi_finalize(info->hdr, info->sechdrs, mod);
- err = module_enable_rodata_ro(mod, false);
+ err = module_enable_rodata_ro(mod);
if (err)
goto out_strict_rwx;
err = module_enable_data_nx(mod);
diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c
index 239e5013359d9..74834ba15615f 100644
--- a/kernel/module/strict_rwx.c
+++ b/kernel/module/strict_rwx.c
@@ -47,7 +47,7 @@ int module_enable_text_rox(const struct module *mod)
return 0;
}
-int module_enable_rodata_ro(const struct module *mod, bool after_init)
+int module_enable_rodata_ro(const struct module *mod)
{
int ret;
@@ -61,12 +61,17 @@ int module_enable_rodata_ro(const struct module *mod, bool after_init)
if (ret)
return ret;
- if (after_init)
- return module_set_memory(mod, MOD_RO_AFTER_INIT, set_memory_ro);
-
return 0;
}
+int module_enable_rodata_ro_after_init(const struct module *mod)
+{
+ if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX) || !rodata_enabled)
+ return 0;
+
+ return module_set_memory(mod, MOD_RO_AFTER_INIT, set_memory_ro);
+}
+
int module_enable_data_nx(const struct module *mod)
{
if (!IS_ENABLED(CONFIG_STRICT_MODULE_RWX))
diff --git a/kernel/module/sysfs.c b/kernel/module/sysfs.c
index 456358e1fdc43..b401ff4b02d29 100644
--- a/kernel/module/sysfs.c
+++ b/kernel/module/sysfs.c
@@ -19,24 +19,16 @@
* J. Corbet <corbet@lwn.net>
*/
#ifdef CONFIG_KALLSYMS
-struct module_sect_attr {
- struct bin_attribute battr;
- unsigned long address;
-};
-
struct module_sect_attrs {
struct attribute_group grp;
- unsigned int nsections;
- struct module_sect_attr attrs[];
+ struct bin_attribute attrs[];
};
#define MODULE_SECT_READ_SIZE (3 /* "0x", "\n" */ + (BITS_PER_LONG / 4))
static ssize_t module_sect_read(struct file *file, struct kobject *kobj,
- struct bin_attribute *battr,
+ const struct bin_attribute *battr,
char *buf, loff_t pos, size_t count)
{
- struct module_sect_attr *sattr =
- container_of(battr, struct module_sect_attr, battr);
char bounce[MODULE_SECT_READ_SIZE + 1];
size_t wrote;
@@ -53,7 +45,7 @@ static ssize_t module_sect_read(struct file *file, struct kobject *kobj,
*/
wrote = scnprintf(bounce, sizeof(bounce), "0x%px\n",
kallsyms_show_value(file->f_cred)
- ? (void *)sattr->address : NULL);
+ ? battr->private : NULL);
count = min(count, wrote);
memcpy(buf, bounce, count);
@@ -62,59 +54,59 @@ static ssize_t module_sect_read(struct file *file, struct kobject *kobj,
static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
{
- unsigned int section;
+ const struct bin_attribute *const *bin_attr;
- for (section = 0; section < sect_attrs->nsections; section++)
- kfree(sect_attrs->attrs[section].battr.attr.name);
+ for (bin_attr = sect_attrs->grp.bin_attrs_new; *bin_attr; bin_attr++)
+ kfree((*bin_attr)->attr.name);
+ kfree(sect_attrs->grp.bin_attrs_new);
kfree(sect_attrs);
}
static int add_sect_attrs(struct module *mod, const struct load_info *info)
{
- unsigned int nloaded = 0, i, size[2];
struct module_sect_attrs *sect_attrs;
- struct module_sect_attr *sattr;
- struct bin_attribute **gattr;
+ const struct bin_attribute **gattr;
+ struct bin_attribute *sattr;
+ unsigned int nloaded = 0, i;
int ret;
/* Count loaded sections and allocate structures */
for (i = 0; i < info->hdr->e_shnum; i++)
if (!sect_empty(&info->sechdrs[i]))
nloaded++;
- size[0] = ALIGN(struct_size(sect_attrs, attrs, nloaded),
- sizeof(sect_attrs->grp.bin_attrs[0]));
- size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.bin_attrs[0]);
- sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL);
+ sect_attrs = kzalloc(struct_size(sect_attrs, attrs, nloaded), GFP_KERNEL);
if (!sect_attrs)
return -ENOMEM;
+ gattr = kcalloc(nloaded + 1, sizeof(*gattr), GFP_KERNEL);
+ if (!gattr) {
+ kfree(sect_attrs);
+ return -ENOMEM;
+ }
+
/* Setup section attributes. */
sect_attrs->grp.name = "sections";
- sect_attrs->grp.bin_attrs = (void *)sect_attrs + size[0];
+ sect_attrs->grp.bin_attrs_new = gattr;
- sect_attrs->nsections = 0;
sattr = &sect_attrs->attrs[0];
- gattr = &sect_attrs->grp.bin_attrs[0];
for (i = 0; i < info->hdr->e_shnum; i++) {
Elf_Shdr *sec = &info->sechdrs[i];
if (sect_empty(sec))
continue;
- sysfs_bin_attr_init(&sattr->battr);
- sattr->address = sec->sh_addr;
- sattr->battr.attr.name =
+ sysfs_bin_attr_init(sattr);
+ sattr->attr.name =
kstrdup(info->secstrings + sec->sh_name, GFP_KERNEL);
- if (!sattr->battr.attr.name) {
+ if (!sattr->attr.name) {
ret = -ENOMEM;
goto out;
}
- sect_attrs->nsections++;
- sattr->battr.read = module_sect_read;
- sattr->battr.size = MODULE_SECT_READ_SIZE;
- sattr->battr.attr.mode = 0400;
- *(gattr++) = &(sattr++)->battr;
+ sattr->read_new = module_sect_read;
+ sattr->private = (void *)sec->sh_addr;
+ sattr->size = MODULE_SECT_READ_SIZE;
+ sattr->attr.mode = 0400;
+ *(gattr++) = sattr++;
}
- *gattr = NULL;
ret = sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp);
if (ret)
@@ -146,20 +138,13 @@ static void remove_sect_attrs(struct module *mod)
*/
struct module_notes_attrs {
- struct kobject *dir;
- unsigned int notes;
- struct bin_attribute attrs[] __counted_by(notes);
+ struct attribute_group grp;
+ struct bin_attribute attrs[];
};
-static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
- unsigned int i)
+static void free_notes_attrs(struct module_notes_attrs *notes_attrs)
{
- if (notes_attrs->dir) {
- while (i-- > 0)
- sysfs_remove_bin_file(notes_attrs->dir,
- &notes_attrs->attrs[i]);
- kobject_put(notes_attrs->dir);
- }
+ kfree(notes_attrs->grp.bin_attrs_new);
kfree(notes_attrs);
}
@@ -167,6 +152,7 @@ static int add_notes_attrs(struct module *mod, const struct load_info *info)
{
unsigned int notes, loaded, i;
struct module_notes_attrs *notes_attrs;
+ const struct bin_attribute **gattr;
struct bin_attribute *nattr;
int ret;
@@ -185,47 +171,55 @@ static int add_notes_attrs(struct module *mod, const struct load_info *info)
if (!notes_attrs)
return -ENOMEM;
- notes_attrs->notes = notes;
+ gattr = kcalloc(notes + 1, sizeof(*gattr), GFP_KERNEL);
+ if (!gattr) {
+ kfree(notes_attrs);
+ return -ENOMEM;
+ }
+
+ notes_attrs->grp.name = "notes";
+ notes_attrs->grp.bin_attrs_new = gattr;
+
nattr = &notes_attrs->attrs[0];
for (loaded = i = 0; i < info->hdr->e_shnum; ++i) {
if (sect_empty(&info->sechdrs[i]))
continue;
if (info->sechdrs[i].sh_type == SHT_NOTE) {
sysfs_bin_attr_init(nattr);
- nattr->attr.name = mod->sect_attrs->attrs[loaded].battr.attr.name;
+ nattr->attr.name = mod->sect_attrs->attrs[loaded].attr.name;
nattr->attr.mode = 0444;
nattr->size = info->sechdrs[i].sh_size;
nattr->private = (void *)info->sechdrs[i].sh_addr;
- nattr->read = sysfs_bin_attr_simple_read;
- ++nattr;
+ nattr->read_new = sysfs_bin_attr_simple_read;
+ *(gattr++) = nattr++;
}
++loaded;
}
- notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj);
- if (!notes_attrs->dir) {
- ret = -ENOMEM;
+ ret = sysfs_create_group(&mod->mkobj.kobj, &notes_attrs->grp);
+ if (ret)
goto out;
- }
-
- for (i = 0; i < notes; ++i) {
- ret = sysfs_create_bin_file(notes_attrs->dir, &notes_attrs->attrs[i]);
- if (ret)
- goto out;
- }
mod->notes_attrs = notes_attrs;
return 0;
out:
- free_notes_attrs(notes_attrs, i);
+ free_notes_attrs(notes_attrs);
return ret;
}
static void remove_notes_attrs(struct module *mod)
{
- if (mod->notes_attrs)
- free_notes_attrs(mod->notes_attrs, mod->notes_attrs->notes);
+ if (mod->notes_attrs) {
+ sysfs_remove_group(&mod->mkobj.kobj,
+ &mod->notes_attrs->grp);
+ /*
+ * We are positive that no one is using any notes attrs
+ * at this point. Deallocate immediately.
+ */
+ free_notes_attrs(mod->notes_attrs);
+ mod->notes_attrs = NULL;
+ }
}
#else /* !CONFIG_KALLSYMS */
@@ -275,7 +269,7 @@ static int add_usage_links(struct module *mod)
static void module_remove_modinfo_attrs(struct module *mod, int end)
{
- struct module_attribute *attr;
+ const struct module_attribute *attr;
int i;
for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) {
@@ -293,7 +287,7 @@ static void module_remove_modinfo_attrs(struct module *mod, int end)
static int module_add_modinfo_attrs(struct module *mod)
{
- struct module_attribute *attr;
+ const struct module_attribute *attr;
struct module_attribute *temp_attr;
int error = 0;
int i;
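
The section and notes attributes now hang off attribute groups whose bin_attrs_new arrays are built at load time. For orientation, a statically defined group wired the same way might look like this (purely illustrative names; a dynamically allocated attribute, as above, additionally needs sysfs_bin_attr_init()):

    static char demo_blob[4] = "mod";

    static struct bin_attribute demo_attr = {
            .attr           = { .name = "demo", .mode = 0444 },
            .size           = sizeof(demo_blob),
            .private        = demo_blob,
            .read_new       = sysfs_bin_attr_simple_read,
    };

    static const struct bin_attribute *demo_bin_attrs[] = {
            &demo_attr,
            NULL,
    };

    static const struct attribute_group demo_group = {
            .name           = "demo_dir",
            .bin_attrs_new  = demo_bin_attrs,
    };

    /* registered with sysfs_create_group(&mod->mkobj.kobj, &demo_group); */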
diff --git a/kernel/module/version.c b/kernel/module/version.c
index 53f43ac5a73e9..3718a88683219 100644
--- a/kernel/module/version.c
+++ b/kernel/module/version.c
@@ -13,17 +13,34 @@
int check_version(const struct load_info *info,
const char *symname,
struct module *mod,
- const s32 *crc)
+ const u32 *crc)
{
Elf_Shdr *sechdrs = info->sechdrs;
unsigned int versindex = info->index.vers;
unsigned int i, num_versions;
struct modversion_info *versions;
+ struct modversion_info_ext version_ext;
/* Exporting module didn't supply crcs? OK, we're already tainted. */
if (!crc)
return 1;
+ /* If we have extended version info, rely on it */
+ if (info->index.vers_ext_crc) {
+ for_each_modversion_info_ext(version_ext, info) {
+ if (strcmp(version_ext.name, symname) != 0)
+ continue;
+ if (*version_ext.crc == *crc)
+ return 1;
+ pr_debug("Found checksum %X vs module %X\n",
+ *crc, *version_ext.crc);
+ goto bad_version;
+ }
+ pr_warn_once("%s: no extended symbol version for %s\n",
+ info->name, symname);
+ return 1;
+ }
+
/* No versions at all? modprobe --force does this. */
if (versindex == 0)
return try_to_force_load(mod, symname) == 0;
@@ -87,6 +104,34 @@ int same_magic(const char *amagic, const char *bmagic,
return strcmp(amagic, bmagic) == 0;
}
+void modversion_ext_start(const struct load_info *info,
+ struct modversion_info_ext *start)
+{
+ unsigned int crc_idx = info->index.vers_ext_crc;
+ unsigned int name_idx = info->index.vers_ext_name;
+ Elf_Shdr *sechdrs = info->sechdrs;
+
+ /*
+	 * Both of these fields are needed for this to be useful.
+ * Any future fields should be initialized to NULL if absent.
+ */
+ if (crc_idx == 0 || name_idx == 0) {
+ start->remaining = 0;
+ return;
+ }
+
+ start->crc = (const u32 *)sechdrs[crc_idx].sh_addr;
+ start->name = (const char *)sechdrs[name_idx].sh_addr;
+ start->remaining = sechdrs[crc_idx].sh_size / sizeof(*start->crc);
+}
+
+void modversion_ext_advance(struct modversion_info_ext *vers)
+{
+ vers->remaining--;
+ vers->crc++;
+ vers->name += strlen(vers->name) + 1;
+}
+
/*
* Generate the signature for all relevant module structures here.
* If these change, we don't want to try to parse the module.
diff --git a/kernel/padata.c b/kernel/padata.c
index d51bbc76b2279..418987056340e 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -47,6 +47,22 @@ struct padata_mt_job_state {
static void padata_free_pd(struct parallel_data *pd);
static void __init padata_mt_helper(struct work_struct *work);
+static inline void padata_get_pd(struct parallel_data *pd)
+{
+ refcount_inc(&pd->refcnt);
+}
+
+static inline void padata_put_pd_cnt(struct parallel_data *pd, int cnt)
+{
+ if (refcount_sub_and_test(cnt, &pd->refcnt))
+ padata_free_pd(pd);
+}
+
+static inline void padata_put_pd(struct parallel_data *pd)
+{
+ padata_put_pd_cnt(pd, 1);
+}
+
static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
{
int cpu, target_cpu;
@@ -206,7 +222,7 @@ int padata_do_parallel(struct padata_shell *ps,
if ((pinst->flags & PADATA_RESET))
goto out;
- refcount_inc(&pd->refcnt);
+ padata_get_pd(pd);
padata->pd = pd;
padata->cb_cpu = *cb_cpu;
@@ -336,8 +352,14 @@ static void padata_reorder(struct parallel_data *pd)
smp_mb();
reorder = per_cpu_ptr(pd->reorder_list, pd->cpu);
- if (!list_empty(&reorder->list) && padata_find_next(pd, false))
+ if (!list_empty(&reorder->list) && padata_find_next(pd, false)) {
+ /*
+ * Other contexts (e.g. the padata_serial_worker) can finish the request.
+ * To avoid a use-after-free, take a pd reference here and drop it after
+ * the reorder_work finishes.
+ */
+ padata_get_pd(pd);
queue_work(pinst->serial_wq, &pd->reorder_work);
+ }
}
static void invoke_padata_reorder(struct work_struct *work)
@@ -348,6 +370,8 @@ static void invoke_padata_reorder(struct work_struct *work)
pd = container_of(work, struct parallel_data, reorder_work);
padata_reorder(pd);
local_bh_enable();
+ /* Pairs with putting the reorder_work in the serial_wq */
+ padata_put_pd(pd);
}
static void padata_serial_worker(struct work_struct *serial_work)
@@ -380,8 +404,7 @@ static void padata_serial_worker(struct work_struct *serial_work)
}
local_bh_enable();
- if (refcount_sub_and_test(cnt, &pd->refcnt))
- padata_free_pd(pd);
+ padata_put_pd_cnt(pd, cnt);
}
/**
@@ -681,8 +704,7 @@ static int padata_replace(struct padata_instance *pinst)
synchronize_rcu();
list_for_each_entry_continue_reverse(ps, &pinst->pslist, list)
- if (refcount_dec_and_test(&ps->opd->refcnt))
- padata_free_pd(ps->opd);
+ padata_put_pd(ps->opd);
pinst->flags &= ~PADATA_RESET;
@@ -970,7 +992,7 @@ static ssize_t padata_sysfs_store(struct kobject *kobj, struct attribute *attr,
pinst = kobj2pinst(kobj);
pentry = attr2pentry(attr);
- if (pentry->show)
+ if (pentry->store)
ret = pentry->store(pinst, attr, buf, count);
return ret;
@@ -1121,11 +1143,16 @@ void padata_free_shell(struct padata_shell *ps)
if (!ps)
return;
+ /*
+ * Wait for all _do_serial calls to finish to avoid touching
+ * freed pd's and ps's.
+ */
+ synchronize_rcu();
+
mutex_lock(&ps->pinst->lock);
list_del(&ps->list);
pd = rcu_dereference_protected(ps->pd, 1);
- if (refcount_dec_and_test(&pd->refcnt))
- padata_free_pd(pd);
+ padata_put_pd(pd);
mutex_unlock(&ps->pinst->lock);
kfree(ps);
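The padata hunks above replace open-coded refcount manipulation with padata_get_pd()/padata_put_pd()/padata_put_pd_cnt(). A hedged sketch of the intended pairing around asynchronous reorder work (illustrative only; the real call sites are padata_reorder() and invoke_padata_reorder() above):

	/* Before queueing work that touches pd asynchronously: */
	padata_get_pd(pd);
	queue_work(pinst->serial_wq, &pd->reorder_work);

	/* ...and in the work handler, once pd is no longer needed: */
	padata_put_pd(pd);	/* frees pd via padata_free_pd() on the last reference */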
diff --git a/kernel/panic.c b/kernel/panic.c
index fbc59b3b64d0b..d8635d5cecb25 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -84,7 +84,7 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
EXPORT_SYMBOL(panic_notifier_list);
#ifdef CONFIG_SYSCTL
-static struct ctl_table kern_panic_table[] = {
+static const struct ctl_table kern_panic_table[] = {
#ifdef CONFIG_SMP
{
.procname = "oops_all_cpu_backtrace",
diff --git a/kernel/params.c b/kernel/params.c
index 2e447f8ae183e..0074d29c9b80c 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -538,7 +538,7 @@ const struct kernel_param_ops param_ops_string = {
EXPORT_SYMBOL(param_ops_string);
/* sysfs output in /sys/modules/XYZ/parameters/ */
-#define to_module_attr(n) container_of(n, struct module_attribute, attr)
+#define to_module_attr(n) container_of_const(n, struct module_attribute, attr)
#define to_module_kobject(n) container_of(n, struct module_kobject, kobj)
struct param_attribute
@@ -555,13 +555,13 @@ struct module_param_attrs
};
#ifdef CONFIG_SYSFS
-#define to_param_attr(n) container_of(n, struct param_attribute, mattr)
+#define to_param_attr(n) container_of_const(n, struct param_attribute, mattr)
-static ssize_t param_attr_show(struct module_attribute *mattr,
+static ssize_t param_attr_show(const struct module_attribute *mattr,
struct module_kobject *mk, char *buf)
{
int count;
- struct param_attribute *attribute = to_param_attr(mattr);
+ const struct param_attribute *attribute = to_param_attr(mattr);
if (!attribute->param->ops->get)
return -EPERM;
@@ -573,12 +573,12 @@ static ssize_t param_attr_show(struct module_attribute *mattr,
}
/* sysfs always hands a nul-terminated string in buf. We rely on that. */
-static ssize_t param_attr_store(struct module_attribute *mattr,
+static ssize_t param_attr_store(const struct module_attribute *mattr,
struct module_kobject *mk,
const char *buf, size_t len)
{
int err;
- struct param_attribute *attribute = to_param_attr(mattr);
+ const struct param_attribute *attribute = to_param_attr(mattr);
if (!attribute->param->ops->set)
return -EPERM;
@@ -857,11 +857,11 @@ static void __init param_sysfs_builtin(void)
}
}
-ssize_t __modver_version_show(struct module_attribute *mattr,
+ssize_t __modver_version_show(const struct module_attribute *mattr,
struct module_kobject *mk, char *buf)
{
- struct module_version_attribute *vattr =
- container_of(mattr, struct module_version_attribute, mattr);
+ const struct module_version_attribute *vattr =
+ container_of_const(mattr, struct module_version_attribute, mattr);
return scnprintf(buf, PAGE_SIZE, "%s\n", vattr->version);
}
@@ -892,7 +892,7 @@ static ssize_t module_attr_show(struct kobject *kobj,
struct attribute *attr,
char *buf)
{
- struct module_attribute *attribute;
+ const struct module_attribute *attribute;
struct module_kobject *mk;
int ret;
@@ -911,7 +911,7 @@ static ssize_t module_attr_store(struct kobject *kobj,
struct attribute *attr,
const char *buf, size_t len)
{
- struct module_attribute *attribute;
+ const struct module_attribute *attribute;
struct module_kobject *mk;
int ret;
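The params.c hunks above switch the attribute wrappers to container_of_const() so that the const qualifier on the sysfs attribute propagates to the containing structure. A small illustrative caller (hypothetical function name, fields as used in param_attr_show() above):

static ssize_t example_param_show(const struct module_attribute *mattr,
				  struct module_kobject *mk, char *buf)
{
	/* const in, const out: no cast needed and no const is dropped. */
	const struct param_attribute *attribute = to_param_attr(mattr);

	if (!attribute->param->ops->get)
		return -EPERM;
	return attribute->param->ops->get(buf, attribute->param);
}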
diff --git a/kernel/pid.c b/kernel/pid.c
index 115448e89c3e9..924084713be8b 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -43,6 +43,7 @@
#include <linux/sched/task.h>
#include <linux/idr.h>
#include <linux/pidfs.h>
+#include <linux/seqlock.h>
#include <net/sock.h>
#include <uapi/linux/pidfd.h>
@@ -60,15 +61,8 @@ struct pid init_struct_pid = {
}, }
};
-int pid_max = PID_MAX_DEFAULT;
-
-int pid_max_min = RESERVED_PIDS + 1;
-int pid_max_max = PID_MAX_LIMIT;
-/*
- * Pseudo filesystems start inode numbering after one. We use Reserved
- * PIDs as a natural offset.
- */
-static u64 pidfs_ino = RESERVED_PIDS;
+static int pid_max_min = RESERVED_PIDS + 1;
+static int pid_max_max = PID_MAX_LIMIT;
/*
* PID-map pages start out as NULL, they get allocated upon
@@ -87,6 +81,7 @@ struct pid_namespace init_pid_ns = {
#ifdef CONFIG_PID_NS
.ns.ops = &pidns_operations,
#endif
+ .pid_max = PID_MAX_DEFAULT,
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
.memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC,
#endif
@@ -108,6 +103,7 @@ EXPORT_SYMBOL_GPL(init_pid_ns);
*/
static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
+seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock);
void put_pid(struct pid *pid)
{
@@ -158,6 +154,7 @@ void free_pid(struct pid *pid)
idr_remove(&ns->idr, upid->nr);
}
+ pidfs_remove_pid(pid);
spin_unlock_irqrestore(&pidmap_lock, flags);
call_rcu(&pid->rcu, delayed_put_pid);
@@ -193,6 +190,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
for (i = ns->level; i >= 0; i--) {
int tid = 0;
+ int pid_max = READ_ONCE(tmp->pid_max);
if (set_tid_size) {
tid = set_tid[ns->level - i];
@@ -273,22 +271,24 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
INIT_HLIST_HEAD(&pid->inodes);
upid = pid->numbers + ns->level;
+ idr_preload(GFP_KERNEL);
spin_lock_irq(&pidmap_lock);
if (!(ns->pid_allocated & PIDNS_ADDING))
goto out_unlock;
- pid->stashed = NULL;
- pid->ino = ++pidfs_ino;
+ pidfs_add_pid(pid);
for ( ; upid >= pid->numbers; --upid) {
/* Make the PID visible to find_pid_ns. */
idr_replace(&upid->ns->idr, pid, upid->nr);
upid->ns->pid_allocated++;
}
spin_unlock_irq(&pidmap_lock);
+ idr_preload_end();
return pid;
out_unlock:
spin_unlock_irq(&pidmap_lock);
+ idr_preload_end();
put_pid_ns(ns);
out_free:
@@ -644,17 +644,118 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
return fd;
}
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_set *pid_table_root_lookup(struct ctl_table_root *root)
+{
+ return &task_active_pid_ns(current)->set;
+}
+
+static int set_is_seen(struct ctl_table_set *set)
+{
+ return &task_active_pid_ns(current)->set == set;
+}
+
+static int pid_table_root_permissions(struct ctl_table_header *head,
+ const struct ctl_table *table)
+{
+ struct pid_namespace *pidns =
+ container_of(head->set, struct pid_namespace, set);
+ int mode = table->mode;
+
+ if (ns_capable(pidns->user_ns, CAP_SYS_ADMIN) ||
+ uid_eq(current_euid(), make_kuid(pidns->user_ns, 0)))
+ mode = (mode & S_IRWXU) >> 6;
+ else if (in_egroup_p(make_kgid(pidns->user_ns, 0)))
+ mode = (mode & S_IRWXG) >> 3;
+ else
+ mode = mode & S_IROTH;
+ return (mode << 6) | (mode << 3) | mode;
+}
+
+static void pid_table_root_set_ownership(struct ctl_table_header *head,
+ kuid_t *uid, kgid_t *gid)
+{
+ struct pid_namespace *pidns =
+ container_of(head->set, struct pid_namespace, set);
+ kuid_t ns_root_uid;
+ kgid_t ns_root_gid;
+
+ ns_root_uid = make_kuid(pidns->user_ns, 0);
+ if (uid_valid(ns_root_uid))
+ *uid = ns_root_uid;
+
+ ns_root_gid = make_kgid(pidns->user_ns, 0);
+ if (gid_valid(ns_root_gid))
+ *gid = ns_root_gid;
+}
+
+static struct ctl_table_root pid_table_root = {
+ .lookup = pid_table_root_lookup,
+ .permissions = pid_table_root_permissions,
+ .set_ownership = pid_table_root_set_ownership,
+};
+
+static const struct ctl_table pid_table[] = {
+ {
+ .procname = "pid_max",
+ .data = &init_pid_ns.pid_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &pid_max_min,
+ .extra2 = &pid_max_max,
+ },
+};
+#endif
+
+int register_pidns_sysctls(struct pid_namespace *pidns)
+{
+#ifdef CONFIG_SYSCTL
+ struct ctl_table *tbl;
+
+ setup_sysctl_set(&pidns->set, &pid_table_root, set_is_seen);
+
+ tbl = kmemdup(pid_table, sizeof(pid_table), GFP_KERNEL);
+ if (!tbl)
+ return -ENOMEM;
+ tbl->data = &pidns->pid_max;
+ pidns->pid_max = min(pid_max_max, max_t(int, pidns->pid_max,
+ PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
+
+ pidns->sysctls = __register_sysctl_table(&pidns->set, "kernel", tbl,
+ ARRAY_SIZE(pid_table));
+ if (!pidns->sysctls) {
+ kfree(tbl);
+ retire_sysctl_set(&pidns->set);
+ return -ENOMEM;
+ }
+#endif
+ return 0;
+}
+
+void unregister_pidns_sysctls(struct pid_namespace *pidns)
+{
+#ifdef CONFIG_SYSCTL
+ const struct ctl_table *tbl;
+
+ tbl = pidns->sysctls->ctl_table_arg;
+ unregister_sysctl_table(pidns->sysctls);
+ retire_sysctl_set(&pidns->set);
+ kfree(tbl);
+#endif
+}
+
void __init pid_idr_init(void)
{
/* Verify no one has done anything silly: */
BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING);
/* bump default and minimum pid_max based on number of cpus */
- pid_max = min(pid_max_max, max_t(int, pid_max,
- PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
+ init_pid_ns.pid_max = min(pid_max_max, max_t(int, init_pid_ns.pid_max,
+ PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
pid_max_min = max_t(int, pid_max_min,
PIDS_PER_CPU_MIN * num_possible_cpus());
- pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
+ pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max, pid_max_min);
idr_init(&init_pid_ns.idr);
@@ -665,6 +766,16 @@ void __init pid_idr_init(void)
NULL);
}
+static __init int pid_namespace_sysctl_init(void)
+{
+#ifdef CONFIG_SYSCTL
+ /* "kernel" directory will have already been initialized. */
+ BUG_ON(register_pidns_sysctls(&init_pid_ns));
+#endif
+ return 0;
+}
+subsys_initcall(pid_namespace_sysctl_init);
+
static struct file *__pidfd_fget(struct task_struct *task, int fd)
{
struct file *file;
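With pid_max now stored per PID namespace (see the alloc_pid() and pid_table hunks above), readers consult the namespace rather than a global. A minimal sketch of looking up the limit for the current task's namespace (hypothetical helper name, for illustration only):

static int example_current_pid_max(void)
{
	return READ_ONCE(task_active_pid_ns(current)->pid_max);
}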
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index d70ab49d5b4a6..8f6cfec87555a 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -70,6 +70,8 @@ static void dec_pid_namespaces(struct ucounts *ucounts)
dec_ucount(ucounts, UCOUNT_PID_NAMESPACES);
}
+static void destroy_pid_namespace_work(struct work_struct *work);
+
static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
struct pid_namespace *parent_pid_ns)
{
@@ -105,17 +107,27 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
goto out_free_idr;
ns->ns.ops = &pidns_operations;
+ ns->pid_max = parent_pid_ns->pid_max;
+ err = register_pidns_sysctls(ns);
+ if (err)
+ goto out_free_inum;
+
refcount_set(&ns->ns.count, 1);
ns->level = level;
ns->parent = get_pid_ns(parent_pid_ns);
ns->user_ns = get_user_ns(user_ns);
ns->ucounts = ucounts;
ns->pid_allocated = PIDNS_ADDING;
+ INIT_WORK(&ns->work, destroy_pid_namespace_work);
+
#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE)
ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns);
#endif
+
return ns;
+out_free_inum:
+ ns_free_inum(&ns->ns);
out_free_idr:
idr_destroy(&ns->idr);
kmem_cache_free(pid_ns_cachep, ns);
@@ -137,12 +149,28 @@ static void delayed_free_pidns(struct rcu_head *p)
static void destroy_pid_namespace(struct pid_namespace *ns)
{
+ unregister_pidns_sysctls(ns);
+
ns_free_inum(&ns->ns);
idr_destroy(&ns->idr);
call_rcu(&ns->rcu, delayed_free_pidns);
}
+static void destroy_pid_namespace_work(struct work_struct *work)
+{
+ struct pid_namespace *ns =
+ container_of(work, struct pid_namespace, work);
+
+ do {
+ struct pid_namespace *parent;
+
+ parent = ns->parent;
+ destroy_pid_namespace(ns);
+ ns = parent;
+ } while (ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count));
+}
+
struct pid_namespace *copy_pid_ns(unsigned long flags,
struct user_namespace *user_ns, struct pid_namespace *old_ns)
{
@@ -155,15 +183,8 @@ struct pid_namespace *copy_pid_ns(unsigned long flags,
void put_pid_ns(struct pid_namespace *ns)
{
- struct pid_namespace *parent;
-
- while (ns != &init_pid_ns) {
- parent = ns->parent;
- if (!refcount_dec_and_test(&ns->ns.count))
- break;
- destroy_pid_namespace(ns);
- ns = parent;
- }
+ if (ns && ns != &init_pid_ns && refcount_dec_and_test(&ns->ns.count))
+ schedule_work(&ns->work);
}
EXPORT_SYMBOL_GPL(put_pid_ns);
@@ -274,6 +295,7 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
next = idr_get_cursor(&pid_ns->idr) - 1;
tmp.data = &next;
+ tmp.extra2 = &pid_ns->pid_max;
ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
if (!ret && write)
idr_set_cursor(&pid_ns->idr, next + 1);
@@ -281,15 +303,14 @@ static int pid_ns_ctl_handler(const struct ctl_table *table, int write,
return ret;
}
-extern int pid_max;
-static struct ctl_table pid_ns_ctl_table[] = {
+static const struct ctl_table pid_ns_ctl_table[] = {
{
.procname = "ns_last_pid",
.maxlen = sizeof(int),
.mode = 0666, /* permissions are checked in the handler */
.proc_handler = pid_ns_ctl_handler,
.extra1 = SYSCTL_ZERO,
- .extra2 = &pid_max,
+ .extra2 = &init_pid_ns.pid_max,
},
};
#endif /* CONFIG_CHECKPOINT_RESTORE */
diff --git a/kernel/pid_sysctl.h b/kernel/pid_sysctl.h
index 18ecaef6be416..5d8f981de7c56 100644
--- a/kernel/pid_sysctl.h
+++ b/kernel/pid_sysctl.h
@@ -31,7 +31,7 @@ static int pid_mfd_noexec_dointvec_minmax(const struct ctl_table *table,
return err;
}
-static struct ctl_table pid_ns_ctl_table_vm[] = {
+static const struct ctl_table pid_ns_ctl_table_vm[] = {
{
.procname = "memfd_noexec",
.data = &init_pid_ns.memfd_noexec_scope,
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index afce8130d8b92..ca947ed32e3dd 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -257,11 +257,30 @@ config DPM_WATCHDOG
boot session.
config DPM_WATCHDOG_TIMEOUT
- int "Watchdog timeout in seconds"
+ int "Watchdog timeout to panic in seconds"
range 1 120
default 120
depends on DPM_WATCHDOG
+config DPM_WATCHDOG_WARNING_TIMEOUT
+ int "Watchdog timeout to warn in seconds"
+ range 1 DPM_WATCHDOG_TIMEOUT
+ default DPM_WATCHDOG_TIMEOUT
+ depends on DPM_WATCHDOG
+ help
+ If the DPM watchdog warning timeout and main timeout are
+ different then a non-fatal warning (with a stack trace of
+ the stuck suspend routine) will be printed when the warning
+ timeout expires. If the suspend routine gets un-stuck
+ before the main timeout expires then no other action is
+ taken. If the routine continues to be stuck and the main
+ timeout expires then an emergency-level message and stack
+ trace will be printed and the system will panic.
+
+ If the warning timeout is equal to the main timeout (the
+ default) then the warning will never happen and the system
+ will jump straight to panic when the main timeout expires.
+
config PM_TRACE
bool
help
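A hypothetical illustration of the two-stage behavior described in the DPM_WATCHDOG_WARNING_TIMEOUT help text above; the function and its caller are assumptions for illustration, not the driver-core implementation:

static void example_dpm_watchdog_check(unsigned int elapsed_secs)
{
	if (elapsed_secs >= CONFIG_DPM_WATCHDOG_TIMEOUT)
		/* Main timeout: emergency report, then panic. */
		panic("DPM watchdog: suspend routine stuck for %u seconds\n",
		      elapsed_secs);
	else if (elapsed_secs >= CONFIG_DPM_WATCHDOG_WARNING_TIMEOUT)
		/* Warning stage: stack trace only, the system keeps running. */
		WARN(1, "DPM watchdog: suspend routine stuck for %u seconds\n",
		     elapsed_secs);
}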
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
index b29c8aca7486c..865df641b97cc 100644
--- a/kernel/power/autosleep.c
+++ b/kernel/power/autosleep.c
@@ -9,7 +9,6 @@
#include <linux/device.h>
#include <linux/mutex.h>
-#include <linux/pm_wakeup.h>
#include "power.h"
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index d07faf42eace6..3874f0e97651e 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -908,3 +908,20 @@ int em_update_performance_limits(struct em_perf_domain *pd,
return 0;
}
EXPORT_SYMBOL_GPL(em_update_performance_limits);
+
+static void rebuild_sd_workfn(struct work_struct *work)
+{
+ rebuild_sched_domains_energy();
+}
+
+void em_rebuild_sched_domains(void)
+{
+ static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);
+
+ /*
+ * When called from the cpufreq_register_driver() path, the
+ * cpu_hotplug_lock is already held, so use a work item to
+ * avoid nested locking in rebuild_sched_domains().
+ */
+ schedule_work(&rebuild_sd_work);
+}
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 1f87aa01ba44f..10a01af63a807 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -608,7 +608,11 @@ int hibernation_platform_enter(void)
local_irq_disable();
system_state = SYSTEM_SUSPEND;
- syscore_suspend();
+
+ error = syscore_suspend();
+ if (error)
+ goto Enable_irqs;
+
if (pm_wakeup_pending()) {
error = -EAGAIN;
goto Power_up;
@@ -620,6 +624,7 @@ int hibernation_platform_enter(void)
Power_up:
syscore_resume();
+ Enable_irqs:
system_state = SYSTEM_RUNNING;
local_irq_enable();
diff --git a/kernel/power/power.h b/kernel/power/power.h
index de0e6b1077f23..c352dea2f67b5 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -110,7 +110,7 @@ extern int hibernate_preallocate_memory(void);
extern void clear_or_poison_free_pages(void);
-/**
+/*
* Auxiliary structure used for reading the snapshot image data and
* metadata from and writing them to the list of page backup entries
* (PBEs) which is the main data structure of swsusp.
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 30894d8f0a781..c9fb559a63993 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1011,11 +1011,8 @@ void __init register_nosave_region(unsigned long start_pfn, unsigned long end_pf
}
}
/* This allocation cannot fail */
- region = memblock_alloc(sizeof(struct nosave_region),
+ region = memblock_alloc_or_panic(sizeof(struct nosave_region),
SMP_CACHE_BYTES);
- if (!region)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- sizeof(struct nosave_region));
region->start_pfn = start_pfn;
region->end_pfn = end_pfn;
list_add_tail(&region->list, &nosave_regions);
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index c6bb47666aef6..a91bdf8029671 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -338,3 +338,9 @@ bool printk_get_next_message(struct printk_message *pmsg, u64 seq,
void console_prepend_dropped(struct printk_message *pmsg, unsigned long dropped);
void console_prepend_replay(struct printk_message *pmsg);
#endif
+
+#ifdef CONFIG_SMP
+bool is_printk_cpu_sync_owner(void);
+#else
+static inline bool is_printk_cpu_sync_owner(void) { return false; }
+#endif
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 80910bc3470c2..07668433644b8 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -523,7 +523,7 @@ static struct latched_seq clear_seq = {
/* record buffer */
#define LOG_ALIGN __alignof__(unsigned long)
#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
-#define LOG_BUF_LEN_MAX (u32)(1 << 31)
+#define LOG_BUF_LEN_MAX ((u32)1 << 31)
static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
static char *log_buf = __log_buf;
static u32 log_buf_len = __LOG_BUF_LEN;
@@ -4922,6 +4922,11 @@ void console_try_replay_all(void)
static atomic_t printk_cpu_sync_owner = ATOMIC_INIT(-1);
static atomic_t printk_cpu_sync_nested = ATOMIC_INIT(0);
+bool is_printk_cpu_sync_owner(void)
+{
+ return (atomic_read(&printk_cpu_sync_owner) == raw_smp_processor_id());
+}
+
/**
* __printk_cpu_sync_wait() - Busy wait until the printk cpu-reentrant
* spinning lock is not owned by any CPU.
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 6f94418d53ffb..32a28f563b137 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -61,10 +61,15 @@ bool is_printk_legacy_deferred(void)
/*
* The per-CPU variable @printk_context can be read safely in any
* context. CPU migration is always disabled when set.
+ *
+ * A context holding the printk_cpu_sync must not spin waiting for
+ * another CPU. For legacy printing, it could be the console_lock
+ * or the port lock.
*/
return (force_legacy_kthread() ||
this_cpu_read(printk_context) ||
- in_nmi());
+ in_nmi() ||
+ is_printk_cpu_sync_owner());
}
asmlinkage int vprintk(const char *fmt, va_list args)
@@ -74,15 +79,6 @@ asmlinkage int vprintk(const char *fmt, va_list args)
if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0))
return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args);
#endif
-
- /*
- * Use the main logbuf even in NMI. But avoid calling console
- * drivers that might have their own locks.
- */
- if (is_printk_legacy_deferred())
- return vprintk_deferred(fmt, args);
-
- /* No obstacles. */
return vprintk_default(fmt, args);
}
EXPORT_SYMBOL(vprintk);
diff --git a/kernel/printk/sysctl.c b/kernel/printk/sysctl.c
index f5072dc85f7aa..da77f3f5c1fe9 100644
--- a/kernel/printk/sysctl.c
+++ b/kernel/printk/sysctl.c
@@ -20,7 +20,7 @@ static int proc_dointvec_minmax_sysadmin(const struct ctl_table *table, int writ
return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
}
-static struct ctl_table printk_sysctls[] = {
+static const struct ctl_table printk_sysctls[] = {
{
.procname = "printk",
.data = &console_loglevel,
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 9b0b52e1836fa..6af90510a1ca7 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -53,6 +53,37 @@ config RCU_TORTURE_TEST
Say M if you want the RCU torture tests to build as a module.
Say N if you are unsure.
+config RCU_TORTURE_TEST_CHK_RDR_STATE
+ tristate "Check rcutorture reader state"
+ depends on RCU_TORTURE_TEST
+ default n
+ help
+ This option causes rcutorture to check the desired rcutorture
+ reader state for each segment against the actual context.
+ Note that PREEMPT_COUNT must be enabled if the preempt-disabled
+ and bh-disabled checks are to take effect, and that PREEMPT_RCU
+ must be enabled for the RCU-nesting checks to take effect.
+ These checks add overhead, and this Kconfig option is therefore
+ disabled by default.
+
+ Say Y here if you want rcutorture reader contexts checked.
+ Say N if you are unsure.
+
+config RCU_TORTURE_TEST_LOG_CPU
+ tristate "Log CPU for rcutorture failures"
+ depends on RCU_TORTURE_TEST
+ default n
+ help
+ This option causes rcutorture to decorate each entry of its
+ log of failure/close-call rcutorture reader segments with the
+ number of the CPU that the reader was running on at the time.
+ This information can be useful, but it does incur additional
+ overhead, overhead that can make both failures and close calls
+ less probable.
+
+ Say Y here if you want CPU IDs logged.
+ Say N if you are unsure.
+
config RCU_REF_SCALE_TEST
tristate "Scalability tests for read-side synchronization (RCU and others)"
depends on DEBUG_KERNEL
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 612d276903352..d26fb1d33ed9a 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -92,12 +92,20 @@ torture_param(bool, gp_cond_exp, false, "Use conditional/async expedited GP wait
torture_param(bool, gp_cond_full, false, "Use conditional/async full-state GP wait primitives");
torture_param(bool, gp_cond_exp_full, false,
"Use conditional/async full-stateexpedited GP wait primitives");
+torture_param(int, gp_cond_wi, 16 * USEC_PER_SEC / HZ,
+ "Wait interval for normal conditional grace periods, us (default 16 jiffies)");
+torture_param(int, gp_cond_wi_exp, 128,
+ "Wait interval for expedited conditional grace periods, us (default 128 us)");
torture_param(bool, gp_exp, false, "Use expedited GP wait primitives");
torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives");
torture_param(bool, gp_poll, false, "Use polling GP wait primitives");
torture_param(bool, gp_poll_exp, false, "Use polling expedited GP wait primitives");
torture_param(bool, gp_poll_full, false, "Use polling full-state GP wait primitives");
torture_param(bool, gp_poll_exp_full, false, "Use polling full-state expedited GP wait primitives");
+torture_param(int, gp_poll_wi, 16 * USEC_PER_SEC / HZ,
+ "Wait interval for normal polled grace periods, us (default 16 jiffies)");
+torture_param(int, gp_poll_wi_exp, 128,
+ "Wait interval for expedited polled grace periods, us (default 128 us)");
torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives");
torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers");
torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers");
@@ -109,9 +117,11 @@ torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (jiffies), 0=disable");
torture_param(int, nocbs_nthreads, 0, "Number of NOCB toggle threads, 0 to disable");
torture_param(int, nocbs_toggle, 1000, "Time between toggling nocb state (ms)");
+torture_param(int, preempt_duration, 0, "Preemption duration (ms), zero to disable");
+torture_param(int, preempt_interval, MSEC_PER_SEC, "Interval between preemptions (ms)");
torture_param(int, read_exit_delay, 13, "Delay between read-then-exit episodes (s)");
torture_param(int, read_exit_burst, 16, "# of read-then-exit bursts per episode, zero to disable");
-torture_param(int, reader_flavor, 0x1, "Reader flavors to use, one per bit.");
+torture_param(int, reader_flavor, SRCU_READ_FLAVOR_NORMAL, "Reader flavors to use, one per bit.");
torture_param(int, shuffle_interval, 3, "Number of seconds between shuffles");
torture_param(int, shutdown_secs, 0, "Shutdown time (s), <= zero to disable.");
torture_param(int, stall_cpu, 0, "Stall duration (s), zero to disable.");
@@ -149,6 +159,7 @@ static struct task_struct **fwd_prog_tasks;
static struct task_struct **barrier_cbs_tasks;
static struct task_struct *barrier_task;
static struct task_struct *read_exit_task;
+static struct task_struct *preempt_task;
#define RCU_TORTURE_PIPE_LEN 10
@@ -259,10 +270,13 @@ struct rt_read_seg {
unsigned long rt_delay_ms;
unsigned long rt_delay_us;
bool rt_preempted;
+ int rt_cpu;
+ int rt_end_cpu;
};
static int err_segs_recorded;
static struct rt_read_seg err_segs[RCUTORTURE_RDR_MAX_SEGS];
static int rt_read_nsegs;
+static int rt_read_preempted;
static const char *rcu_torture_writer_state_getname(void)
{
@@ -353,7 +367,8 @@ struct rcu_torture_ops {
void (*read_delay)(struct torture_random_state *rrsp,
struct rt_read_seg *rtrsp);
void (*readunlock)(int idx);
- int (*readlock_held)(void);
+ int (*readlock_held)(void); // lockdep.
+ int (*readlock_nesting)(void); // actual nesting, if available, -1 if not.
unsigned long (*get_gp_seq)(void);
unsigned long (*gp_diff)(unsigned long new, unsigned long old);
void (*deferred_free)(struct rcu_torture *p);
@@ -390,6 +405,7 @@ struct rcu_torture_ops {
void (*get_gp_data)(int *flags, unsigned long *gp_seq);
void (*gp_slow_register)(atomic_t *rgssp);
void (*gp_slow_unregister)(atomic_t *rgssp);
+ bool (*reader_blocked)(void);
long cbflood_max;
int irq_capable;
int can_boost;
@@ -448,10 +464,8 @@ rcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
rtrsp->rt_delay_us = shortdelay_us;
}
if (!preempt_count() &&
- !(torture_random(rrsp) % (nrealreaders * 500))) {
+ !(torture_random(rrsp) % (nrealreaders * 500)))
torture_preempt_schedule(); /* QS only if preemptible. */
- rtrsp->rt_preempted = true;
- }
}
static void rcu_torture_read_unlock(int idx)
@@ -459,6 +473,15 @@ static void rcu_torture_read_unlock(int idx)
rcu_read_unlock();
}
+static int rcu_torture_readlock_nesting(void)
+{
+ if (IS_ENABLED(CONFIG_PREEMPT_RCU))
+ return rcu_preempt_depth();
+ if (IS_ENABLED(CONFIG_PREEMPT_COUNT))
+ return (preempt_count() & PREEMPT_MASK);
+ return -1;
+}
+
/*
* Update callback in the pipe. This should be invoked after a grace period.
*/
@@ -548,6 +571,7 @@ static struct rcu_torture_ops rcu_ops = {
.read_delay = rcu_read_delay,
.readunlock = rcu_torture_read_unlock,
.readlock_held = torture_readlock_not_held,
+ .readlock_nesting = rcu_torture_readlock_nesting,
.get_gp_seq = rcu_get_gp_seq,
.gp_diff = rcu_seq_diff,
.deferred_free = rcu_torture_deferred_free,
@@ -573,6 +597,7 @@ static struct rcu_torture_ops rcu_ops = {
.start_gp_poll_exp_full = start_poll_synchronize_rcu_expedited_full,
.poll_gp_state_exp = poll_state_synchronize_rcu,
.cond_sync_exp = cond_synchronize_rcu_expedited,
+ .cond_sync_exp_full = cond_synchronize_rcu_expedited_full,
.call = call_rcu_hurry,
.cb_barrier = rcu_barrier,
.fqs = rcu_force_quiescent_state,
@@ -582,6 +607,9 @@ static struct rcu_torture_ops rcu_ops = {
.get_gp_data = rcutorture_get_gp_data,
.gp_slow_register = rcu_gp_slow_register,
.gp_slow_unregister = rcu_gp_slow_unregister,
+ .reader_blocked = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)
+ ? has_rcu_reader_blocked
+ : NULL,
.irq_capable = 1,
.can_boost = IS_ENABLED(CONFIG_RCU_BOOST),
.extendables = RCUTORTURE_MAX_EXTEND,
@@ -628,6 +656,7 @@ static struct rcu_torture_ops rcu_busted_ops = {
.exp_sync = synchronize_rcu_busted,
.call = call_rcu_busted,
.irq_capable = 1,
+ .extendables = RCUTORTURE_MAX_EXTEND,
.name = "busted"
};
@@ -650,17 +679,17 @@ static int srcu_torture_read_lock(void)
int idx;
int ret = 0;
- if ((reader_flavor & 0x1) || !(reader_flavor & 0x7)) {
+ if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL)) {
idx = srcu_read_lock(srcu_ctlp);
WARN_ON_ONCE(idx & ~0x1);
ret += idx;
}
- if (reader_flavor & 0x2) {
+ if (reader_flavor & SRCU_READ_FLAVOR_NMI) {
idx = srcu_read_lock_nmisafe(srcu_ctlp);
WARN_ON_ONCE(idx & ~0x1);
ret += idx << 1;
}
- if (reader_flavor & 0x4) {
+ if (reader_flavor & SRCU_READ_FLAVOR_LITE) {
idx = srcu_read_lock_lite(srcu_ctlp);
WARN_ON_ONCE(idx & ~0x1);
ret += idx << 2;
@@ -690,11 +719,11 @@ srcu_read_delay(struct torture_random_state *rrsp, struct rt_read_seg *rtrsp)
static void srcu_torture_read_unlock(int idx)
{
WARN_ON_ONCE((reader_flavor && (idx & ~reader_flavor)) || (!reader_flavor && (idx & ~0x1)));
- if (reader_flavor & 0x4)
+ if (reader_flavor & SRCU_READ_FLAVOR_LITE)
srcu_read_unlock_lite(srcu_ctlp, (idx & 0x4) >> 2);
- if (reader_flavor & 0x2)
+ if (reader_flavor & SRCU_READ_FLAVOR_NMI)
srcu_read_unlock_nmisafe(srcu_ctlp, (idx & 0x2) >> 1);
- if ((reader_flavor & 0x1) || !(reader_flavor & 0x7))
+ if ((reader_flavor & SRCU_READ_FLAVOR_NORMAL) || !(reader_flavor & SRCU_READ_FLAVOR_ALL))
srcu_read_unlock(srcu_ctlp, idx & 0x1);
}
@@ -857,7 +886,7 @@ static void synchronize_rcu_trivial(void)
int cpu;
for_each_online_cpu(cpu) {
- torture_sched_setaffinity(current->pid, cpumask_of(cpu));
+ torture_sched_setaffinity(current->pid, cpumask_of(cpu), true);
WARN_ON_ONCE(raw_smp_processor_id() != cpu);
}
}
@@ -1347,6 +1376,7 @@ static void rcu_torture_write_types(void)
pr_alert("%s: gp_sync without primitives.\n", __func__);
}
pr_alert("%s: Testing %d update types.\n", __func__, nsynctypes);
+ pr_info("%s: gp_cond_wi %d gp_cond_wi_exp %d gp_poll_wi %d gp_poll_wi_exp %d\n", __func__, gp_cond_wi, gp_cond_wi_exp, gp_poll_wi, gp_poll_wi_exp);
}
/*
@@ -1513,7 +1543,8 @@ rcu_torture_writer(void *arg)
case RTWS_COND_GET:
rcu_torture_writer_state = RTWS_COND_GET;
gp_snap = cur_ops->get_gp_state();
- torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi,
+ 1000, &rand);
rcu_torture_writer_state = RTWS_COND_SYNC;
cur_ops->cond_sync(gp_snap);
rcu_torture_pipe_update(old_rp);
@@ -1521,7 +1552,8 @@ rcu_torture_writer(void *arg)
case RTWS_COND_GET_EXP:
rcu_torture_writer_state = RTWS_COND_GET_EXP;
gp_snap = cur_ops->get_gp_state_exp();
- torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi_exp,
+ 1000, &rand);
rcu_torture_writer_state = RTWS_COND_SYNC_EXP;
cur_ops->cond_sync_exp(gp_snap);
rcu_torture_pipe_update(old_rp);
@@ -1529,7 +1561,8 @@ rcu_torture_writer(void *arg)
case RTWS_COND_GET_FULL:
rcu_torture_writer_state = RTWS_COND_GET_FULL;
cur_ops->get_gp_state_full(&gp_snap_full);
- torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi,
+ 1000, &rand);
rcu_torture_writer_state = RTWS_COND_SYNC_FULL;
cur_ops->cond_sync_full(&gp_snap_full);
rcu_torture_pipe_update(old_rp);
@@ -1537,7 +1570,8 @@ rcu_torture_writer(void *arg)
case RTWS_COND_GET_EXP_FULL:
rcu_torture_writer_state = RTWS_COND_GET_EXP_FULL;
cur_ops->get_gp_state_full(&gp_snap_full);
- torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand);
+ torture_hrtimeout_us(torture_random(&rand) % gp_cond_wi_exp,
+ 1000, &rand);
rcu_torture_writer_state = RTWS_COND_SYNC_EXP_FULL;
cur_ops->cond_sync_exp_full(&gp_snap_full);
rcu_torture_pipe_update(old_rp);
@@ -1557,8 +1591,8 @@ rcu_torture_writer(void *arg)
break;
}
WARN_ON_ONCE(ulo_size > 0 && i >= ulo_size);
- torture_hrtimeout_jiffies(torture_random(&rand) % 16,
- &rand);
+ torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi,
+ 1000, &rand);
}
rcu_torture_pipe_update(old_rp);
break;
@@ -1578,8 +1612,8 @@ rcu_torture_writer(void *arg)
break;
}
WARN_ON_ONCE(rgo_size > 0 && i >= rgo_size);
- torture_hrtimeout_jiffies(torture_random(&rand) % 16,
- &rand);
+ torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi,
+ 1000, &rand);
}
rcu_torture_pipe_update(old_rp);
break;
@@ -1588,8 +1622,8 @@ rcu_torture_writer(void *arg)
gp_snap = cur_ops->start_gp_poll_exp();
rcu_torture_writer_state = RTWS_POLL_WAIT_EXP;
while (!cur_ops->poll_gp_state_exp(gp_snap))
- torture_hrtimeout_jiffies(torture_random(&rand) % 16,
- &rand);
+ torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi_exp,
+ 1000, &rand);
rcu_torture_pipe_update(old_rp);
break;
case RTWS_POLL_GET_EXP_FULL:
@@ -1597,8 +1631,8 @@ rcu_torture_writer(void *arg)
cur_ops->start_gp_poll_exp_full(&gp_snap_full);
rcu_torture_writer_state = RTWS_POLL_WAIT_EXP_FULL;
while (!cur_ops->poll_gp_state_full(&gp_snap_full))
- torture_hrtimeout_jiffies(torture_random(&rand) % 16,
- &rand);
+ torture_hrtimeout_us(torture_random(&rand) % gp_poll_wi_exp,
+ 1000, &rand);
rcu_torture_pipe_update(old_rp);
break;
case RTWS_SYNC:
@@ -1835,6 +1869,44 @@ static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp,
smp_store_release(&rtrcp_assigner->rtc_chkrdr, -1); // Assigner can again assign.
}
+// Verify the specified RCUTORTURE_RDR* state.
+#define ROEC_ARGS "%s %s: Current %#x To add %#x To remove %#x preempt_count() %#x\n", __func__, s, curstate, new, old, preempt_count()
+static void rcutorture_one_extend_check(char *s, int curstate, int new, int old, bool insoftirq)
+{
+ if (!IS_ENABLED(CONFIG_RCU_TORTURE_TEST_CHK_RDR_STATE))
+ return;
+
+ WARN_ONCE(!(curstate & RCUTORTURE_RDR_IRQ) && irqs_disabled(), ROEC_ARGS);
+ WARN_ONCE((curstate & RCUTORTURE_RDR_IRQ) && !irqs_disabled(), ROEC_ARGS);
+
+ // If CONFIG_PREEMPT_COUNT=n, further checks are unreliable.
+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT))
+ return;
+
+ WARN_ONCE((curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) &&
+ !(preempt_count() & SOFTIRQ_MASK), ROEC_ARGS);
+ WARN_ONCE((curstate & (RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED)) &&
+ !(preempt_count() & PREEMPT_MASK), ROEC_ARGS);
+ WARN_ONCE(cur_ops->readlock_nesting &&
+ (curstate & (RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2)) &&
+ cur_ops->readlock_nesting() == 0, ROEC_ARGS);
+
+ // Timer handlers have all sorts of stuff disabled, so ignore
+ // unintended disabling.
+ if (insoftirq)
+ return;
+
+ WARN_ONCE(cur_ops->extendables &&
+ !(curstate & (RCUTORTURE_RDR_BH | RCUTORTURE_RDR_RBH)) &&
+ (preempt_count() & SOFTIRQ_MASK), ROEC_ARGS);
+ WARN_ONCE(cur_ops->extendables &&
+ !(curstate & (RCUTORTURE_RDR_PREEMPT | RCUTORTURE_RDR_SCHED)) &&
+ (preempt_count() & PREEMPT_MASK), ROEC_ARGS);
+ WARN_ONCE(cur_ops->readlock_nesting &&
+ !(curstate & (RCUTORTURE_RDR_RCU_1 | RCUTORTURE_RDR_RCU_2)) &&
+ cur_ops->readlock_nesting() > 0, ROEC_ARGS);
+}
+
/*
* Do one extension of an RCU read-side critical section using the
* current reader state in readstate (set to zero for initial entry
@@ -1844,10 +1916,11 @@ static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp,
* beginning or end of the critical section and if there was actually a
* change, do a ->read_delay().
*/
-static void rcutorture_one_extend(int *readstate, int newstate,
+static void rcutorture_one_extend(int *readstate, int newstate, bool insoftirq,
struct torture_random_state *trsp,
struct rt_read_seg *rtrsp)
{
+ bool first;
unsigned long flags;
int idxnew1 = -1;
int idxnew2 = -1;
@@ -1856,8 +1929,10 @@ static void rcutorture_one_extend(int *readstate, int newstate,
int statesnew = ~*readstate & newstate;
int statesold = *readstate & ~newstate;
+ first = idxold1 == 0;
WARN_ON_ONCE(idxold2 < 0);
WARN_ON_ONCE(idxold2 & ~RCUTORTURE_RDR_ALLBITS);
+ rcutorture_one_extend_check("before change", idxold1, statesnew, statesold, insoftirq);
rtrsp->rt_readstate = newstate;
/* First, put new protection in place to avoid critical-section gap. */
@@ -1876,6 +1951,21 @@ static void rcutorture_one_extend(int *readstate, int newstate,
if (statesnew & RCUTORTURE_RDR_RCU_2)
idxnew2 = (cur_ops->readlock() << RCUTORTURE_RDR_SHIFT_2) & RCUTORTURE_RDR_MASK_2;
+ // Complain unless both the old and the new protection is in place.
+ rcutorture_one_extend_check("during change",
+ idxold1 | statesnew, statesnew, statesold, insoftirq);
+
+ // Sample CPU under both sets of protections to reduce confusion.
+ if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) {
+ int cpu = raw_smp_processor_id();
+ rtrsp->rt_cpu = cpu;
+ if (!first) {
+ rtrsp[-1].rt_end_cpu = cpu;
+ if (cur_ops->reader_blocked)
+ rtrsp[-1].rt_preempted = cur_ops->reader_blocked();
+ }
+ }
+
/*
* Next, remove old protection, in decreasing order of strength
* to avoid unlock paths that aren't safe in the stronger
@@ -1926,6 +2016,7 @@ static void rcutorture_one_extend(int *readstate, int newstate,
WARN_ON_ONCE(*readstate < 0);
if (WARN_ON_ONCE(*readstate & ~RCUTORTURE_RDR_ALLBITS))
pr_info("Unexpected readstate value of %#x\n", *readstate);
+ rcutorture_one_extend_check("after change", *readstate, statesnew, statesold, insoftirq);
}
/* Return the biggest extendables mask given current RCU and boot parameters. */
@@ -1992,7 +2083,7 @@ rcutorture_extend_mask(int oldmask, struct torture_random_state *trsp)
* critical section.
*/
static struct rt_read_seg *
-rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp,
+rcutorture_loop_extend(int *readstate, bool insoftirq, struct torture_random_state *trsp,
struct rt_read_seg *rtrsp)
{
int i;
@@ -2007,7 +2098,7 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp,
i = ((i | (i >> 3)) & RCUTORTURE_RDR_MAX_LOOPS) + 1;
for (j = 0; j < i; j++) {
mask = rcutorture_extend_mask(*readstate, trsp);
- rcutorture_one_extend(readstate, mask, trsp, &rtrsp[j]);
+ rcutorture_one_extend(readstate, mask, insoftirq, trsp, &rtrsp[j]);
}
return &rtrsp[j];
}
@@ -2028,6 +2119,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
int newstate;
struct rcu_torture *p;
int pipe_count;
+ bool preempted = false;
int readstate = 0;
struct rt_read_seg rtseg[RCUTORTURE_RDR_MAX_SEGS] = { { 0 } };
struct rt_read_seg *rtrsp = &rtseg[0];
@@ -2036,7 +2128,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
WARN_ON_ONCE(!rcu_is_watching());
newstate = rcutorture_extend_mask(readstate, trsp);
- rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++);
+ rcutorture_one_extend(&readstate, newstate, myid < 0, trsp, rtrsp++);
if (checkpolling) {
if (cur_ops->get_gp_state && cur_ops->poll_gp_state)
cookie = cur_ops->get_gp_state();
@@ -2049,13 +2141,13 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
!cur_ops->readlock_held || cur_ops->readlock_held());
if (p == NULL) {
/* Wait for rcu_torture_writer to get underway */
- rcutorture_one_extend(&readstate, 0, trsp, rtrsp);
+ rcutorture_one_extend(&readstate, 0, myid < 0, trsp, rtrsp);
return false;
}
if (p->rtort_mbtest == 0)
atomic_inc(&n_rcu_torture_mberror);
rcu_torture_reader_do_mbchk(myid, p, trsp);
- rtrsp = rcutorture_loop_extend(&readstate, trsp, rtrsp);
+ rtrsp = rcutorture_loop_extend(&readstate, myid < 0, trsp, rtrsp);
preempt_disable();
pipe_count = READ_ONCE(p->rtort_pipe_count);
if (pipe_count > RCU_TORTURE_PIPE_LEN) {
@@ -2093,7 +2185,9 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
rcu_torture_writer_state,
cpumask_pr_args(cpu_online_mask));
}
- rcutorture_one_extend(&readstate, 0, trsp, rtrsp);
+ if (cur_ops->reader_blocked)
+ preempted = cur_ops->reader_blocked();
+ rcutorture_one_extend(&readstate, 0, myid < 0, trsp, rtrsp);
WARN_ON_ONCE(readstate);
// This next splat is expected behavior if leakpointer, especially
// for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels.
@@ -2105,6 +2199,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid)
for (rtrsp1 = &rtseg[0]; rtrsp1 < rtrsp; rtrsp1++)
err_segs[i++] = *rtrsp1;
rt_read_nsegs = i;
+ rt_read_preempted = preempted;
}
return true;
@@ -2425,7 +2520,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
"read_exit_delay=%d read_exit_burst=%d "
"reader_flavor=%x "
"nocbs_nthreads=%d nocbs_toggle=%d "
- "test_nmis=%d\n",
+ "test_nmis=%d "
+ "preempt_duration=%d preempt_interval=%d\n",
torture_type, tag, nrealreaders, nfakewriters,
stat_interval, verbose, test_no_idle_hz, shuffle_interval,
stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
@@ -2438,7 +2534,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag)
read_exit_delay, read_exit_burst,
reader_flavor,
nocbs_nthreads, nocbs_toggle,
- test_nmis);
+ test_nmis,
+ preempt_duration, preempt_interval);
}
static int rcutorture_booster_cleanup(unsigned int cpu)
@@ -3068,12 +3165,12 @@ static int __init rcu_torture_fwd_prog_init(void)
fwd_progress = 0;
return 0;
}
- if (stall_cpu > 0) {
- VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, conflicts with CPU-stall testing");
+ if (stall_cpu > 0 || (preempt_duration > 0 && IS_ENABLED(CONFIG_RCU_NOCB_CPU))) {
+ VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, conflicts with CPU-stall and/or preemption testing");
fwd_progress = 0;
if (IS_MODULE(CONFIG_RCU_TORTURE_TEST))
return -EINVAL; /* In module, can fail back to user. */
- WARN_ON(1); /* Make sure rcutorture notices conflict. */
+ WARN_ON(1); /* Make sure rcutorture scripting notices conflict. */
return 0;
}
if (fwd_progress_holdoff <= 0)
@@ -3418,6 +3515,35 @@ static void rcutorture_test_nmis(int n)
#endif // #else // #if IS_BUILTIN(CONFIG_RCU_TORTURE_TEST)
}
+// Randomly preempt online CPUs.
+static int rcu_torture_preempt(void *unused)
+{
+ int cpu = -1;
+ DEFINE_TORTURE_RANDOM(rand);
+
+ schedule_timeout_idle(stall_cpu_holdoff);
+ do {
+ // Wait for preempt_interval ms with up to 100us fuzz.
+ torture_hrtimeout_ms(preempt_interval, 100, &rand);
+ // Select online CPU.
+ cpu = cpumask_next(cpu, cpu_online_mask);
+ if (cpu >= nr_cpu_ids)
+ cpu = cpumask_next(-1, cpu_online_mask);
+ WARN_ON_ONCE(cpu >= nr_cpu_ids);
+ // Move to that CPU; if we can't, retry later.
+ if (torture_sched_setaffinity(current->pid, cpumask_of(cpu), false))
+ continue;
+ // Preempt at high-ish priority, then reset to normal.
+ sched_set_fifo(current);
+ torture_sched_setaffinity(current->pid, cpu_present_mask, true);
+ mdelay(preempt_duration);
+ sched_set_normal(current, 0);
+ stutter_wait("rcu_torture_preempt");
+ } while (!torture_must_stop());
+ torture_kthread_stopping("rcu_torture_preempt");
+ return 0;
+}
+
static enum cpuhp_state rcutor_hp;
static void
@@ -3446,6 +3572,7 @@ rcu_torture_cleanup(void)
if (cur_ops->gp_kthread_dbg)
cur_ops->gp_kthread_dbg();
+ torture_stop_kthread(rcu_torture_preempt, preempt_task);
rcu_torture_read_exit_cleanup();
rcu_torture_barrier_cleanup();
rcu_torture_fwd_prog_cleanup();
@@ -3508,26 +3635,49 @@ rcu_torture_cleanup(void)
pr_alert("\t: No segments recorded!!!\n");
firsttime = 1;
for (i = 0; i < rt_read_nsegs; i++) {
- pr_alert("\t%d: %#x ", i, err_segs[i].rt_readstate);
+ pr_alert("\t%d: %#4x", i, err_segs[i].rt_readstate);
if (err_segs[i].rt_delay_jiffies != 0) {
pr_cont("%s%ldjiffies", firsttime ? "" : "+",
err_segs[i].rt_delay_jiffies);
firsttime = 0;
}
+ if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_LOG_CPU)) {
+ pr_cont(" CPU %2d", err_segs[i].rt_cpu);
+ if (err_segs[i].rt_cpu != err_segs[i].rt_end_cpu)
+ pr_cont("->%-2d", err_segs[i].rt_end_cpu);
+ else
+ pr_cont(" ...");
+ }
if (err_segs[i].rt_delay_ms != 0) {
- pr_cont("%s%ldms", firsttime ? "" : "+",
+ pr_cont(" %s%ldms", firsttime ? "" : "+",
err_segs[i].rt_delay_ms);
firsttime = 0;
}
if (err_segs[i].rt_delay_us != 0) {
- pr_cont("%s%ldus", firsttime ? "" : "+",
+ pr_cont(" %s%ldus", firsttime ? "" : "+",
err_segs[i].rt_delay_us);
firsttime = 0;
}
- pr_cont("%s\n",
- err_segs[i].rt_preempted ? "preempted" : "");
+ pr_cont("%s", err_segs[i].rt_preempted ? " preempted" : "");
+ if (err_segs[i].rt_readstate & RCUTORTURE_RDR_BH)
+ pr_cont(" BH");
+ if (err_segs[i].rt_readstate & RCUTORTURE_RDR_IRQ)
+ pr_cont(" IRQ");
+ if (err_segs[i].rt_readstate & RCUTORTURE_RDR_PREEMPT)
+ pr_cont(" PREEMPT");
+ if (err_segs[i].rt_readstate & RCUTORTURE_RDR_RBH)
+ pr_cont(" RBH");
+ if (err_segs[i].rt_readstate & RCUTORTURE_RDR_SCHED)
+ pr_cont(" SCHED");
+ if (err_segs[i].rt_readstate & RCUTORTURE_RDR_RCU_1)
+ pr_cont(" RCU_1");
+ if (err_segs[i].rt_readstate & RCUTORTURE_RDR_RCU_2)
+ pr_cont(" RCU_2");
+ pr_cont("\n");
}
+ if (rt_read_preempted)
+ pr_alert("\tReader was preempted.\n");
}
if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error)
rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE");
@@ -4019,6 +4169,11 @@ rcu_torture_init(void)
firsterr = rcu_torture_read_exit_init();
if (torture_init_error(firsterr))
goto unwind;
+ if (preempt_duration > 0) {
+ firsterr = torture_create_kthread(rcu_torture_preempt, NULL, preempt_task);
+ if (torture_init_error(firsterr))
+ goto unwind;
+ }
if (object_debug)
rcu_test_debug_objects();
torture_init_end();
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index aacfcc9838b37..1b47376acdc40 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -36,6 +36,7 @@
#include <linux/slab.h>
#include <linux/torture.h>
#include <linux/types.h>
+#include <linux/sched/clock.h>
#include "rcu.h"
@@ -531,6 +532,39 @@ static const struct ref_scale_ops acqrel_ops = {
static volatile u64 stopopts;
+static void ref_sched_clock_section(const int nloops)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--)
+ x += sched_clock();
+ preempt_enable();
+ stopopts = x;
+}
+
+static void ref_sched_clock_delay_section(const int nloops, const int udl, const int ndl)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ x += sched_clock();
+ un_delay(udl, ndl);
+ }
+ preempt_enable();
+ stopopts = x;
+}
+
+static const struct ref_scale_ops sched_clock_ops = {
+ .readsection = ref_sched_clock_section,
+ .delaysection = ref_sched_clock_delay_section,
+ .name = "sched-clock"
+};
+
+
static void ref_clock_section(const int nloops)
{
u64 x = 0;
@@ -1130,9 +1164,9 @@ ref_scale_init(void)
int firsterr = 0;
static const struct ref_scale_ops *scale_ops[] = {
&rcu_ops, &srcu_ops, &srcu_lite_ops, RCU_TRACE_OPS RCU_TASKS_OPS
- &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops,
- &clock_ops, &jiffies_ops, &typesafe_ref_ops, &typesafe_lock_ops,
- &typesafe_seqlock_ops,
+ &refcnt_ops, &rwlock_ops, &rwsem_ops, &lock_ops, &lock_irq_ops,
+ &acqrel_ops, &sched_clock_ops, &clock_ops, &jiffies_ops,
+ &typesafe_ref_ops, &typesafe_lock_ops, &typesafe_seqlock_ops,
};
if (!torture_init_begin(scale_type, verbose))
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index 5e2e534647946..b83c74c4dcc0d 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -738,7 +738,8 @@ EXPORT_SYMBOL_GPL(__srcu_check_read_flavor);
/*
* Counts the new reader in the appropriate per-CPU element of the
* srcu_struct.
- * Returns an index that must be passed to the matching srcu_read_unlock().
+ * Returns a guaranteed non-negative index that must be passed to the
+ * matching __srcu_read_unlock().
*/
int __srcu_read_lock(struct srcu_struct *ssp)
{
@@ -1076,7 +1077,6 @@ static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
/* If grace period not already in progress, start it. */
if (!WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) &&
rcu_seq_state(sup->srcu_gp_seq) == SRCU_STATE_IDLE) {
- WARN_ON_ONCE(ULONG_CMP_GE(sup->srcu_gp_seq, sup->srcu_gp_seq_needed));
srcu_gp_start(ssp);
// And how can that list_add() in the "else" clause
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index b3b3ce34df631..4b3f319114650 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -250,7 +250,7 @@ EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu);
void kvfree_call_rcu(struct rcu_head *head, void *ptr)
{
if (head)
- kasan_record_aux_stack_noalloc(ptr);
+ kasan_record_aux_stack(ptr);
__kvfree_call_rcu(head, ptr);
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index ff98233d4aa59..475f31deed141 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -149,7 +149,6 @@ static int rcu_scheduler_fully_active __read_mostly;
static void rcu_report_qs_rnp(unsigned long mask, struct rcu_node *rnp,
unsigned long gps, unsigned long flags);
-static struct task_struct *rcu_boost_task(struct rcu_node *rnp);
static void invoke_rcu_core(void);
static void rcu_report_exp_rdp(struct rcu_data *rdp);
static void sync_sched_exp_online_cleanup(int cpu);
@@ -186,26 +185,6 @@ static int rcu_unlock_delay;
module_param(rcu_unlock_delay, int, 0444);
#endif
-/*
- * This rcu parameter is runtime-read-only. It reflects
- * a minimum allowed number of objects which can be cached
- * per-CPU. Object size is equal to one page. This value
- * can be changed at boot time.
- */
-static int rcu_min_cached_objs = 5;
-module_param(rcu_min_cached_objs, int, 0444);
-
-// A page shrinker can ask for pages to be freed to make them
-// available for other parts of the system. This usually happens
-// under low memory conditions, and in that case we should also
-// defer page-cache filling for a short time period.
-//
-// The default value is 5 seconds, which is long enough to reduce
-// interference with the shrinker while it asks other systems to
-// drain their caches.
-static int rcu_delay_page_cache_fill_msec = 5000;
-module_param(rcu_delay_page_cache_fill_msec, int, 0444);
-
/* Retrieve RCU kthreads priority for rcutorture */
int rcu_get_gp_kthreads_prio(void)
{
@@ -3083,9 +3062,12 @@ __call_rcu_common(struct rcu_head *head, rcu_callback_t func, bool lazy_in)
}
head->func = func;
head->next = NULL;
- kasan_record_aux_stack_noalloc(head);
+ kasan_record_aux_stack(head);
+
local_irq_save(flags);
rdp = this_cpu_ptr(&rcu_data);
+ RCU_LOCKDEP_WARN(!rcu_rdp_cpu_online(rdp), "Callback enqueued on offline CPU!");
+
lazy = lazy_in && !rcu_async_should_hurry();
/* Add the callback to our list. */
@@ -3191,812 +3173,6 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
}
EXPORT_SYMBOL_GPL(call_rcu);
-/* Maximum number of jiffies to wait before draining a batch. */
-#define KFREE_DRAIN_JIFFIES (5 * HZ)
-#define KFREE_N_BATCHES 2
-#define FREE_N_CHANNELS 2
-
-/**
- * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
- * @list: List node. All blocks are linked between each other
- * @gp_snap: Snapshot of RCU state for objects placed to this bulk
- * @nr_records: Number of active pointers in the array
- * @records: Array of the kvfree_rcu() pointers
- */
-struct kvfree_rcu_bulk_data {
- struct list_head list;
- struct rcu_gp_oldstate gp_snap;
- unsigned long nr_records;
- void *records[] __counted_by(nr_records);
-};
-
-/*
- * This macro defines how many entries the "records" array
- * will contain. It is based on the fact that the size of
- * kvfree_rcu_bulk_data structure becomes exactly one page.
- */
-#define KVFREE_BULK_MAX_ENTR \
- ((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
-
-/**
- * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
- * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
- * @head_free: List of kfree_rcu() objects waiting for a grace period
- * @head_free_gp_snap: Grace-period snapshot to check for attempted premature frees.
- * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
- * @krcp: Pointer to @kfree_rcu_cpu structure
- */
-
-struct kfree_rcu_cpu_work {
- struct rcu_work rcu_work;
- struct rcu_head *head_free;
- struct rcu_gp_oldstate head_free_gp_snap;
- struct list_head bulk_head_free[FREE_N_CHANNELS];
- struct kfree_rcu_cpu *krcp;
-};
-
-/**
- * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
- * @head: List of kfree_rcu() objects not yet waiting for a grace period
- * @head_gp_snap: Snapshot of RCU state for objects placed to "@head"
- * @bulk_head: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
- * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
- * @lock: Synchronize access to this structure
- * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
- * @initialized: The @rcu_work fields have been initialized
- * @head_count: Number of objects in rcu_head singular list
- * @bulk_count: Number of objects in bulk-list
- * @bkvcache:
- * A simple cache list that contains objects for reuse purpose.
- * In order to save some per-cpu space the list is singular.
- * Even though it is lockless an access has to be protected by the
- * per-cpu lock.
- * @page_cache_work: A work to refill the cache when it is empty
- * @backoff_page_cache_fill: Delay cache refills
- * @work_in_progress: Indicates that page_cache_work is running
- * @hrtimer: A hrtimer for scheduling a page_cache_work
- * @nr_bkv_objs: number of allocated objects at @bkvcache.
- *
- * This is a per-CPU structure. The reason that it is not included in
- * the rcu_data structure is to permit this code to be extracted from
- * the RCU files. Such extraction could allow further optimization of
- * the interactions with the slab allocators.
- */
-struct kfree_rcu_cpu {
- // Objects queued on a linked list
- // through their rcu_head structures.
- struct rcu_head *head;
- unsigned long head_gp_snap;
- atomic_t head_count;
-
- // Objects queued on a bulk-list.
- struct list_head bulk_head[FREE_N_CHANNELS];
- atomic_t bulk_count[FREE_N_CHANNELS];
-
- struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
- raw_spinlock_t lock;
- struct delayed_work monitor_work;
- bool initialized;
-
- struct delayed_work page_cache_work;
- atomic_t backoff_page_cache_fill;
- atomic_t work_in_progress;
- struct hrtimer hrtimer;
-
- struct llist_head bkvcache;
- int nr_bkv_objs;
-};
-
-static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
- .lock = __RAW_SPIN_LOCK_UNLOCKED(krc.lock),
-};
-
-static __always_inline void
-debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
-{
-#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
- int i;
-
- for (i = 0; i < bhead->nr_records; i++)
- debug_rcu_head_unqueue((struct rcu_head *)(bhead->records[i]));
-#endif
-}
-
-static inline struct kfree_rcu_cpu *
-krc_this_cpu_lock(unsigned long *flags)
-{
- struct kfree_rcu_cpu *krcp;
-
- local_irq_save(*flags); // For safely calling this_cpu_ptr().
- krcp = this_cpu_ptr(&krc);
- raw_spin_lock(&krcp->lock);
-
- return krcp;
-}
-
-static inline void
-krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
-{
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
-}
-
-static inline struct kvfree_rcu_bulk_data *
-get_cached_bnode(struct kfree_rcu_cpu *krcp)
-{
- if (!krcp->nr_bkv_objs)
- return NULL;
-
- WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs - 1);
- return (struct kvfree_rcu_bulk_data *)
- llist_del_first(&krcp->bkvcache);
-}
-
-static inline bool
-put_cached_bnode(struct kfree_rcu_cpu *krcp,
- struct kvfree_rcu_bulk_data *bnode)
-{
- // Check the limit.
- if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
- return false;
-
- llist_add((struct llist_node *) bnode, &krcp->bkvcache);
- WRITE_ONCE(krcp->nr_bkv_objs, krcp->nr_bkv_objs + 1);
- return true;
-}
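
As the @bkvcache kernel-doc above notes, the cache is a singly linked llist whose accesses are still serialized by the per-CPU lock. The following is a minimal userspace sketch of the same get/put discipline, with a pthread mutex standing in for the raw spinlock and made-up names throughout; it is an illustration, not the kernel code.

#include <pthread.h>
#include <stdlib.h>

struct cache_node { struct cache_node *next; };

struct page_cache {
	pthread_mutex_t lock;		/* stand-in for krcp->lock */
	struct cache_node *head;	/* stand-in for krcp->bkvcache */
	int nr_objs;			/* stand-in for krcp->nr_bkv_objs */
	int max_objs;			/* stand-in for rcu_min_cached_objs */
};

/* Pop one cached node, or NULL if the cache is empty. Caller holds ->lock. */
static struct cache_node *cache_get(struct page_cache *c)
{
	struct cache_node *n = c->head;

	if (!n)
		return NULL;
	c->head = n->next;
	c->nr_objs--;
	return n;
}

/* Push a node back, unless the cache is already full. Caller holds ->lock. */
static int cache_put(struct page_cache *c, struct cache_node *n)
{
	if (c->nr_objs >= c->max_objs)
		return 0;
	n->next = c->head;
	c->head = n;
	c->nr_objs++;
	return 1;
}

int main(void)
{
	struct page_cache c = { PTHREAD_MUTEX_INITIALIZER, NULL, 0, 4 };
	struct cache_node *n = malloc(sizeof(*n));

	if (!n)
		return 1;
	pthread_mutex_lock(&c.lock);
	cache_put(&c, n);
	n = cache_get(&c);
	pthread_mutex_unlock(&c.lock);
	free(n);
	return 0;
}
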
-
-static int
-drain_page_cache(struct kfree_rcu_cpu *krcp)
-{
- unsigned long flags;
- struct llist_node *page_list, *pos, *n;
- int freed = 0;
-
- if (!rcu_min_cached_objs)
- return 0;
-
- raw_spin_lock_irqsave(&krcp->lock, flags);
- page_list = llist_del_all(&krcp->bkvcache);
- WRITE_ONCE(krcp->nr_bkv_objs, 0);
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
-
- llist_for_each_safe(pos, n, page_list) {
- free_page((unsigned long)pos);
- freed++;
- }
-
- return freed;
-}
-
-static void
-kvfree_rcu_bulk(struct kfree_rcu_cpu *krcp,
- struct kvfree_rcu_bulk_data *bnode, int idx)
-{
- unsigned long flags;
- int i;
-
- if (!WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&bnode->gp_snap))) {
- debug_rcu_bhead_unqueue(bnode);
- rcu_lock_acquire(&rcu_callback_map);
- if (idx == 0) { // kmalloc() / kfree().
- trace_rcu_invoke_kfree_bulk_callback(
- rcu_state.name, bnode->nr_records,
- bnode->records);
-
- kfree_bulk(bnode->nr_records, bnode->records);
- } else { // vmalloc() / vfree().
- for (i = 0; i < bnode->nr_records; i++) {
- trace_rcu_invoke_kvfree_callback(
- rcu_state.name, bnode->records[i], 0);
-
- vfree(bnode->records[i]);
- }
- }
- rcu_lock_release(&rcu_callback_map);
- }
-
- raw_spin_lock_irqsave(&krcp->lock, flags);
- if (put_cached_bnode(krcp, bnode))
- bnode = NULL;
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
-
- if (bnode)
- free_page((unsigned long) bnode);
-
- cond_resched_tasks_rcu_qs();
-}
-
-static void
-kvfree_rcu_list(struct rcu_head *head)
-{
- struct rcu_head *next;
-
- for (; head; head = next) {
- void *ptr = (void *) head->func;
- unsigned long offset = (void *) head - ptr;
-
- next = head->next;
- debug_rcu_head_unqueue((struct rcu_head *)ptr);
- rcu_lock_acquire(&rcu_callback_map);
- trace_rcu_invoke_kvfree_callback(rcu_state.name, head, offset);
-
- if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset)))
- kvfree(ptr);
-
- rcu_lock_release(&rcu_callback_map);
- cond_resched_tasks_rcu_qs();
- }
-}
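
The list path above recovers the start of the enclosing object from its embedded rcu_head: head->func was set to the object's base address when the object was queued (see kvfree_call_rcu() below), so head - ptr is simply the byte offset of the rcu_head member, which __is_kvfree_rcu_offset() then sanity-checks. A tiny standalone illustration of that arithmetic, using a made-up structure:

#include <assert.h>
#include <stddef.h>

/* Simplified stand-in for the kernel's struct rcu_head. */
struct rcu_head { void *next; void *func; };

struct foo {
	int payload;
	struct rcu_head rh;
};

int main(void)
{
	struct foo f;
	struct rcu_head *head = &f.rh;
	void *ptr = &f;			/* what head->func would hold */
	unsigned long offset = (unsigned long)((char *)head - (char *)ptr);

	/* The recovered offset is just the position of rh inside struct foo. */
	assert(offset == offsetof(struct foo, rh));
	return 0;
}
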
-
-/*
- * This function is invoked in workqueue context after a grace period.
- * It frees all the objects queued on ->bulk_head_free or ->head_free.
- */
-static void kfree_rcu_work(struct work_struct *work)
-{
- unsigned long flags;
- struct kvfree_rcu_bulk_data *bnode, *n;
- struct list_head bulk_head[FREE_N_CHANNELS];
- struct rcu_head *head;
- struct kfree_rcu_cpu *krcp;
- struct kfree_rcu_cpu_work *krwp;
- struct rcu_gp_oldstate head_gp_snap;
- int i;
-
- krwp = container_of(to_rcu_work(work),
- struct kfree_rcu_cpu_work, rcu_work);
- krcp = krwp->krcp;
-
- raw_spin_lock_irqsave(&krcp->lock, flags);
- // Channels 1 and 2.
- for (i = 0; i < FREE_N_CHANNELS; i++)
- list_replace_init(&krwp->bulk_head_free[i], &bulk_head[i]);
-
- // Channel 3.
- head = krwp->head_free;
- krwp->head_free = NULL;
- head_gp_snap = krwp->head_free_gp_snap;
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
-
- // Handle the first two channels.
- for (i = 0; i < FREE_N_CHANNELS; i++) {
- // Start from the tail page, so a GP is likely passed for it.
- list_for_each_entry_safe(bnode, n, &bulk_head[i], list)
- kvfree_rcu_bulk(krcp, bnode, i);
- }
-
- /*
- * This is used when the "bulk" path cannot be used for the
- * double-argument form of kvfree_rcu(). This happens when the
- * page-cache is empty, which means that objects are instead
- * queued on a linked list through their rcu_head structures.
- * This list is named "Channel 3".
- */
- if (head && !WARN_ON_ONCE(!poll_state_synchronize_rcu_full(&head_gp_snap)))
- kvfree_rcu_list(head);
-}
-
-static bool
-need_offload_krc(struct kfree_rcu_cpu *krcp)
-{
- int i;
-
- for (i = 0; i < FREE_N_CHANNELS; i++)
- if (!list_empty(&krcp->bulk_head[i]))
- return true;
-
- return !!READ_ONCE(krcp->head);
-}
-
-static bool
-need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp)
-{
- int i;
-
- for (i = 0; i < FREE_N_CHANNELS; i++)
- if (!list_empty(&krwp->bulk_head_free[i]))
- return true;
-
- return !!krwp->head_free;
-}
-
-static int krc_count(struct kfree_rcu_cpu *krcp)
-{
- int sum = atomic_read(&krcp->head_count);
- int i;
-
- for (i = 0; i < FREE_N_CHANNELS; i++)
- sum += atomic_read(&krcp->bulk_count[i]);
-
- return sum;
-}
-
-static void
-__schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
-{
- long delay, delay_left;
-
- delay = krc_count(krcp) >= KVFREE_BULK_MAX_ENTR ? 1:KFREE_DRAIN_JIFFIES;
- if (delayed_work_pending(&krcp->monitor_work)) {
- delay_left = krcp->monitor_work.timer.expires - jiffies;
- if (delay < delay_left)
- mod_delayed_work(system_unbound_wq, &krcp->monitor_work, delay);
- return;
- }
- queue_delayed_work(system_unbound_wq, &krcp->monitor_work, delay);
-}
-
-static void
-schedule_delayed_monitor_work(struct kfree_rcu_cpu *krcp)
-{
- unsigned long flags;
-
- raw_spin_lock_irqsave(&krcp->lock, flags);
- __schedule_delayed_monitor_work(krcp);
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
-}
-
-static void
-kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
-{
- struct list_head bulk_ready[FREE_N_CHANNELS];
- struct kvfree_rcu_bulk_data *bnode, *n;
- struct rcu_head *head_ready = NULL;
- unsigned long flags;
- int i;
-
- raw_spin_lock_irqsave(&krcp->lock, flags);
- for (i = 0; i < FREE_N_CHANNELS; i++) {
- INIT_LIST_HEAD(&bulk_ready[i]);
-
- list_for_each_entry_safe_reverse(bnode, n, &krcp->bulk_head[i], list) {
- if (!poll_state_synchronize_rcu_full(&bnode->gp_snap))
- break;
-
- atomic_sub(bnode->nr_records, &krcp->bulk_count[i]);
- list_move(&bnode->list, &bulk_ready[i]);
- }
- }
-
- if (krcp->head && poll_state_synchronize_rcu(krcp->head_gp_snap)) {
- head_ready = krcp->head;
- atomic_set(&krcp->head_count, 0);
- WRITE_ONCE(krcp->head, NULL);
- }
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
-
- for (i = 0; i < FREE_N_CHANNELS; i++) {
- list_for_each_entry_safe(bnode, n, &bulk_ready[i], list)
- kvfree_rcu_bulk(krcp, bnode, i);
- }
-
- if (head_ready)
- kvfree_rcu_list(head_ready);
-}
-
-/*
- * Return: %true if a work is queued, %false otherwise.
- */
-static bool
-kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp)
-{
- unsigned long flags;
- bool queued = false;
- int i, j;
-
- raw_spin_lock_irqsave(&krcp->lock, flags);
-
- // Attempt to start a new batch.
- for (i = 0; i < KFREE_N_BATCHES; i++) {
- struct kfree_rcu_cpu_work *krwp = &(krcp->krw_arr[i]);
-
- // Try to detach bulk_head or head and attach it to krwp, but
- // only when all channels are free. If any channel is busy, this
- // krwp still has in-flight RCU work handling its previous batch.
- if (need_wait_for_krwp_work(krwp))
- continue;
-
- // kvfree_rcu_drain_ready() might handle this krcp, if so give up.
- if (need_offload_krc(krcp)) {
- // Channel 1 corresponds to the SLAB-pointer bulk path.
- // Channel 2 corresponds to vmalloc-pointer bulk path.
- for (j = 0; j < FREE_N_CHANNELS; j++) {
- if (list_empty(&krwp->bulk_head_free[j])) {
- atomic_set(&krcp->bulk_count[j], 0);
- list_replace_init(&krcp->bulk_head[j],
- &krwp->bulk_head_free[j]);
- }
- }
-
- // Channel 3 corresponds to both SLAB and vmalloc
- // objects queued on the linked list.
- if (!krwp->head_free) {
- krwp->head_free = krcp->head;
- get_state_synchronize_rcu_full(&krwp->head_free_gp_snap);
- atomic_set(&krcp->head_count, 0);
- WRITE_ONCE(krcp->head, NULL);
- }
-
- // There is one work item per batch, and each batch covers all
- // three "free channels". Break out of the loop: this CPU is
- // done, so queuing the RCU work here is _always_ a success.
- queued = queue_rcu_work(system_unbound_wq, &krwp->rcu_work);
- WARN_ON_ONCE(!queued);
- break;
- }
- }
-
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
- return queued;
-}
-
-/*
- * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
- */
-static void kfree_rcu_monitor(struct work_struct *work)
-{
- struct kfree_rcu_cpu *krcp = container_of(work,
- struct kfree_rcu_cpu, monitor_work.work);
-
- // Drain ready for reclaim.
- kvfree_rcu_drain_ready(krcp);
-
- // Queue a batch for the rest.
- kvfree_rcu_queue_batch(krcp);
-
- // If there is nothing left to detach, the job is done. If at
- // least one channel is still busy, rearm the work to try again
- // later, because previous batches are still in progress.
- if (need_offload_krc(krcp))
- schedule_delayed_monitor_work(krcp);
-}
-
-static enum hrtimer_restart
-schedule_page_work_fn(struct hrtimer *t)
-{
- struct kfree_rcu_cpu *krcp =
- container_of(t, struct kfree_rcu_cpu, hrtimer);
-
- queue_delayed_work(system_highpri_wq, &krcp->page_cache_work, 0);
- return HRTIMER_NORESTART;
-}
-
-static void fill_page_cache_func(struct work_struct *work)
-{
- struct kvfree_rcu_bulk_data *bnode;
- struct kfree_rcu_cpu *krcp =
- container_of(work, struct kfree_rcu_cpu,
- page_cache_work.work);
- unsigned long flags;
- int nr_pages;
- bool pushed;
- int i;
-
- nr_pages = atomic_read(&krcp->backoff_page_cache_fill) ?
- 1 : rcu_min_cached_objs;
-
- for (i = READ_ONCE(krcp->nr_bkv_objs); i < nr_pages; i++) {
- bnode = (struct kvfree_rcu_bulk_data *)
- __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
-
- if (!bnode)
- break;
-
- raw_spin_lock_irqsave(&krcp->lock, flags);
- pushed = put_cached_bnode(krcp, bnode);
- raw_spin_unlock_irqrestore(&krcp->lock, flags);
-
- if (!pushed) {
- free_page((unsigned long) bnode);
- break;
- }
- }
-
- atomic_set(&krcp->work_in_progress, 0);
- atomic_set(&krcp->backoff_page_cache_fill, 0);
-}
-
-static void
-run_page_cache_worker(struct kfree_rcu_cpu *krcp)
-{
- // If cache disabled, bail out.
- if (!rcu_min_cached_objs)
- return;
-
- if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING &&
- !atomic_xchg(&krcp->work_in_progress, 1)) {
- if (atomic_read(&krcp->backoff_page_cache_fill)) {
- queue_delayed_work(system_unbound_wq,
- &krcp->page_cache_work,
- msecs_to_jiffies(rcu_delay_page_cache_fill_msec));
- } else {
- hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- krcp->hrtimer.function = schedule_page_work_fn;
- hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL);
- }
- }
-}
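
run_page_cache_worker() above uses atomic_xchg() as a cheap claim so that at most one refill is in flight at a time; fill_page_cache_func() clears the flag once it finishes. The same idiom rendered with C11 atomics, purely for illustration:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int work_in_progress;

/* Only the first caller to flip the flag actually queues the refill. */
static void maybe_queue_refill(void)
{
	if (!atomic_exchange(&work_in_progress, 1))
		printf("queueing refill work\n");	/* hrtimer_start() stand-in */
	else
		printf("refill already pending\n");
}

int main(void)
{
	maybe_queue_refill();
	maybe_queue_refill();
	atomic_store(&work_in_progress, 0);	/* done by the worker itself */
	return 0;
}
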
-
-// Record ptr in a page managed by krcp, with the pre-krc_this_cpu_lock()
-// state specified by flags. If can_alloc is true, the caller must
-// be schedulable and not be holding any locks or mutexes that might be
-// acquired by the memory allocator or anything that it might invoke.
-// Returns true if ptr was successfully recorded, else the caller must
-// use a fallback.
-static inline bool
-add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
- unsigned long *flags, void *ptr, bool can_alloc)
-{
- struct kvfree_rcu_bulk_data *bnode;
- int idx;
-
- *krcp = krc_this_cpu_lock(flags);
- if (unlikely(!(*krcp)->initialized))
- return false;
-
- idx = !!is_vmalloc_addr(ptr);
- bnode = list_first_entry_or_null(&(*krcp)->bulk_head[idx],
- struct kvfree_rcu_bulk_data, list);
-
- /* Check if a new block is required. */
- if (!bnode || bnode->nr_records == KVFREE_BULK_MAX_ENTR) {
- bnode = get_cached_bnode(*krcp);
- if (!bnode && can_alloc) {
- krc_this_cpu_unlock(*krcp, *flags);
-
- // __GFP_NORETRY - allows light-weight direct reclaim, which is
- // enough to keep fallback hits rare, and forbids OOM invocation,
- // which is also desirable since we are about to release memory.
- //
- // __GFP_NOMEMALLOC - prevents consuming all of the memory
- // reserves. Note that there is a fallback path anyway.
- //
- // __GFP_NOWARN - the allocation is expected to fail under low
- // memory or high memory-pressure scenarios, so do not warn.
- bnode = (struct kvfree_rcu_bulk_data *)
- __get_free_page(GFP_KERNEL | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
- raw_spin_lock_irqsave(&(*krcp)->lock, *flags);
- }
-
- if (!bnode)
- return false;
-
- // Initialize the new block and attach it.
- bnode->nr_records = 0;
- list_add(&bnode->list, &(*krcp)->bulk_head[idx]);
- }
-
- // Finally insert and update the GP for this page.
- bnode->nr_records++;
- bnode->records[bnode->nr_records - 1] = ptr;
- get_state_synchronize_rcu_full(&bnode->gp_snap);
- atomic_inc(&(*krcp)->bulk_count[idx]);
-
- return true;
-}
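
A GFP_KERNEL allocation may block in direct reclaim, so it cannot be issued while the raw per-CPU lock is held with interrupts off; that is why the function drops the lock, allocates with the conservative flags described above, and re-takes the lock before attaching the new block. A generic userspace sketch of that unlock/alloc/relock shape, with a mutex in place of the raw spinlock:

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static void *cached_block;

/* Take the cached block, or allocate a new one with the lock dropped. */
static void *get_block(size_t size)
{
	void *blk;

	pthread_mutex_lock(&lock);
	blk = cached_block;
	cached_block = NULL;
	if (!blk) {
		pthread_mutex_unlock(&lock);	/* never allocate under the lock */
		blk = malloc(size);		/* may block, like GFP_KERNEL */
		pthread_mutex_lock(&lock);
	}
	pthread_mutex_unlock(&lock);
	return blk;
}

int main(void)
{
	free(get_block(64));
	return 0;
}
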
-
-/*
- * Queue a request for lazy invocation of the appropriate free routine
- * after a grace period. Please note that three paths are maintained,
- * two for the common case using arrays of pointers and a third one that
- * is used only when the main paths cannot be used, for example, due to
- * memory pressure.
- *
- * Each kvfree_call_rcu() request is added to a batch. The batch is drained
- * every KFREE_DRAIN_JIFFIES, and all objects in the batch are freed in
- * workqueue context. Batching requests together reduces the number of grace
- * periods needed under heavy kfree_rcu()/kvfree_rcu() load.
- */
-void kvfree_call_rcu(struct rcu_head *head, void *ptr)
-{
- unsigned long flags;
- struct kfree_rcu_cpu *krcp;
- bool success;
-
- /*
- * The head-less variant has a limitation, hence the clear
- * rule for such objects: they may only be used from a
- * might_sleep() context. Everywhere else, embed an rcu_head
- * in your data instead.
- */
- if (!head)
- might_sleep();
-
- // Queue the object but don't yet schedule the batch.
- if (debug_rcu_head_queue(ptr)) {
- // Probable double kfree_rcu(), just leak.
- WARN_ONCE(1, "%s(): Double-freed call. rcu_head %p\n",
- __func__, head);
-
- // Mark as success and leave.
- return;
- }
-
- kasan_record_aux_stack_noalloc(ptr);
- success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head);
- if (!success) {
- run_page_cache_worker(krcp);
-
- if (head == NULL)
- // Inline if kvfree_rcu(one_arg) call.
- goto unlock_return;
-
- head->func = ptr;
- head->next = krcp->head;
- WRITE_ONCE(krcp->head, head);
- atomic_inc(&krcp->head_count);
-
- // Take a snapshot for this krcp.
- krcp->head_gp_snap = get_state_synchronize_rcu();
- success = true;
- }
-
- /*
- * The kvfree_rcu() caller considers the pointer freed at this point
- * and likely removes any references to it. Since the actual slab
- * freeing (and kmemleak_free()) is deferred, tell kmemleak to ignore
- * this object (no scanning or false positives reporting).
- */
- kmemleak_ignore(ptr);
-
- // Set timer to drain after KFREE_DRAIN_JIFFIES.
- if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING)
- __schedule_delayed_monitor_work(krcp);
-
-unlock_return:
- krc_this_cpu_unlock(krcp, flags);
-
- /*
- * Inline kvfree() after synchronize_rcu(). We can do
- * it from might_sleep() context only, so the current
- * CPU can pass the QS state.
- */
- if (!success) {
- debug_rcu_head_unqueue((struct rcu_head *) ptr);
- synchronize_rcu();
- kvfree(ptr);
- }
-}
-EXPORT_SYMBOL_GPL(kvfree_call_rcu);
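
kvfree_call_rcu() is the backend of the kvfree_rcu() and kvfree_rcu_mightsleep() macros. A minimal usage sketch, with a hypothetical structure and field names:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo {				/* hypothetical object */
	int payload;
	struct rcu_head rh;		/* required for the two-argument form */
};

static void demo_release(struct demo *d)
{
	/* Common case: object plus rcu_head member, usable from any context. */
	kvfree_rcu(d, rh);
}

static void demo_release_headless(void *buf)
{
	/* Head-less form: sleepable context only, as noted above. */
	kvfree_rcu_mightsleep(buf);
}
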
-
-/**
- * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
- *
- * Note that the single-argument form of kvfree_rcu() has a slow path that
- * triggers synchronize_rcu() followed by freeing the pointer, and this is
- * done before the function returns. Therefore, for any single-argument call
- * that results in a kfree() to a cache that is to be destroyed during module
- * exit, it is the developer's responsibility to ensure that all such calls
- * have returned before calling kmem_cache_destroy().
- */
-void kvfree_rcu_barrier(void)
-{
- struct kfree_rcu_cpu_work *krwp;
- struct kfree_rcu_cpu *krcp;
- bool queued;
- int i, cpu;
-
- /*
- * First, detach objects and queue them as an RCU batch on every
- * CPU. Then flush the queued works for each CPU.
- *
- * Note that if a particular CPU has outstanding batches, those
- * have to finish before a new one can be queued.
- */
- for_each_possible_cpu(cpu) {
- krcp = per_cpu_ptr(&krc, cpu);
-
- /*
- * Check whether this CPU has any objects queued for a new GP
- * completion. If not, there is nothing to detach and we are done
- * with it. If any batch is pending or running for this "krcp", the
- * per-CPU flush_rcu_work() below waits for its completion (last step).
- */
- if (!need_offload_krc(krcp))
- continue;
-
- while (1) {
- /*
- * If we are unable to queue a new RCU work item it means either:
- * - batches for this CPU are still in flight and must be
- * flushed first, after which we repeat; or
- * - there are no objects to detach, due to concurrency.
- */
- queued = kvfree_rcu_queue_batch(krcp);
-
- /*
- * Bail out, if there is no need to offload this "krcp"
- * anymore. As noted earlier it can run concurrently.
- */
- if (queued || !need_offload_krc(krcp))
- break;
-
- /* There are ongoing batches. */
- for (i = 0; i < KFREE_N_BATCHES; i++) {
- krwp = &(krcp->krw_arr[i]);
- flush_rcu_work(&krwp->rcu_work);
- }
- }
- }
-
- /*
- * Now we guarantee that all objects are flushed.
- */
- for_each_possible_cpu(cpu) {
- krcp = per_cpu_ptr(&krc, cpu);
-
- /*
- * The monitor work can drain ready-to-reclaim objects
- * directly. Wait for its completion if it is running or pending.
- */
- cancel_delayed_work_sync(&krcp->monitor_work);
-
- for (i = 0; i < KFREE_N_BATCHES; i++) {
- krwp = &(krcp->krw_arr[i]);
- flush_rcu_work(&krwp->rcu_work);
- }
- }
-}
-EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
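
As the comment above implies, the typical caller is a module that frees slab objects through kvfree_rcu() and must be sure none are still in flight before its cache goes away. A sketch of such a module-exit path (names are illustrative):

#include <linux/rcupdate.h>
#include <linux/slab.h>

static struct kmem_cache *demo_cache;	/* hypothetical cache */

static void demo_exit(void)
{
	/* Wait for every queued kvfree_rcu() request to be processed... */
	kvfree_rcu_barrier();

	/* ...only then is it safe to destroy the backing cache. */
	kmem_cache_destroy(demo_cache);
}
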
-
-static unsigned long
-kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
-{
- int cpu;
- unsigned long count = 0;
-
- /* Snapshot count of all CPUs */
- for_each_possible_cpu(cpu) {
- struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
-
- count += krc_count(krcp);
- count += READ_ONCE(krcp->nr_bkv_objs);
- atomic_set(&krcp->backoff_page_cache_fill, 1);
- }
-
- return count == 0 ? SHRINK_EMPTY : count;
-}
-
-static unsigned long
-kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
-{
- int cpu, freed = 0;
-
- for_each_possible_cpu(cpu) {
- int count;
- struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
-
- count = krc_count(krcp);
- count += drain_page_cache(krcp);
- kfree_rcu_monitor(&krcp->monitor_work.work);
-
- sc->nr_to_scan -= count;
- freed += count;
-
- if (sc->nr_to_scan <= 0)
- break;
- }
-
- return freed == 0 ? SHRINK_STOP : freed;
-}
-
-void __init kfree_rcu_scheduler_running(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
-
- if (need_offload_krc(krcp))
- schedule_delayed_monitor_work(krcp);
- }
-}
-
/*
* During early boot, any blocking grace-period wait automatically
* implies a grace period.
@@ -4895,6 +4071,22 @@ rcu_boot_init_percpu_data(int cpu)
rcu_boot_init_nocb_percpu_data(rdp);
}
+static void rcu_thread_affine_rnp(struct task_struct *t, struct rcu_node *rnp)
+{
+ cpumask_var_t affinity;
+ int cpu;
+
+ if (!zalloc_cpumask_var(&affinity, GFP_KERNEL))
+ return;
+
+ for_each_leaf_node_possible_cpu(rnp, cpu)
+ cpumask_set_cpu(cpu, affinity);
+
+ kthread_affine_preferred(t, affinity);
+
+ free_cpumask_var(affinity);
+}
+
struct kthread_worker *rcu_exp_gp_kworker;
static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp)
@@ -4917,16 +4109,9 @@ static void rcu_spawn_exp_par_gp_kworker(struct rcu_node *rnp)
if (IS_ENABLED(CONFIG_RCU_EXP_KTHREAD))
sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, &param);
-}
-
-static struct task_struct *rcu_exp_par_gp_task(struct rcu_node *rnp)
-{
- struct kthread_worker *kworker = READ_ONCE(rnp->exp_kworker);
- if (!kworker)
- return NULL;
-
- return kworker->task;
+ rcu_thread_affine_rnp(kworker->task, rnp);
+ wake_up_process(kworker->task);
}
static void __init rcu_start_exp_gp_kworker(void)
@@ -4934,7 +4119,7 @@ static void __init rcu_start_exp_gp_kworker(void)
const char *name = "rcu_exp_gp_kthread_worker";
struct sched_param param = { .sched_priority = kthread_prio };
- rcu_exp_gp_kworker = kthread_create_worker(0, name);
+ rcu_exp_gp_kworker = kthread_run_worker(0, name);
if (IS_ERR_OR_NULL(rcu_exp_gp_kworker)) {
pr_err("Failed to create %s!\n", name);
rcu_exp_gp_kworker = NULL;
@@ -5012,67 +4197,6 @@ int rcutree_prepare_cpu(unsigned int cpu)
}
/*
- * Update kthreads affinity during CPU-hotplug changes.
- *
- * Set the per-rcu_node kthread's affinity to cover all CPUs that are
- * served by the rcu_node in question. The CPU hotplug lock is still
- * held, so the value of rnp->qsmaskinit will be stable.
- *
- * We don't include outgoingcpu in the affinity set, use -1 if there is
- * no outgoing CPU. If there are no CPUs left in the affinity set,
- * this function allows the kthread to execute on any CPU.
- *
- * Any future concurrent calls are serialized via ->kthread_mutex.
- */
-static void rcutree_affinity_setting(unsigned int cpu, int outgoingcpu)
-{
- cpumask_var_t cm;
- unsigned long mask;
- struct rcu_data *rdp;
- struct rcu_node *rnp;
- struct task_struct *task_boost, *task_exp;
-
- rdp = per_cpu_ptr(&rcu_data, cpu);
- rnp = rdp->mynode;
-
- task_boost = rcu_boost_task(rnp);
- task_exp = rcu_exp_par_gp_task(rnp);
-
- /*
- * If CPU is the boot one, those tasks are created later from early
- * initcall since kthreadd must be created first.
- */
- if (!task_boost && !task_exp)
- return;
-
- if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
- return;
-
- mutex_lock(&rnp->kthread_mutex);
- mask = rcu_rnp_online_cpus(rnp);
- for_each_leaf_node_possible_cpu(rnp, cpu)
- if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
- cpu != outgoingcpu)
- cpumask_set_cpu(cpu, cm);
- cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU));
- if (cpumask_empty(cm)) {
- cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU));
- if (outgoingcpu >= 0)
- cpumask_clear_cpu(outgoingcpu, cm);
- }
-
- if (task_exp)
- set_cpus_allowed_ptr(task_exp, cm);
-
- if (task_boost)
- set_cpus_allowed_ptr(task_boost, cm);
-
- mutex_unlock(&rnp->kthread_mutex);
-
- free_cpumask_var(cm);
-}
-
-/*
* Has the specified (known valid) CPU ever been fully online?
*/
bool rcu_cpu_beenfullyonline(int cpu)
@@ -5100,7 +4224,6 @@ int rcutree_online_cpu(unsigned int cpu)
if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
return 0; /* Too early in boot for scheduler work. */
sync_sched_exp_online_cleanup(cpu);
- rcutree_affinity_setting(cpu, -1);
// Stop-machine done, so allow nohz_full to disable tick.
tick_dep_clear(TICK_DEP_BIT_RCU);
@@ -5317,8 +4440,6 @@ int rcutree_offline_cpu(unsigned int cpu)
rnp->ffmask &= ~rdp->grpmask;
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- rcutree_affinity_setting(cpu, cpu);
-
// nohz_full CPUs need the tick for stop-machine to work quickly
tick_dep_set(TICK_DEP_BIT_RCU);
return 0;
@@ -5648,62 +4769,12 @@ static void __init rcu_dump_rcu_node_tree(void)
struct workqueue_struct *rcu_gp_wq;
-static void __init kfree_rcu_batch_init(void)
-{
- int cpu;
- int i, j;
- struct shrinker *kfree_rcu_shrinker;
-
- /* Clamp it to [0:100] seconds interval. */
- if (rcu_delay_page_cache_fill_msec < 0 ||
- rcu_delay_page_cache_fill_msec > 100 * MSEC_PER_SEC) {
-
- rcu_delay_page_cache_fill_msec =
- clamp(rcu_delay_page_cache_fill_msec, 0,
- (int) (100 * MSEC_PER_SEC));
-
- pr_info("Adjusting rcutree.rcu_delay_page_cache_fill_msec to %d ms.\n",
- rcu_delay_page_cache_fill_msec);
- }
-
- for_each_possible_cpu(cpu) {
- struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
-
- for (i = 0; i < KFREE_N_BATCHES; i++) {
- INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
- krcp->krw_arr[i].krcp = krcp;
-
- for (j = 0; j < FREE_N_CHANNELS; j++)
- INIT_LIST_HEAD(&krcp->krw_arr[i].bulk_head_free[j]);
- }
-
- for (i = 0; i < FREE_N_CHANNELS; i++)
- INIT_LIST_HEAD(&krcp->bulk_head[i]);
-
- INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor);
- INIT_DELAYED_WORK(&krcp->page_cache_work, fill_page_cache_func);
- krcp->initialized = true;
- }
-
- kfree_rcu_shrinker = shrinker_alloc(0, "rcu-kfree");
- if (!kfree_rcu_shrinker) {
- pr_err("Failed to allocate kfree_rcu() shrinker!\n");
- return;
- }
-
- kfree_rcu_shrinker->count_objects = kfree_rcu_shrink_count;
- kfree_rcu_shrinker->scan_objects = kfree_rcu_shrink_scan;
-
- shrinker_register(kfree_rcu_shrinker);
-}
-
void __init rcu_init(void)
{
int cpu = smp_processor_id();
rcu_early_boot_tests();
- kfree_rcu_batch_init();
rcu_bootup_announce();
sanitize_kthread_prio();
rcu_init_geometry();
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index fb664d3a01c95..77efed89c79e3 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -227,16 +227,16 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_node *rnp, bool wake)
/*
* Report expedited quiescent state for multiple CPUs, all covered by the
- * specified leaf rcu_node structure.
+ * specified leaf rcu_node structure, which is acquired by the caller.
*/
-static void rcu_report_exp_cpu_mult(struct rcu_node *rnp,
+static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, unsigned long flags,
unsigned long mask, bool wake)
+ __releases(rnp->lock)
{
int cpu;
- unsigned long flags;
struct rcu_data *rdp;
- raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ raw_lockdep_assert_held_rcu_node(rnp);
if (!(rnp->expmask & mask)) {
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
@@ -257,8 +257,13 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp,
*/
static void rcu_report_exp_rdp(struct rcu_data *rdp)
{
+ unsigned long flags;
+ struct rcu_node *rnp = rdp->mynode;
+
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
WRITE_ONCE(rdp->cpu_no_qs.b.exp, false);
- rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true);
+ ASSERT_EXCLUSIVE_WRITER(rdp->cpu_no_qs.b.exp);
+ rcu_report_exp_cpu_mult(rnp, flags, rdp->grpmask, true);
}
/* Common code for work-done checking. */
@@ -432,8 +437,10 @@ retry_ipi:
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
/* Report quiescent states for those that went offline. */
- if (mask_ofl_test)
- rcu_report_exp_cpu_mult(rnp, mask_ofl_test, false);
+ if (mask_ofl_test) {
+ raw_spin_lock_irqsave_rcu_node(rnp, flags);
+ rcu_report_exp_cpu_mult(rnp, flags, mask_ofl_test, false);
+ }
}
static void rcu_exp_sel_wait_wake(unsigned long s);
@@ -712,6 +719,18 @@ static void rcu_exp_sel_wait_wake(unsigned long s)
rcu_exp_wait_wake(s);
}
+/* Request an expedited quiescent state. */
+static void rcu_exp_need_qs(void)
+{
+ lockdep_assert_irqs_disabled();
+ ASSERT_EXCLUSIVE_WRITER_SCOPED(*this_cpu_ptr(&rcu_data.cpu_no_qs.b.exp));
+ __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true);
+ /* Store .exp before .rcu_urgent_qs. */
+ smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true);
+ set_tsk_need_resched(current);
+ set_preempt_need_resched();
+}
+
#ifdef CONFIG_PREEMPT_RCU
/*
@@ -730,24 +749,34 @@ static void rcu_exp_handler(void *unused)
struct task_struct *t = current;
/*
- * First, the common case of not being in an RCU read-side
+ * First, is there no need for a quiescent state from this CPU,
+ * or is this CPU already looking for a quiescent state for the
+ * current grace period? If either is the case, just leave.
+ * However, this should not happen due to the preemptible
+ * sync_sched_exp_online_cleanup() implementation being a no-op,
+ * so warn if this does happen.
+ */
+ ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp);
+ if (WARN_ON_ONCE(!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
+ READ_ONCE(rdp->cpu_no_qs.b.exp)))
+ return;
+
+ /*
+ * Second, the common case of not being in an RCU read-side
* critical section. If also enabled or idle, immediately
* report the quiescent state, otherwise defer.
*/
if (!depth) {
if (!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK)) ||
- rcu_is_cpu_rrupt_from_idle()) {
+ rcu_is_cpu_rrupt_from_idle())
rcu_report_exp_rdp(rdp);
- } else {
- WRITE_ONCE(rdp->cpu_no_qs.b.exp, true);
- set_tsk_need_resched(t);
- set_preempt_need_resched();
- }
+ else
+ rcu_exp_need_qs();
return;
}
/*
- * Second, the less-common case of being in an RCU read-side
+ * Third, the less-common case of being in an RCU read-side
* critical section. In this case we can count on a future
* rcu_read_unlock(). However, this rcu_read_unlock() might
* execute on some other CPU, but in that case there will be
@@ -768,7 +797,7 @@ static void rcu_exp_handler(void *unused)
return;
}
- // Finally, negative nesting depth should not happen.
+ // Fourth and finally, negative nesting depth should not happen.
WARN_ON_ONCE(1);
}
@@ -835,16 +864,6 @@ static void rcu_exp_print_detail_task_stall_rnp(struct rcu_node *rnp)
#else /* #ifdef CONFIG_PREEMPT_RCU */
-/* Request an expedited quiescent state. */
-static void rcu_exp_need_qs(void)
-{
- __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true);
- /* Store .exp before .rcu_urgent_qs. */
- smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true);
- set_tsk_need_resched(current);
- set_preempt_need_resched();
-}
-
/* Invoked on each online non-idle CPU for expedited quiescent state. */
static void rcu_exp_handler(void *unused)
{
@@ -852,6 +871,7 @@ static void rcu_exp_handler(void *unused)
struct rcu_node *rnp = rdp->mynode;
bool preempt_bh_enabled = !(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK));
+ ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp);
if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) ||
__this_cpu_read(rcu_data.cpu_no_qs.b.exp))
return;
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 3927ea5f7955c..3600152b858e8 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -275,6 +275,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
rcu_report_exp_rdp(rdp);
else
WARN_ON_ONCE(rdp->cpu_no_qs.b.exp);
+ ASSERT_EXCLUSIVE_WRITER_SCOPED(rdp->cpu_no_qs.b.exp);
}
/*
@@ -1217,16 +1218,13 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rnp->boost_kthread_task = t;
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+
sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ rcu_thread_affine_rnp(t, rnp);
wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
}
-static struct task_struct *rcu_boost_task(struct rcu_node *rnp)
-{
- return READ_ONCE(rnp->boost_kthread_task);
-}
-
#else /* #ifdef CONFIG_RCU_BOOST */
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
@@ -1243,10 +1241,6 @@ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
{
}
-static struct task_struct *rcu_boost_task(struct rcu_node *rnp)
-{
- return NULL;
-}
#endif /* #else #ifdef CONFIG_RCU_BOOST */
/*
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index f8436969e0c89..c912b594ba987 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -527,12 +527,12 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
#if IS_ENABLED(CONFIG_RCU_TORTURE_TEST) || IS_MODULE(CONFIG_RCU_TORTURE_TEST) || IS_ENABLED(CONFIG_LOCK_TORTURE_TEST) || IS_MODULE(CONFIG_LOCK_TORTURE_TEST)
/* Get rcutorture access to sched_setaffinity(). */
-long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+long torture_sched_setaffinity(pid_t pid, const struct cpumask *in_mask, bool dowarn)
{
int ret;
ret = sched_setaffinity(pid, in_mask);
- WARN_ONCE(ret, "%s: sched_setaffinity(%d) returned %d\n", __func__, pid, ret);
+ WARN_ONCE(dowarn && ret, "%s: sched_setaffinity(%d) returned %d\n", __func__, pid, ret);
return ret;
}
EXPORT_SYMBOL_GPL(torture_sched_setaffinity);
diff --git a/kernel/reboot.c b/kernel/reboot.c
index a701000bab347..b5a8569e5d81f 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -1287,7 +1287,7 @@ static struct attribute *reboot_attrs[] = {
};
#ifdef CONFIG_SYSCTL
-static struct ctl_table kern_reboot_table[] = {
+static const struct ctl_table kern_reboot_table[] = {
{
.procname = "poweroff_cmd",
.data = &poweroff_cmd,
diff --git a/kernel/resource.c b/kernel/resource.c
index b7c0e24d93980..12004452d999e 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1683,8 +1683,7 @@ void __devm_release_region(struct device *dev, struct resource *parent,
{
struct region_devres match_data = { parent, start, n };
- __release_region(parent, start, n);
- WARN_ON(devres_destroy(dev, devm_region_release, devm_region_match,
+ WARN_ON(devres_release(dev, devm_region_release, devm_region_match,
&match_data));
}
EXPORT_SYMBOL(__devm_release_region);
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 9de6e35fe6791..2cb16091ec0ae 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -13,6 +13,7 @@
#include <linux/syscalls.h>
#include <linux/rseq.h>
#include <linux/types.h>
+#include <linux/ratelimit.h>
#include <asm/ptrace.h>
#define CREATE_TRACE_POINTS
@@ -25,6 +26,78 @@
RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \
RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE)
+#ifdef CONFIG_DEBUG_RSEQ
+static struct rseq *rseq_kernel_fields(struct task_struct *t)
+{
+ return (struct rseq *) t->rseq_fields;
+}
+
+static int rseq_validate_ro_fields(struct task_struct *t)
+{
+ static DEFINE_RATELIMIT_STATE(_rs,
+ DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+ u32 cpu_id_start, cpu_id, node_id, mm_cid;
+ struct rseq __user *rseq = t->rseq;
+
+ /*
+ * Validate fields which are required to be read-only by
+ * user-space.
+ */
+ if (!user_read_access_begin(rseq, t->rseq_len))
+ goto efault;
+ unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end);
+ unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end);
+ unsafe_get_user(node_id, &rseq->node_id, efault_end);
+ unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end);
+ user_read_access_end();
+
+ if ((cpu_id_start != rseq_kernel_fields(t)->cpu_id_start ||
+ cpu_id != rseq_kernel_fields(t)->cpu_id ||
+ node_id != rseq_kernel_fields(t)->node_id ||
+ mm_cid != rseq_kernel_fields(t)->mm_cid) && __ratelimit(&_rs)) {
+
+ pr_warn("Detected rseq corruption for pid: %d, name: %s\n"
+ "\tcpu_id_start: %u ?= %u\n"
+ "\tcpu_id: %u ?= %u\n"
+ "\tnode_id: %u ?= %u\n"
+ "\tmm_cid: %u ?= %u\n",
+ t->pid, t->comm,
+ cpu_id_start, rseq_kernel_fields(t)->cpu_id_start,
+ cpu_id, rseq_kernel_fields(t)->cpu_id,
+ node_id, rseq_kernel_fields(t)->node_id,
+ mm_cid, rseq_kernel_fields(t)->mm_cid);
+ }
+
+ /* For now, only print a console warning on mismatch. */
+ return 0;
+
+efault_end:
+ user_read_access_end();
+efault:
+ return -EFAULT;
+}
+
+static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id,
+ u32 node_id, u32 mm_cid)
+{
+ rseq_kernel_fields(t)->cpu_id_start = cpu_id;
+ rseq_kernel_fields(t)->cpu_id = cpu_id;
+ rseq_kernel_fields(t)->node_id = node_id;
+ rseq_kernel_fields(t)->mm_cid = mm_cid;
+}
+#else
+static int rseq_validate_ro_fields(struct task_struct *t)
+{
+ return 0;
+}
+
+static void rseq_set_ro_fields(struct task_struct *t, u32 cpu_id_start, u32 cpu_id,
+ u32 node_id, u32 mm_cid)
+{
+}
+#endif
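
The CONFIG_DEBUG_RSEQ machinery added above is a shadow-copy check: the kernel remembers the values it last wrote to the user-visible read-only fields, re-reads them before the next update, and warns if user space has modified them. The same idea in a tiny standalone form, for illustration only:

#include <stdio.h>
#include <string.h>

struct ro_fields { unsigned int cpu_id; unsigned int node_id; };

static struct ro_fields shadow;		/* last values the "kernel" wrote */

static void write_fields(struct ro_fields *user, unsigned int cpu, unsigned int node)
{
	user->cpu_id = cpu;
	user->node_id = node;
	shadow = *user;			/* remember what was written */
}

static int validate_fields(const struct ro_fields *user)
{
	if (memcmp(user, &shadow, sizeof(shadow))) {
		fprintf(stderr, "read-only fields were modified\n");
		return -1;
	}
	return 0;
}

int main(void)
{
	struct ro_fields user = { 0, 0 };

	write_fields(&user, 3, 0);
	user.cpu_id = 7;		/* misbehaving "user space" */
	return validate_fields(&user) ? 1 : 0;
}
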
+
/*
*
* Restartable sequences are a lightweight interface that allows
@@ -92,6 +165,11 @@ static int rseq_update_cpu_node_id(struct task_struct *t)
u32 node_id = cpu_to_node(cpu_id);
u32 mm_cid = task_mm_cid(t);
+ /*
+ * Validate read-only rseq fields.
+ */
+ if (rseq_validate_ro_fields(t))
+ goto efault;
WARN_ON_ONCE((int) mm_cid < 0);
if (!user_write_access_begin(rseq, t->rseq_len))
goto efault;
@@ -105,6 +183,7 @@ static int rseq_update_cpu_node_id(struct task_struct *t)
* t->rseq_len != ORIG_RSEQ_SIZE.
*/
user_write_access_end();
+ rseq_set_ro_fields(t, cpu_id, cpu_id, node_id, mm_cid);
trace_rseq_update(t);
return 0;
@@ -120,6 +199,11 @@ static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
mm_cid = 0;
/*
+ * Validate read-only rseq fields.
+ */
+ if (rseq_validate_ro_fields(t))
+ return -EFAULT;
+ /*
* Reset cpu_id_start to its initial state (0).
*/
if (put_user(cpu_id_start, &t->rseq->cpu_id_start))
@@ -141,6 +225,9 @@ static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
*/
if (put_user(mm_cid, &t->rseq->mm_cid))
return -EFAULT;
+
+ rseq_set_ro_fields(t, cpu_id_start, cpu_id, node_id, mm_cid);
+
/*
* Additional feature fields added after ORIG_RSEQ_SIZE
* need to be conditionally reset only if
@@ -420,9 +507,25 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
return -EINVAL;
if (!access_ok(rseq, rseq_len))
return -EFAULT;
+#ifdef CONFIG_DEBUG_RSEQ
+ /*
+ * Initialize the in-kernel rseq fields copy for validation of
+ * read-only fields.
+ */
+ if (get_user(rseq_kernel_fields(current)->cpu_id_start, &rseq->cpu_id_start) ||
+ get_user(rseq_kernel_fields(current)->cpu_id, &rseq->cpu_id) ||
+ get_user(rseq_kernel_fields(current)->node_id, &rseq->node_id) ||
+ get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid))
+ return -EFAULT;
+#endif
+ /*
+ * Activate the registration by setting the rseq area address, length
+ * and signature in the task struct.
+ */
current->rseq = rseq;
current->rseq_len = rseq_len;
current->rseq_sig = sig;
+
/*
* If rseq was previously inactive, and has just been
* registered, ensure the cpu_id_start and cpu_id fields
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index db68a964e34e2..2b331822c7e77 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -9,7 +9,7 @@ static struct autogroup autogroup_default;
static atomic_t autogroup_seq_nr;
#ifdef CONFIG_SYSCTL
-static struct ctl_table sched_autogroup_sysctls[] = {
+static const struct ctl_table sched_autogroup_sysctls[] = {
{
.procname = "sched_autogroup_enabled",
.data = &sysctl_sched_autogroup_enabled,
@@ -150,7 +150,7 @@ void sched_autogroup_exit_task(struct task_struct *p)
* see this thread after that: we can no longer use signal->autogroup.
* See the PF_EXITING check in task_wants_autogroup().
*/
- sched_move_task(p);
+ sched_move_task(p, true);
}
static void
@@ -182,7 +182,7 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
* sched_autogroup_exit_task().
*/
for_each_thread(p, t)
- sched_move_task(t);
+ sched_move_task(t, true);
unlock_task_sighand(p, &flags);
autogroup_kref_put(prev);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3e5a6bf587f91..9aecd914ac691 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -740,39 +740,43 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
s64 __maybe_unused steal = 0, irq_delta = 0;
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
- irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+ if (irqtime_enabled()) {
+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
- /*
- * Since irq_time is only updated on {soft,}irq_exit, we might run into
- * this case when a previous update_rq_clock() happened inside a
- * {soft,}IRQ region.
- *
- * When this happens, we stop ->clock_task and only update the
- * prev_irq_time stamp to account for the part that fit, so that a next
- * update will consume the rest. This ensures ->clock_task is
- * monotonic.
- *
- * It does however cause some slight miss-attribution of {soft,}IRQ
- * time, a more accurate solution would be to update the irq_time using
- * the current rq->clock timestamp, except that would require using
- * atomic ops.
- */
- if (irq_delta > delta)
- irq_delta = delta;
+ /*
+ * Since irq_time is only updated on {soft,}irq_exit, we might run into
+ * this case when a previous update_rq_clock() happened inside a
+ * {soft,}IRQ region.
+ *
+ * When this happens, we stop ->clock_task and only update the
+ * prev_irq_time stamp to account for the part that fit, so that a next
+ * update will consume the rest. This ensures ->clock_task is
+ * monotonic.
+ *
+ * It does however cause some slight miss-attribution of {soft,}IRQ
+ * time, a more accurate solution would be to update the irq_time using
+ * the current rq->clock timestamp, except that would require using
+ * atomic ops.
+ */
+ if (irq_delta > delta)
+ irq_delta = delta;
- rq->prev_irq_time += irq_delta;
- delta -= irq_delta;
- delayacct_irq(rq->curr, irq_delta);
+ rq->prev_irq_time += irq_delta;
+ delta -= irq_delta;
+ delayacct_irq(rq->curr, irq_delta);
+ }
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((&paravirt_steal_rq_enabled))) {
- steal = paravirt_steal_clock(cpu_of(rq));
+ u64 prev_steal;
+
+ steal = prev_steal = paravirt_steal_clock(cpu_of(rq));
steal -= rq->prev_steal_time_rq;
if (unlikely(steal > delta))
steal = delta;
- rq->prev_steal_time_rq += steal;
+ rq->prev_steal_time_rq = prev_steal;
delta -= steal;
}
#endif
@@ -789,6 +793,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
void update_rq_clock(struct rq *rq)
{
s64 delta;
+ u64 clock;
lockdep_assert_rq_held(rq);
@@ -800,11 +805,14 @@ void update_rq_clock(struct rq *rq)
SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
rq->clock_update_flags |= RQCF_UPDATED;
#endif
+ clock = sched_clock_cpu(cpu_of(rq));
+ scx_rq_clock_update(rq, clock);
- delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+ delta = clock - rq->clock;
if (delta < 0)
return;
rq->clock += delta;
+
update_rq_clock_task(rq, delta);
}
@@ -1055,9 +1063,10 @@ void wake_up_q(struct wake_q_head *head)
struct task_struct *task;
task = container_of(node, struct task_struct, wake_q);
- /* Task can safely be re-inserted now: */
node = node->next;
- task->wake_q.next = NULL;
+ /* pairs with cmpxchg_relaxed() in __wake_q_add() */
+ WRITE_ONCE(task->wake_q.next, NULL);
+ /* Task can safely be re-inserted now. */
/*
* wake_up_process() executes a full barrier, which pairs with
@@ -1168,13 +1177,13 @@ int get_nohz_timer_target(void)
struct sched_domain *sd;
const struct cpumask *hk_mask;
- if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) {
+ if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE)) {
if (!idle_cpu(cpu))
return cpu;
default_cpu = cpu;
}
- hk_mask = housekeeping_cpumask(HK_TYPE_TIMER);
+ hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
guard(rcu)();
@@ -1189,7 +1198,7 @@ int get_nohz_timer_target(void)
}
if (default_cpu == -1)
- default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER);
+ default_cpu = housekeeping_any_cpu(HK_TYPE_KERNEL_NOISE);
return default_cpu;
}
@@ -1341,7 +1350,7 @@ bool sched_can_stop_tick(struct rq *rq)
if (scx_enabled() && !scx_can_stop_tick(rq))
return false;
- if (rq->cfs.h_nr_running > 1)
+ if (rq->cfs.h_nr_queued > 1)
return false;
/*
@@ -3534,7 +3543,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
*
* More yuck to audit.
*/
- do_set_cpus_allowed(p, task_cpu_possible_mask(p));
+ do_set_cpus_allowed(p, task_cpu_fallback_mask(p));
state = fail;
break;
case fail:
@@ -4646,7 +4655,7 @@ static int sysctl_schedstats(const struct ctl_table *table, int write, void *buf
#endif /* CONFIG_SCHEDSTATS */
#ifdef CONFIG_SYSCTL
-static struct ctl_table sched_core_sysctls[] = {
+static const struct ctl_table sched_core_sysctls[] = {
#ifdef CONFIG_SCHEDSTATS
{
.procname = "sched_schedstats",
@@ -5632,7 +5641,7 @@ void sched_tick(void)
unsigned long hw_pressure;
u64 resched_latency;
- if (housekeeping_cpu(cpu, HK_TYPE_TICK))
+ if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
arch_scale_freq_tick();
sched_clock_tick();
@@ -5771,7 +5780,7 @@ static void sched_tick_start(int cpu)
int os;
struct tick_work *twork;
- if (housekeeping_cpu(cpu, HK_TYPE_TICK))
+ if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
return;
WARN_ON_ONCE(!tick_work_cpu);
@@ -5792,7 +5801,7 @@ static void sched_tick_stop(int cpu)
struct tick_work *twork;
int os;
- if (housekeeping_cpu(cpu, HK_TYPE_TICK))
+ if (housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
return;
WARN_ON_ONCE(!tick_work_cpu);
@@ -6018,7 +6027,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* opportunity to pull in more work from other CPUs.
*/
if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
- rq->nr_running == rq->cfs.h_nr_running)) {
+ rq->nr_running == rq->cfs.h_nr_queued)) {
p = pick_next_task_fair(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
@@ -6641,7 +6650,6 @@ static void __sched notrace __schedule(int sched_mode)
* as a preemption by schedule_debug() and RCU.
*/
bool preempt = sched_mode > SM_NONE;
- bool block = false;
unsigned long *switch_count;
unsigned long prev_state;
struct rq_flags rf;
@@ -6702,7 +6710,7 @@ static void __sched notrace __schedule(int sched_mode)
goto picked;
}
} else if (!preempt && prev_state) {
- block = try_to_block_task(rq, prev, prev_state);
+ try_to_block_task(rq, prev, prev_state);
switch_count = &prev->nvcsw;
}
@@ -6748,7 +6756,8 @@ picked:
migrate_disable_switch(rq, prev);
psi_account_irqtime(rq, prev, next);
- psi_sched_switch(prev, next, block);
+ psi_sched_switch(prev, next, !task_on_rq_queued(prev) ||
+ prev->se.sched_delayed);
trace_sched_switch(preempt, prev, next, prev_state);
@@ -7701,9 +7710,9 @@ void sched_show_task(struct task_struct *p)
if (pid_alive(p))
ppid = task_pid_nr(rcu_dereference(p->real_parent));
rcu_read_unlock();
- pr_cont(" stack:%-5lu pid:%-5d tgid:%-5d ppid:%-6d flags:0x%08lx\n",
+ pr_cont(" stack:%-5lu pid:%-5d tgid:%-5d ppid:%-6d task_flags:0x%04x flags:0x%08lx\n",
free, task_pid_nr(p), task_tgid_nr(p),
- ppid, read_task_thread_flags(p));
+ ppid, p->flags, read_task_thread_flags(p));
print_worker_info(KERN_INFO, p);
print_stop_info(KERN_INFO, p);
@@ -7930,19 +7939,26 @@ void sched_setnuma(struct task_struct *p, int nid)
#ifdef CONFIG_HOTPLUG_CPU
/*
- * Ensure that the idle task is using init_mm right before its CPU goes
- * offline.
+ * Invoked on the outgoing CPU in context of the CPU hotplug thread
+ * after ensuring that there are no user space tasks left on the CPU.
+ *
+ * If there is a lazy mm in use on the hotplug thread, drop it and
+ * switch to init_mm.
+ *
+ * The reference count on init_mm is dropped in finish_cpu().
*/
-void idle_task_exit(void)
+static void sched_force_init_mm(void)
{
struct mm_struct *mm = current->active_mm;
- BUG_ON(cpu_online(smp_processor_id()));
- BUG_ON(current != this_rq()->idle);
-
if (mm != &init_mm) {
- switch_mm(mm, &init_mm, current);
+ mmgrab_lazy_tlb(&init_mm);
+ local_irq_disable();
+ current->active_mm = &init_mm;
+ switch_mm_irqs_off(mm, &init_mm, current);
+ local_irq_enable();
finish_arch_post_lock_switch();
+ mmdrop_lazy_tlb(mm);
}
/* finish_cpu(), as ran on the BP, will clean up the active_mm state */
@@ -8180,19 +8196,14 @@ static void cpuset_cpu_active(void)
cpuset_update_active_cpus();
}
-static int cpuset_cpu_inactive(unsigned int cpu)
+static void cpuset_cpu_inactive(unsigned int cpu)
{
if (!cpuhp_tasks_frozen) {
- int ret = dl_bw_check_overflow(cpu);
-
- if (ret)
- return ret;
cpuset_update_active_cpus();
} else {
num_cpus_frozen++;
partition_sched_domains(1, NULL, NULL);
}
- return 0;
}
static inline void sched_smt_present_inc(int cpu)
@@ -8254,6 +8265,11 @@ int sched_cpu_deactivate(unsigned int cpu)
struct rq *rq = cpu_rq(cpu);
int ret;
+ ret = dl_bw_deactivate(cpu);
+
+ if (ret)
+ return ret;
+
/*
* Remove CPU from nohz.idle_cpus_mask to prevent participating in
* load balancing when not active
@@ -8299,15 +8315,7 @@ int sched_cpu_deactivate(unsigned int cpu)
return 0;
sched_update_numa(cpu, false);
- ret = cpuset_cpu_inactive(cpu);
- if (ret) {
- sched_smt_present_inc(cpu);
- sched_set_rq_online(rq, cpu);
- balance_push_set(cpu, false);
- set_cpu_active(cpu, true);
- sched_update_numa(cpu, true);
- return ret;
- }
+ cpuset_cpu_inactive(cpu);
sched_domains_numa_masks_clear(cpu);
return 0;
}
@@ -8344,6 +8352,7 @@ int sched_cpu_starting(unsigned int cpu)
int sched_cpu_wait_empty(unsigned int cpu)
{
balance_hotplug_wait();
+ sched_force_init_mm();
return 0;
}
@@ -9042,7 +9051,7 @@ static void sched_change_group(struct task_struct *tsk, struct task_group *group
* now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
* its new group.
*/
-void sched_move_task(struct task_struct *tsk)
+void sched_move_task(struct task_struct *tsk, bool for_autogroup)
{
int queued, running, queue_flags =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
@@ -9071,7 +9080,8 @@ void sched_move_task(struct task_struct *tsk)
put_prev_task(rq, tsk);
sched_change_group(tsk, group);
- scx_move_task(tsk);
+ if (!for_autogroup)
+ scx_cgroup_move_task(tsk);
if (queued)
enqueue_task(rq, tsk, queue_flags);
@@ -9172,7 +9182,7 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
struct cgroup_subsys_state *css;
cgroup_taskset_for_each(task, css, tset)
- sched_move_task(task);
+ sched_move_task(task, false);
scx_cgroup_finish_attach();
}
@@ -10590,7 +10600,7 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
return;
/* No page allocation under rq lock */
- task_work_add(curr, work, TWA_RESUME | TWAF_NO_ALLOC);
+ task_work_add(curr, work, TWA_RESUME);
}
void sched_mm_cid_exit_signals(struct task_struct *t)
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 28c77904ea749..1a19d69b91ed3 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -83,7 +83,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
if (unlikely(sg_policy->limits_changed)) {
sg_policy->limits_changed = false;
- sg_policy->need_freq_update = true;
+ sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
return true;
}
@@ -96,7 +96,7 @@ static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
unsigned int next_freq)
{
if (sg_policy->need_freq_update)
- sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
+ sg_policy->need_freq_update = false;
else if (sg_policy->next_freq == next_freq)
return false;
@@ -604,31 +604,6 @@ static const struct kobj_type sugov_tunables_ktype = {
/********************** cpufreq governor interface *********************/
-#ifdef CONFIG_ENERGY_MODEL
-static void rebuild_sd_workfn(struct work_struct *work)
-{
- rebuild_sched_domains_energy();
-}
-
-static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);
-
-/*
- * EAS shouldn't be attempted without sugov, so rebuild the sched_domains
- * on governor changes to make sure the scheduler knows about it.
- */
-static void sugov_eas_rebuild_sd(void)
-{
- /*
- * When called from the cpufreq_register_driver() path, the
- * cpu_hotplug_lock is already held, so use a work item to
- * avoid nested locking in rebuild_sched_domains().
- */
- schedule_work(&rebuild_sd_work);
-}
-#else
-static inline void sugov_eas_rebuild_sd(void) { };
-#endif
-
struct cpufreq_governor schedutil_gov;
static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
@@ -691,7 +666,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
}
sg_policy->thread = thread;
- kthread_bind_mask(thread, policy->related_cpus);
+ if (policy->dvfs_possible_from_any_cpu)
+ set_cpus_allowed_ptr(thread, policy->related_cpus);
+ else
+ kthread_bind_mask(thread, policy->related_cpus);
+
init_irq_work(&sg_policy->irq_work, sugov_irq_work);
mutex_init(&sg_policy->work_lock);
@@ -784,7 +763,11 @@ static int sugov_init(struct cpufreq_policy *policy)
goto fail;
out:
- sugov_eas_rebuild_sd();
+ /*
+ * Schedutil is the preferred governor for EAS, so rebuild sched domains
+ * on governor changes to make sure the scheduler knows about them.
+ */
+ em_rebuild_sched_domains();
mutex_unlock(&global_tunables_lock);
return 0;
@@ -826,7 +809,7 @@ static void sugov_exit(struct cpufreq_policy *policy)
sugov_policy_free(sg_policy);
cpufreq_disable_fast_switch(policy);
- sugov_eas_rebuild_sd();
+ em_rebuild_sched_domains();
}
static int sugov_start(struct cpufreq_policy *policy)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 0bed0fa1acd98..5d9143dd08791 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -9,6 +9,8 @@
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+DEFINE_STATIC_KEY_FALSE(sched_clock_irqtime);
+
/*
* There are no locks covering percpu hardirq/softirq time.
* They are only modified in vtime_account, on corresponding CPU
@@ -22,16 +24,14 @@
*/
DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
-static int sched_clock_irqtime;
-
void enable_sched_clock_irqtime(void)
{
- sched_clock_irqtime = 1;
+ static_branch_enable(&sched_clock_irqtime);
}
void disable_sched_clock_irqtime(void)
{
- sched_clock_irqtime = 0;
+ static_branch_disable(&sched_clock_irqtime);
}
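
The int flag becomes a static key, so the hot-path checks below compile down to a patched branch. The irqtime_enabled() helper they call is presumably a thin wrapper along these lines (an assumption; the actual definition lives in a scheduler header, with a stub returning false when CONFIG_IRQ_TIME_ACCOUNTING is off):

/* Presumed shape of the helper used by the irqtime_enabled() callers below. */
static inline bool irqtime_enabled(void)
{
	return static_branch_likely(&sched_clock_irqtime);
}
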
static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
@@ -57,7 +57,7 @@ void irqtime_account_irq(struct task_struct *curr, unsigned int offset)
s64 delta;
int cpu;
- if (!sched_clock_irqtime)
+ if (!irqtime_enabled())
return;
cpu = smp_processor_id();
@@ -90,8 +90,6 @@ static u64 irqtime_tick_accounted(u64 maxtime)
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-#define sched_clock_irqtime (0)
-
static u64 irqtime_tick_accounted(u64 dummy)
{
return 0;
@@ -478,7 +476,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
if (vtime_accounting_enabled_this_cpu())
return;
- if (sched_clock_irqtime) {
+ if (irqtime_enabled()) {
irqtime_account_process_tick(p, user_tick, 1);
return;
}
@@ -507,7 +505,7 @@ void account_idle_ticks(unsigned long ticks)
{
u64 cputime, steal;
- if (sched_clock_irqtime) {
+ if (irqtime_enabled()) {
irqtime_account_idle_ticks(ticks);
return;
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index d94f2ed6d1f46..38e4537790af7 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -26,7 +26,7 @@
static unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */
static unsigned int sysctl_sched_dl_period_min = 100; /* 100 us */
#ifdef CONFIG_SYSCTL
-static struct ctl_table sched_dl_sysctls[] = {
+static const struct ctl_table sched_dl_sysctls[] = {
{
.procname = "sched_deadline_period_max_us",
.data = &sysctl_sched_dl_period_max,
@@ -342,6 +342,29 @@ static void dl_rq_change_utilization(struct rq *rq, struct sched_dl_entity *dl_s
__add_rq_bw(new_bw, &rq->dl);
}
+static __always_inline
+void cancel_dl_timer(struct sched_dl_entity *dl_se, struct hrtimer *timer)
+{
+ /*
+ * If the timer callback was running (hrtimer_try_to_cancel == -1),
+ * it will eventually call put_task_struct().
+ */
+ if (hrtimer_try_to_cancel(timer) == 1 && !dl_server(dl_se))
+ put_task_struct(dl_task_of(dl_se));
+}
+
+static __always_inline
+void cancel_replenish_timer(struct sched_dl_entity *dl_se)
+{
+ cancel_dl_timer(dl_se, &dl_se->dl_timer);
+}
+
+static __always_inline
+void cancel_inactive_timer(struct sched_dl_entity *dl_se)
+{
+ cancel_dl_timer(dl_se, &dl_se->inactive_timer);
+}
+
static void dl_change_utilization(struct task_struct *p, u64 new_bw)
{
WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV);
@@ -495,10 +518,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
* will not touch the rq's active utilization,
* so we are still safe.
*/
- if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) {
- if (!dl_server(dl_se))
- put_task_struct(dl_task_of(dl_se));
- }
+ cancel_inactive_timer(dl_se);
} else {
/*
* Since "dl_non_contending" is not set, the
@@ -2115,13 +2135,8 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* The replenish timer needs to be canceled. No
* problem if it fires concurrently: boosted threads
* are ignored in dl_task_timer().
- *
- * If the timer callback was running (hrtimer_try_to_cancel == -1),
- * it will eventually call put_task_struct().
*/
- if (hrtimer_try_to_cancel(&p->dl.dl_timer) == 1 &&
- !dl_server(&p->dl))
- put_task_struct(p);
+ cancel_replenish_timer(&p->dl);
p->dl.dl_throttled = 0;
}
} else if (!dl_prio(p->normal_prio)) {
@@ -2289,8 +2304,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
* will not touch the rq's active utilization,
* so we are still safe.
*/
- if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
- put_task_struct(p);
+ cancel_inactive_timer(&p->dl);
}
sub_rq_bw(&p->dl, &rq->dl);
rq_unlock(rq, &rf);
@@ -2506,16 +2520,13 @@ static struct task_struct *pick_earliest_pushable_dl_task(struct rq *rq, int cpu
return NULL;
next_node = rb_first_cached(&rq->dl.pushable_dl_tasks_root);
-
-next_node:
- if (next_node) {
+ while (next_node) {
p = __node_2_pdl(next_node);
if (task_is_pushable(rq, p, cpu))
return p;
next_node = rb_next(next_node);
- goto next_node;
}
return NULL;
@@ -2964,11 +2975,22 @@ void dl_add_task_root_domain(struct task_struct *p)
void dl_clear_root_domain(struct root_domain *rd)
{
- unsigned long flags;
+ int i;
- raw_spin_lock_irqsave(&rd->dl_bw.lock, flags);
+ guard(raw_spinlock_irqsave)(&rd->dl_bw.lock);
rd->dl_bw.total_bw = 0;
- raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags);
+
+ /*
+ * dl_server bandwidth is only restored when CPUs are attached to root
+ * domains (after domains are created or CPUs moved back to the
+ * default root domain).
+ */
+ for_each_cpu(i, rd->span) {
+ struct sched_dl_entity *dl_se = &cpu_rq(i)->fair_server;
+
+ if (dl_server(dl_se) && cpu_active(i))
+ rd->dl_bw.total_bw += dl_se->dl_bw;
+ }
}
#endif /* CONFIG_SMP */
@@ -3029,8 +3051,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
*/
static void switched_to_dl(struct rq *rq, struct task_struct *p)
{
- if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
- put_task_struct(p);
+ cancel_inactive_timer(&p->dl);
/*
* In case a task is setscheduled to SCHED_DEADLINE we need to keep
@@ -3453,29 +3474,31 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
}
enum dl_bw_request {
- dl_bw_req_check_overflow = 0,
+ dl_bw_req_deactivate = 0,
dl_bw_req_alloc,
dl_bw_req_free
};
static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
{
- unsigned long flags;
+ unsigned long flags, cap;
struct dl_bw *dl_b;
bool overflow = 0;
+ u64 fair_server_bw = 0;
rcu_read_lock_sched();
dl_b = dl_bw_of(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
- if (req == dl_bw_req_free) {
+ cap = dl_bw_capacity(cpu);
+ switch (req) {
+ case dl_bw_req_free:
__dl_sub(dl_b, dl_bw, dl_bw_cpus(cpu));
- } else {
- unsigned long cap = dl_bw_capacity(cpu);
-
+ break;
+ case dl_bw_req_alloc:
overflow = __dl_overflow(dl_b, cap, 0, dl_bw);
- if (req == dl_bw_req_alloc && !overflow) {
+ if (!overflow) {
/*
* We reserve space in the destination
* root_domain, as we can't fail after this point.
@@ -3484,6 +3507,42 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
*/
__dl_add(dl_b, dl_bw, dl_bw_cpus(cpu));
}
+ break;
+ case dl_bw_req_deactivate:
+ /*
+ * cpu is not off yet, but we need to do the math by
+ * considering it off already (i.e., what would happen if we
+ * turn cpu off?).
+ */
+ cap -= arch_scale_cpu_capacity(cpu);
+
+ /*
+ * cpu is going offline and NORMAL tasks will be moved away
+ * from it. We can thus discount dl_server bandwidth
+ * contribution as it won't need to be servicing tasks after
+ * the cpu is off.
+ */
+ if (cpu_rq(cpu)->fair_server.dl_server)
+ fair_server_bw = cpu_rq(cpu)->fair_server.dl_bw;
+
+ /*
+ * Not much to check if no DEADLINE bandwidth is present.
+ * dl_servers we can discount, as tasks will be moved off the
+ * offlined CPUs anyway.
+ */
+ if (dl_b->total_bw - fair_server_bw > 0) {
+ /*
+ * Leaving at least one CPU for DEADLINE tasks seems a
+ * wise thing to do. As said above, cpu is not offline
+ * yet, so account for that.
+ */
+ if (dl_bw_cpus(cpu) - 1)
+ overflow = __dl_overflow(dl_b, cap, fair_server_bw, 0);
+ else
+ overflow = 1;
+ }
+
+ break;
}
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
@@ -3492,9 +3551,9 @@ static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
return overflow ? -EBUSY : 0;
}
-int dl_bw_check_overflow(int cpu)
+int dl_bw_deactivate(int cpu)
{
- return dl_bw_manage(dl_bw_req_check_overflow, cpu, 0);
+ return dl_bw_manage(dl_bw_req_deactivate, cpu, 0);
}
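For intuition, a worked example with illustrative numbers (BW_SHIFT fixed-point scaling elided): on a root domain of four CPUs, each with capacity 1024 and the default 95% admission limit, deactivating one CPU leaves cap = 3072. If the reserved DEADLINE bandwidth amounts to 40% of one CPU and the departing CPU's fair server accounts for 5%, the remaining 35% of a CPU's worth of bandwidth easily fits within 95% of the three surviving CPUs' capacity and dl_bw_deactivate() returns 0; if the surviving CPUs could not absorb that leftover bandwidth, or no CPU would remain to host it at all, it returns -EBUSY.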
int dl_bw_alloc(int cpu, u64 dl_bw)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index a1be00a988bf6..ef047add7f9e6 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -379,7 +379,7 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
return -EINVAL;
}
- if (rq->cfs.h_nr_running) {
+ if (rq->cfs.h_nr_queued) {
update_rq_clock(rq);
dl_server_stop(&rq->fair_server);
}
@@ -392,7 +392,7 @@ static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubu
printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n",
cpu_of(rq));
- if (rq->cfs.h_nr_running)
+ if (rq->cfs.h_nr_queued)
dl_server_start(&rq->fair_server);
}
@@ -843,13 +843,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SPLIT_NS(right_vruntime));
spread = right_vruntime - left_vruntime;
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread));
- SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
- SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
- SEQ_printf(m, " .%-30s: %d\n", "h_nr_delayed", cfs_rq->h_nr_delayed);
- SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running",
- cfs_rq->idle_nr_running);
- SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running",
- cfs_rq->idle_h_nr_running);
+ SEQ_printf(m, " .%-30s: %d\n", "nr_queued", cfs_rq->nr_queued);
+ SEQ_printf(m, " .%-30s: %d\n", "h_nr_runnable", cfs_rq->h_nr_runnable);
+ SEQ_printf(m, " .%-30s: %d\n", "h_nr_queued", cfs_rq->h_nr_queued);
+ SEQ_printf(m, " .%-30s: %d\n", "h_nr_idle", cfs_rq->h_nr_idle);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_SMP
SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
@@ -1265,6 +1262,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
if (task_has_dl_policy(p)) {
P(dl.runtime);
P(dl.deadline);
+ } else if (fair_policy(p->policy)) {
+ P(se.slice);
}
#ifdef CONFIG_SCHED_CLASS_EXT
__PS("ext.enabled", task_on_scx(p));
@@ -1295,8 +1294,10 @@ void resched_latency_warn(int cpu, u64 latency)
{
static DEFINE_RATELIMIT_STATE(latency_check_ratelimit, 60 * 60 * HZ, 1);
- WARN(__ratelimit(&latency_check_ratelimit),
- "sched: CPU %d need_resched set for > %llu ns (%d ticks) "
- "without schedule\n",
- cpu, latency, cpu_rq(cpu)->ticks_without_resched);
+ if (likely(!__ratelimit(&latency_check_ratelimit)))
+ return;
+
+ pr_err("sched: CPU %d need_resched set for > %llu ns (%d ticks) without schedule\n",
+ cpu, latency, cpu_rq(cpu)->ticks_without_resched);
+ dump_stack();
}
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 7fff1d0454770..5a81d9a1e31f2 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -123,6 +123,19 @@ enum scx_ops_flags {
SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
/*
+ * A migration disabled task can only execute on its current CPU. By
+ * default, such tasks are automatically put on the CPU's local DSQ with
+ * the default slice on enqueue. If this ops flag is set, they also go
+ * through ops.enqueue().
+ *
+ * A migration disabled task never invokes ops.select_cpu() as it can
+ * only select the current CPU. Also, p->cpus_ptr will only contain its
+ * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
+ * and thus may disagree with cpumask_weight(p->cpus_ptr).
+ */
+ SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
+
+ /*
* CPU cgroup support flags
*/
SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* cpu.weight */
@@ -130,6 +143,7 @@ enum scx_ops_flags {
SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
SCX_OPS_ENQ_LAST |
SCX_OPS_ENQ_EXITING |
+ SCX_OPS_ENQ_MIGRATION_DISABLED |
SCX_OPS_SWITCH_PARTIAL |
SCX_OPS_HAS_CGROUP_WEIGHT,
};
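As a hedged sketch of how a BPF scheduler opts in from the scheduler side (the SEC/BPF_STRUCT_OPS macros are the usual scx tooling conventions, the callback and scheduler names are illustrative, and the insert kfunc is scx_bpf_dsq_insert() on recent kernels, scx_bpf_dispatch() on older ones):

void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
{
	/*
	 * With SCX_OPS_ENQ_MIGRATION_DISABLED set, migration-disabled tasks
	 * also arrive here; they may only run on their current CPU, so the
	 * local DSQ is the natural target.
	 */
	scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
}

SEC(".struct_ops.link")
struct sched_ext_ops example_ops = {
	.enqueue	= (void *)example_enqueue,
	.flags		= SCX_OPS_ENQ_MIGRATION_DISABLED,
	.name		= "example",
};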
@@ -206,7 +220,7 @@ struct scx_dump_ctx {
*/
struct sched_ext_ops {
/**
- * select_cpu - Pick the target CPU for a task which is being woken up
+ * @select_cpu: Pick the target CPU for a task which is being woken up
* @p: task being woken up
* @prev_cpu: the cpu @p was on before sleeping
* @wake_flags: SCX_WAKE_*
@@ -233,7 +247,7 @@ struct sched_ext_ops {
s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags);
/**
- * enqueue - Enqueue a task on the BPF scheduler
+ * @enqueue: Enqueue a task on the BPF scheduler
* @p: task being enqueued
* @enq_flags: %SCX_ENQ_*
*
@@ -248,7 +262,7 @@ struct sched_ext_ops {
void (*enqueue)(struct task_struct *p, u64 enq_flags);
/**
- * dequeue - Remove a task from the BPF scheduler
+ * @dequeue: Remove a task from the BPF scheduler
* @p: task being dequeued
* @deq_flags: %SCX_DEQ_*
*
@@ -264,7 +278,7 @@ struct sched_ext_ops {
void (*dequeue)(struct task_struct *p, u64 deq_flags);
/**
- * dispatch - Dispatch tasks from the BPF scheduler and/or user DSQs
+ * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs
* @cpu: CPU to dispatch tasks for
* @prev: previous task being switched out
*
@@ -287,7 +301,7 @@ struct sched_ext_ops {
void (*dispatch)(s32 cpu, struct task_struct *prev);
/**
- * tick - Periodic tick
+ * @tick: Periodic tick
* @p: task running currently
*
* This operation is called every 1/HZ seconds on CPUs which are
@@ -297,7 +311,7 @@ struct sched_ext_ops {
void (*tick)(struct task_struct *p);
/**
- * runnable - A task is becoming runnable on its associated CPU
+ * @runnable: A task is becoming runnable on its associated CPU
* @p: task becoming runnable
* @enq_flags: %SCX_ENQ_*
*
@@ -324,7 +338,7 @@ struct sched_ext_ops {
void (*runnable)(struct task_struct *p, u64 enq_flags);
/**
- * running - A task is starting to run on its associated CPU
+ * @running: A task is starting to run on its associated CPU
* @p: task starting to run
*
* See ->runnable() for explanation on the task state notifiers.
@@ -332,7 +346,7 @@ struct sched_ext_ops {
void (*running)(struct task_struct *p);
/**
- * stopping - A task is stopping execution
+ * @stopping: A task is stopping execution
* @p: task stopping to run
* @runnable: is task @p still runnable?
*
@@ -343,7 +357,7 @@ struct sched_ext_ops {
void (*stopping)(struct task_struct *p, bool runnable);
/**
- * quiescent - A task is becoming not runnable on its associated CPU
+ * @quiescent: A task is becoming not runnable on its associated CPU
* @p: task becoming not runnable
* @deq_flags: %SCX_DEQ_*
*
@@ -363,7 +377,7 @@ struct sched_ext_ops {
void (*quiescent)(struct task_struct *p, u64 deq_flags);
/**
- * yield - Yield CPU
+ * @yield: Yield CPU
* @from: yielding task
* @to: optional yield target task
*
@@ -378,7 +392,7 @@ struct sched_ext_ops {
bool (*yield)(struct task_struct *from, struct task_struct *to);
/**
- * core_sched_before - Task ordering for core-sched
+ * @core_sched_before: Task ordering for core-sched
* @a: task A
* @b: task B
*
@@ -396,7 +410,7 @@ struct sched_ext_ops {
bool (*core_sched_before)(struct task_struct *a, struct task_struct *b);
/**
- * set_weight - Set task weight
+ * @set_weight: Set task weight
* @p: task to set weight for
* @weight: new weight [1..10000]
*
@@ -405,7 +419,7 @@ struct sched_ext_ops {
void (*set_weight)(struct task_struct *p, u32 weight);
/**
- * set_cpumask - Set CPU affinity
+ * @set_cpumask: Set CPU affinity
* @p: task to set CPU affinity for
* @cpumask: cpumask of cpus that @p can run on
*
@@ -415,8 +429,8 @@ struct sched_ext_ops {
const struct cpumask *cpumask);
/**
- * update_idle - Update the idle state of a CPU
- * @cpu: CPU to udpate the idle state for
+ * @update_idle: Update the idle state of a CPU
+ * @cpu: CPU to update the idle state for
* @idle: whether entering or exiting the idle state
*
* This operation is called when @rq's CPU goes or leaves the idle
@@ -436,7 +450,7 @@ struct sched_ext_ops {
void (*update_idle)(s32 cpu, bool idle);
/**
- * cpu_acquire - A CPU is becoming available to the BPF scheduler
+ * @cpu_acquire: A CPU is becoming available to the BPF scheduler
* @cpu: The CPU being acquired by the BPF scheduler.
* @args: Acquire arguments, see the struct definition.
*
@@ -446,7 +460,7 @@ struct sched_ext_ops {
void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);
/**
- * cpu_release - A CPU is taken away from the BPF scheduler
+ * @cpu_release: A CPU is taken away from the BPF scheduler
* @cpu: The CPU being released by the BPF scheduler.
* @args: Release arguments, see the struct definition.
*
@@ -458,7 +472,7 @@ struct sched_ext_ops {
void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
/**
- * init_task - Initialize a task to run in a BPF scheduler
+ * @init_task: Initialize a task to run in a BPF scheduler
* @p: task to initialize for BPF scheduling
* @args: init arguments, see the struct definition
*
@@ -473,8 +487,9 @@ struct sched_ext_ops {
s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args);
/**
- * exit_task - Exit a previously-running task from the system
+ * @exit_task: Exit a previously-running task from the system
* @p: task to exit
+ * @args: exit arguments, see the struct definition
*
* @p is exiting or the BPF scheduler is being unloaded. Perform any
* necessary cleanup for @p.
@@ -482,7 +497,7 @@ struct sched_ext_ops {
void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args);
/**
- * enable - Enable BPF scheduling for a task
+ * @enable: Enable BPF scheduling for a task
* @p: task to enable BPF scheduling for
*
* Enable @p for BPF scheduling. enable() is called on @p any time it
@@ -491,7 +506,7 @@ struct sched_ext_ops {
void (*enable)(struct task_struct *p);
/**
- * disable - Disable BPF scheduling for a task
+ * @disable: Disable BPF scheduling for a task
* @p: task to disable BPF scheduling for
*
* @p is exiting, leaving SCX or the BPF scheduler is being unloaded.
@@ -501,7 +516,7 @@ struct sched_ext_ops {
void (*disable)(struct task_struct *p);
/**
- * dump - Dump BPF scheduler state on error
+ * @dump: Dump BPF scheduler state on error
* @ctx: debug dump context
*
* Use scx_bpf_dump() to generate BPF scheduler specific debug dump.
@@ -509,7 +524,7 @@ struct sched_ext_ops {
void (*dump)(struct scx_dump_ctx *ctx);
/**
- * dump_cpu - Dump BPF scheduler state for a CPU on error
+ * @dump_cpu: Dump BPF scheduler state for a CPU on error
* @ctx: debug dump context
* @cpu: CPU to generate debug dump for
* @idle: @cpu is currently idle without any runnable tasks
@@ -521,7 +536,7 @@ struct sched_ext_ops {
void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle);
/**
- * dump_task - Dump BPF scheduler state for a runnable task on error
+ * @dump_task: Dump BPF scheduler state for a runnable task on error
* @ctx: debug dump context
* @p: runnable task to generate debug dump for
*
@@ -532,7 +547,7 @@ struct sched_ext_ops {
#ifdef CONFIG_EXT_GROUP_SCHED
/**
- * cgroup_init - Initialize a cgroup
+ * @cgroup_init: Initialize a cgroup
* @cgrp: cgroup being initialized
* @args: init arguments, see the struct definition
*
@@ -547,7 +562,7 @@ struct sched_ext_ops {
struct scx_cgroup_init_args *args);
/**
- * cgroup_exit - Exit a cgroup
+ * @cgroup_exit: Exit a cgroup
* @cgrp: cgroup being exited
*
* Either the BPF scheduler is being unloaded or @cgrp destroyed, exit
@@ -556,7 +571,7 @@ struct sched_ext_ops {
void (*cgroup_exit)(struct cgroup *cgrp);
/**
- * cgroup_prep_move - Prepare a task to be moved to a different cgroup
+ * @cgroup_prep_move: Prepare a task to be moved to a different cgroup
* @p: task being moved
* @from: cgroup @p is being moved from
* @to: cgroup @p is being moved to
@@ -571,7 +586,7 @@ struct sched_ext_ops {
struct cgroup *from, struct cgroup *to);
/**
- * cgroup_move - Commit cgroup move
+ * @cgroup_move: Commit cgroup move
* @p: task being moved
* @from: cgroup @p is being moved from
* @to: cgroup @p is being moved to
@@ -582,7 +597,7 @@ struct sched_ext_ops {
struct cgroup *from, struct cgroup *to);
/**
- * cgroup_cancel_move - Cancel cgroup move
+ * @cgroup_cancel_move: Cancel cgroup move
* @p: task whose cgroup move is being canceled
* @from: cgroup @p was being moved from
* @to: cgroup @p was being moved to
@@ -594,7 +609,7 @@ struct sched_ext_ops {
struct cgroup *from, struct cgroup *to);
/**
- * cgroup_set_weight - A cgroup's weight is being changed
+ * @cgroup_set_weight: A cgroup's weight is being changed
* @cgrp: cgroup whose weight is being updated
* @weight: new weight [1..10000]
*
@@ -608,7 +623,7 @@ struct sched_ext_ops {
*/
/**
- * cpu_online - A CPU became online
+ * @cpu_online: A CPU became online
* @cpu: CPU which just came up
*
* @cpu just came online. @cpu will not call ops.enqueue() or
@@ -617,7 +632,7 @@ struct sched_ext_ops {
void (*cpu_online)(s32 cpu);
/**
- * cpu_offline - A CPU is going offline
+ * @cpu_offline: A CPU is going offline
* @cpu: CPU which is going offline
*
* @cpu is going offline. @cpu will not call ops.enqueue() or
@@ -630,12 +645,12 @@ struct sched_ext_ops {
*/
/**
- * init - Initialize the BPF scheduler
+ * @init: Initialize the BPF scheduler
*/
s32 (*init)(void);
/**
- * exit - Clean up after the BPF scheduler
+ * @exit: Clean up after the BPF scheduler
* @info: Exit info
*
* ops.exit() is also called on ops.init() failure, which is a bit
@@ -645,17 +660,17 @@ struct sched_ext_ops {
void (*exit)(struct scx_exit_info *info);
/**
- * dispatch_max_batch - Max nr of tasks that dispatch() can dispatch
+ * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch
*/
u32 dispatch_max_batch;
/**
- * flags - %SCX_OPS_* flags
+ * @flags: %SCX_OPS_* flags
*/
u64 flags;
/**
- * timeout_ms - The maximum amount of time, in milliseconds, that a
+ * @timeout_ms: The maximum amount of time, in milliseconds, that a
* runnable task should be able to wait before being scheduled. The
* maximum timeout may not exceed the default timeout of 30 seconds.
*
@@ -664,13 +679,13 @@ struct sched_ext_ops {
u32 timeout_ms;
/**
- * exit_dump_len - scx_exit_info.dump buffer length. If 0, the default
+ * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default
* value of 32768 is used.
*/
u32 exit_dump_len;
/**
- * hotplug_seq - A sequence number that may be set by the scheduler to
+ * @hotplug_seq: A sequence number that may be set by the scheduler to
* detect when a hotplug event has occurred during the loading process.
* If 0, no detection occurs. Otherwise, the scheduler will fail to
* load if the sequence number does not match @scx_hotplug_seq on the
@@ -679,7 +694,7 @@ struct sched_ext_ops {
u64 hotplug_seq;
/**
- * name - BPF scheduler's name
+ * @name: BPF scheduler's name
*
* Must be a non-zero valid BPF object name including only isalnum(),
* '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the
@@ -881,6 +896,7 @@ static bool scx_warned_zero_slice;
static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
+static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled);
static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt);
static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
@@ -960,7 +976,7 @@ static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task);
static struct scx_dispatch_q **global_dsqs;
static const struct rhashtable_params dsq_hash_params = {
- .key_len = 8,
+ .key_len = sizeof_field(struct scx_dispatch_q, id),
.key_offset = offsetof(struct scx_dispatch_q, id),
.head_offset = offsetof(struct scx_dispatch_q, hash_node),
};
@@ -1213,7 +1229,7 @@ static bool scx_kf_allowed_if_unlocked(void)
/**
* nldsq_next_task - Iterate to the next task in a non-local DSQ
- * @dsq: user dsq being interated
+ * @dsq: user dsq being iterated
* @cur: current position, %NULL to start iteration
* @rev: walk backwards
*
@@ -1408,7 +1424,6 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
/**
* scx_task_iter_next_locked - Next non-idle task with its rq locked
* @iter: iterator to walk
- * @include_dead: Whether we should include dead tasks in the iteration
*
* Visit the non-idle task with its rq lock held. Allows callers to specify
* whether they would like to filter out dead tasks. See scx_task_iter_start()
@@ -2014,6 +2029,11 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
unlikely(p->flags & PF_EXITING))
goto local;
+ /* see %SCX_OPS_ENQ_MIGRATION_DISABLED */
+ if (!static_branch_unlikely(&scx_ops_enq_migration_disabled) &&
+ is_migration_disabled(p))
+ goto local;
+
if (!SCX_HAS_OP(enqueue))
goto global;
@@ -2078,7 +2098,7 @@ static void set_task_runnable(struct rq *rq, struct task_struct *p)
/*
* list_add_tail() must be used. scx_ops_bypass() depends on tasks being
- * appened to the runnable_list.
+ * appended to the runnable_list.
*/
list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
}
@@ -2313,12 +2333,35 @@ static void move_remote_task_to_local_dsq(struct task_struct *p, u64 enq_flags,
*
* - The BPF scheduler is bypassed while the rq is offline and we can always say
* no to the BPF scheduler initiated migrations while offline.
+ *
+ * The caller must ensure that @p and @rq are on different CPUs.
*/
static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
bool trigger_error)
{
int cpu = cpu_of(rq);
+ SCHED_WARN_ON(task_cpu(p) == cpu);
+
+ /*
+ * If @p has migration disabled, @p->cpus_ptr is updated to contain only
+ * the pinned CPU in migrate_disable_switch() while @p is being switched
+ * out. However, put_prev_task_scx() is called before @p->cpus_ptr is
+ * updated and thus another CPU may see @p on a DSQ in between, leading to
+ * @p passing the below task_allowed_on_cpu() check while migration is
+ * disabled.
+ *
+ * Test the migration disabled state first as the race window is narrow
+ * and the BPF scheduler failing to check migration disabled state can
+ * easily be masked if task_allowed_on_cpu() is done first.
+ */
+ if (unlikely(is_migration_disabled(p))) {
+ if (trigger_error)
+ scx_ops_error("SCX_DSQ_LOCAL[_ON] cannot move migration disabled %s[%d] from CPU %d to %d",
+ p->comm, p->pid, task_cpu(p), cpu);
+ return false;
+ }
+
/*
* We don't require the BPF scheduler to avoid dispatching to offline
* CPUs mostly for convenience but also because CPUs can go offline
@@ -2327,14 +2370,11 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq,
*/
if (!task_allowed_on_cpu(p, cpu)) {
if (trigger_error)
- scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]",
- cpu_of(rq), p->comm, p->pid);
+ scx_ops_error("SCX_DSQ_LOCAL[_ON] target CPU %d not allowed for %s[%d]",
+ cpu, p->comm, p->pid);
return false;
}
- if (unlikely(is_migration_disabled(p)))
- return false;
-
if (!scx_rq_online(rq))
return false;
@@ -2437,7 +2477,8 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
if (dst_dsq->id == SCX_DSQ_LOCAL) {
dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
- if (!task_can_run_on_remote_rq(p, dst_rq, true)) {
+ if (src_rq != dst_rq &&
+ unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
dst_dsq = find_global_dsq(p);
dst_rq = src_rq;
}
@@ -2480,7 +2521,7 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags,
/*
* A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly
* banging on the same DSQ on a large NUMA system to the point where switching
- * to the bypass mode can take a long time. Inject artifical delays while the
+ * to the bypass mode can take a long time. Inject artificial delays while the
* bypass mode is switching to guarantee timely completion.
*/
static void scx_ops_breather(struct rq *rq)
@@ -2575,6 +2616,9 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
{
struct rq *src_rq = task_rq(p);
struct rq *dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq);
+#ifdef CONFIG_SMP
+ struct rq *locked_rq = rq;
+#endif
/*
* We're synchronized against dequeue through DISPATCHING. As @p can't
@@ -2588,7 +2632,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
}
#ifdef CONFIG_SMP
- if (unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
+ if (src_rq != dst_rq &&
+ unlikely(!task_can_run_on_remote_rq(p, dst_rq, true))) {
dispatch_enqueue(find_global_dsq(p), p,
enq_flags | SCX_ENQ_CLEAR_OPSS);
return;
@@ -2611,8 +2656,9 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
/* switch to @src_rq lock */
- if (rq != src_rq) {
- raw_spin_rq_unlock(rq);
+ if (locked_rq != src_rq) {
+ raw_spin_rq_unlock(locked_rq);
+ locked_rq = src_rq;
raw_spin_rq_lock(src_rq);
}
@@ -2630,6 +2676,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
} else {
move_remote_task_to_local_dsq(p, enq_flags,
src_rq, dst_rq);
+ /* task has been moved to dst_rq, which is now locked */
+ locked_rq = dst_rq;
}
/* if the destination CPU is idle, wake it up */
@@ -2638,8 +2686,8 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq,
}
/* switch back to @rq lock */
- if (rq != dst_rq) {
- raw_spin_rq_unlock(dst_rq);
+ if (locked_rq != rq) {
+ raw_spin_rq_unlock(locked_rq);
raw_spin_rq_lock(rq);
}
#else /* CONFIG_SMP */
@@ -2747,6 +2795,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
{
struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
bool prev_on_scx = prev->sched_class == &ext_sched_class;
+ bool prev_on_rq = prev->scx.flags & SCX_TASK_QUEUED;
int nr_loops = SCX_DSP_MAX_LOOPS;
lockdep_assert_rq_held(rq);
@@ -2779,8 +2828,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
* See scx_ops_disable_workfn() for the explanation on the
* bypassing test.
*/
- if ((prev->scx.flags & SCX_TASK_QUEUED) &&
- prev->scx.slice && !scx_rq_bypassing(rq)) {
+ if (prev_on_rq && prev->scx.slice && !scx_rq_bypassing(rq)) {
rq->scx.flags |= SCX_RQ_BAL_KEEP;
goto has_tasks;
}
@@ -2813,6 +2861,10 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
flush_dispatch_buf(rq);
+ if (prev_on_rq && prev->scx.slice) {
+ rq->scx.flags |= SCX_RQ_BAL_KEEP;
+ goto has_tasks;
+ }
if (rq->scx.local_dsq.nr)
goto has_tasks;
if (consume_global_dsq(rq))
@@ -2838,8 +2890,7 @@ no_tasks:
* Didn't find another task to run. Keep running @prev unless
* %SCX_OPS_ENQ_LAST is in effect.
*/
- if ((prev->scx.flags & SCX_TASK_QUEUED) &&
- (!static_branch_unlikely(&scx_ops_enq_last) ||
+ if (prev_on_rq && (!static_branch_unlikely(&scx_ops_enq_last) ||
scx_rq_bypassing(rq))) {
rq->scx.flags |= SCX_RQ_BAL_KEEP;
goto has_tasks;
@@ -3034,7 +3085,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
*/
if (p->scx.slice && !scx_rq_bypassing(rq)) {
dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
- return;
+ goto switch_class;
}
/*
@@ -3051,6 +3102,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
}
}
+switch_class:
if (next && next->sched_class != &ext_sched_class)
switch_class(rq, next);
}
@@ -3132,6 +3184,7 @@ static struct task_struct *pick_task_scx(struct rq *rq)
* scx_prio_less - Task ordering for core-sched
* @a: task A
* @b: task B
+ * @in_fi: in forced idle state
*
* Core-sched is implemented as an additional scheduling layer on top of the
* usual sched_class'es and needs to find out the expected task ordering. For
@@ -3139,7 +3192,7 @@ static struct task_struct *pick_task_scx(struct rq *rq)
*
* Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used
* to implement the default task ordering. The older the timestamp, the higher
- * prority the task - the global FIFO ordering matching the default scheduling
+ * priority the task - the global FIFO ordering matching the default scheduling
* behavior.
*
* When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to
@@ -3180,6 +3233,10 @@ static bool test_and_clear_cpu_idle(int cpu)
* scx_pick_idle_cpu() can get caught in an infinite loop as
* @cpu is never cleared from idle_masks.smt. Ensure that @cpu
* is eventually cleared.
+ *
+ * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to
+ * reduce memory writes, which may help alleviate cache
+ * coherence pressure.
*/
if (cpumask_intersects(smt, idle_masks.smt))
cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
@@ -3216,6 +3273,74 @@ found:
}
/*
+ * Return the number of CPUs in @cpu's LLC domain (or zero if the LLC
+ * domain is not defined).
+ */
+static unsigned int llc_weight(s32 cpu)
+{
+ struct sched_domain *sd;
+
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ if (!sd)
+ return 0;
+
+ return sd->span_weight;
+}
+
+/*
+ * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC
+ * domain is not defined).
+ */
+static struct cpumask *llc_span(s32 cpu)
+{
+ struct sched_domain *sd;
+
+ sd = rcu_dereference(per_cpu(sd_llc, cpu));
+ if (!sd)
+ return NULL;
+
+ return sched_domain_span(sd);
+}
+
+/*
+ * Return the number of CPUs in @cpu's NUMA domain (or zero if the
+ * NUMA domain is not defined).
+ */
+static unsigned int numa_weight(s32 cpu)
+{
+ struct sched_domain *sd;
+ struct sched_group *sg;
+
+ sd = rcu_dereference(per_cpu(sd_numa, cpu));
+ if (!sd)
+ return 0;
+ sg = sd->groups;
+ if (!sg)
+ return 0;
+
+ return sg->group_weight;
+}
+
+/*
+ * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA
+ * domain is not defined).
+ */
+static struct cpumask *numa_span(s32 cpu)
+{
+ struct sched_domain *sd;
+ struct sched_group *sg;
+
+ sd = rcu_dereference(per_cpu(sd_numa, cpu));
+ if (!sd)
+ return NULL;
+ sg = sd->groups;
+ if (!sg)
+ return NULL;
+
+ return sched_group_span(sg);
+}
+
+/*
* Return true if the LLC domains do not perfectly overlap with the NUMA
* domains, false otherwise.
*/
@@ -3246,19 +3371,10 @@ static bool llc_numa_mismatch(void)
* overlapping, which is incorrect (as NUMA 1 has two distinct LLC
* domains).
*/
- for_each_online_cpu(cpu) {
- const struct cpumask *numa_cpus;
- struct sched_domain *sd;
-
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
- if (!sd)
+ for_each_online_cpu(cpu)
+ if (llc_weight(cpu) != numa_weight(cpu))
return true;
- numa_cpus = cpumask_of_node(cpu_to_node(cpu));
- if (sd->span_weight != cpumask_weight(numa_cpus))
- return true;
- }
-
return false;
}
@@ -3276,8 +3392,7 @@ static bool llc_numa_mismatch(void)
static void update_selcpu_topology(void)
{
bool enable_llc = false, enable_numa = false;
- struct sched_domain *sd;
- const struct cpumask *cpus;
+ unsigned int nr_cpus;
s32 cpu = cpumask_first(cpu_online_mask);
/*
@@ -3291,10 +3406,12 @@ static void update_selcpu_topology(void)
* CPUs.
*/
rcu_read_lock();
- sd = rcu_dereference(per_cpu(sd_llc, cpu));
- if (sd) {
- if (sd->span_weight < num_online_cpus())
+ nr_cpus = llc_weight(cpu);
+ if (nr_cpus > 0) {
+ if (nr_cpus < num_online_cpus())
enable_llc = true;
+ pr_debug("sched_ext: LLC=%*pb weight=%u\n",
+ cpumask_pr_args(llc_span(cpu)), llc_weight(cpu));
}
/*
@@ -3306,15 +3423,19 @@ static void update_selcpu_topology(void)
* enabling both NUMA and LLC optimizations is unnecessary, as checking
* for an idle CPU in the same domain twice is redundant.
*/
- cpus = cpumask_of_node(cpu_to_node(cpu));
- if ((cpumask_weight(cpus) < num_online_cpus()) && llc_numa_mismatch())
- enable_numa = true;
+ nr_cpus = numa_weight(cpu);
+ if (nr_cpus > 0) {
+ if (nr_cpus < num_online_cpus() && llc_numa_mismatch())
+ enable_numa = true;
+ pr_debug("sched_ext: NUMA=%*pb weight=%u\n",
+ cpumask_pr_args(numa_span(cpu)), numa_weight(cpu));
+ }
rcu_read_unlock();
pr_debug("sched_ext: LLC idle selection %s\n",
- enable_llc ? "enabled" : "disabled");
+ str_enabled_disabled(enable_llc));
pr_debug("sched_ext: NUMA idle selection %s\n",
- enable_numa ? "enabled" : "disabled");
+ str_enabled_disabled(enable_numa));
if (enable_llc)
static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
@@ -3344,6 +3465,8 @@ static void update_selcpu_topology(void)
* 4. Pick a CPU within the same NUMA node, if enabled:
* - choose a CPU from the same NUMA node to reduce memory access latency.
*
+ * 5. Pick any idle CPU usable by the task.
+ *
* Step 3 and 4 are performed only if the system has, respectively, multiple
* LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and
* scx_selcpu_topo_numa).
@@ -3360,7 +3483,6 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
*found = false;
-
/*
* This is necessary to protect llc_cpus.
*/
@@ -3379,15 +3501,10 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
*/
if (p->nr_cpus_allowed >= num_possible_cpus()) {
if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa))
- numa_cpus = cpumask_of_node(cpu_to_node(prev_cpu));
+ numa_cpus = numa_span(prev_cpu);
- if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) {
- struct sched_domain *sd;
-
- sd = rcu_dereference(per_cpu(sd_llc, prev_cpu));
- if (sd)
- llc_cpus = sched_domain_span(sd);
- }
+ if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc))
+ llc_cpus = llc_span(prev_cpu);
}
/*
@@ -3586,20 +3703,9 @@ static void reset_idle_masks(void)
cpumask_copy(idle_masks.smt, cpu_online_mask);
}
-void __scx_update_idle(struct rq *rq, bool idle)
+static void update_builtin_idle(int cpu, bool idle)
{
- int cpu = cpu_of(rq);
-
- if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) {
- SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
- if (!static_branch_unlikely(&scx_builtin_idle_enabled))
- return;
- }
-
- if (idle)
- cpumask_set_cpu(cpu, idle_masks.cpu);
- else
- cpumask_clear_cpu(cpu, idle_masks.cpu);
+ assign_cpu(cpu, idle_masks.cpu, idle);
#ifdef CONFIG_SCHED_SMT
if (sched_smt_active()) {
@@ -3610,10 +3716,8 @@ void __scx_update_idle(struct rq *rq, bool idle)
* idle_masks.smt handling is racy but that's fine as
* it's only for optimization and self-correcting.
*/
- for_each_cpu(cpu, smt) {
- if (!cpumask_test_cpu(cpu, idle_masks.cpu))
- return;
- }
+ if (!cpumask_subset(smt, idle_masks.cpu))
+ return;
cpumask_or(idle_masks.smt, idle_masks.smt, smt);
} else {
cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
@@ -3622,6 +3726,57 @@ void __scx_update_idle(struct rq *rq, bool idle)
#endif
}
+/*
+ * Update the idle state of a CPU to @idle.
+ *
+ * If @do_notify is true, ops.update_idle() is invoked to notify the scx
+ * scheduler of an actual idle state transition (idle to busy or vice
+ * versa). If @do_notify is false, only the idle state in the idle masks is
+ * refreshed without invoking ops.update_idle().
+ *
+ * This distinction is necessary, because an idle CPU can be "reserved" and
+ * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as
+ * busy even if no tasks are dispatched. In this case, the CPU may return
+ * to idle without a true state transition. Refreshing the idle masks
+ * without invoking ops.update_idle() ensures accurate idle state tracking
+ * while avoiding unnecessary updates and maintaining balanced state
+ * transitions.
+ */
+void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
+{
+ int cpu = cpu_of(rq);
+
+ lockdep_assert_rq_held(rq);
+
+ /*
+ * Trigger ops.update_idle() only when transitioning from a task to
+ * the idle thread and vice versa.
+ *
+ * Idle transitions are indicated by do_notify being set to true,
+ * managed by put_prev_task_idle()/set_next_task_idle().
+ */
+ if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq))
+ SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
+
+ /*
+ * Update the idle masks:
+ * - for real idle transitions (do_notify == true)
+ * - for idle-to-idle transitions (indicated by the previous task
+ * being the idle thread, managed by pick_task_idle())
+ *
+ * Skip updating idle masks if the previous task is not the idle
+ * thread, since set_next_task_idle() has already handled it when
+ * transitioning from a task to the idle thread (calling this
+ * function with do_notify == true).
+ *
+ * In this way we can avoid updating the idle masks twice,
+ * unnecessarily.
+ */
+ if (static_branch_likely(&scx_builtin_idle_enabled))
+ if (do_notify || is_idle_task(rq->curr))
+ update_builtin_idle(cpu, idle);
+}
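For context, the do_notify split presumably maps onto three call sites in the idle class: put_prev_task_idle() and set_next_task_idle() pass do_notify == true for genuine busy<->idle transitions, while the pick_task_idle() path passes false so that an idle-to-idle pick merely refreshes the idle masks without re-invoking ops.update_idle().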
+
static void handle_hotplug(struct rq *rq, bool online)
{
int cpu = cpu_of(rq);
@@ -3744,7 +3899,7 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
curr->scx.slice = 0;
touch_core_sched(rq, curr);
} else if (SCX_HAS_OP(tick)) {
- SCX_CALL_OP(SCX_KF_REST, tick, curr);
+ SCX_CALL_OP_TASK(SCX_KF_REST, tick, curr);
}
if (!curr->scx.slice)
@@ -3891,7 +4046,7 @@ static void scx_ops_disable_task(struct task_struct *p)
WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);
if (SCX_HAS_OP(disable))
- SCX_CALL_OP(SCX_KF_REST, disable, p);
+ SCX_CALL_OP_TASK(SCX_KF_REST, disable, p);
scx_set_task_state(p, SCX_TASK_READY);
}
@@ -3920,7 +4075,7 @@ static void scx_ops_exit_task(struct task_struct *p)
}
if (SCX_HAS_OP(exit_task))
- SCX_CALL_OP(SCX_KF_REST, exit_task, p, &args);
+ SCX_CALL_OP_TASK(SCX_KF_REST, exit_task, p, &args);
scx_set_task_state(p, SCX_TASK_NONE);
}
@@ -4216,25 +4371,12 @@ err:
return ops_sanitize_err("cgroup_prep_move", ret);
}
-void scx_move_task(struct task_struct *p)
+void scx_cgroup_move_task(struct task_struct *p)
{
if (!scx_cgroup_enabled)
return;
/*
- * We're called from sched_move_task() which handles both cgroup and
- * autogroup moves. Ignore the latter.
- *
- * Also ignore exiting tasks, because in the exit path tasks transition
- * from the autogroup to the root group, so task_group_is_autogroup()
- * alone isn't able to catch exiting autogroup tasks. This is safe for
- * cgroup_move(), because cgroup migrations never happen for PF_EXITING
- * tasks.
- */
- if (task_group_is_autogroup(task_group(p)) || (p->flags & PF_EXITING))
- return;
-
- /*
* @p must have ops.cgroup_prep_move() called on it and thus
* cgrp_moving_from set.
*/
@@ -4483,7 +4625,7 @@ static int scx_cgroup_init(void)
cgroup_warned_missing_idle = false;
/*
- * scx_tg_on/offline() are excluded thorugh scx_cgroup_rwsem. If we walk
+ * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk
* cgroups and init, all online cgroups are initialized.
*/
rcu_read_lock();
@@ -4641,6 +4783,7 @@ bool task_should_scx(int policy)
/**
* scx_softlockup - sched_ext softlockup handler
+ * @dur_s: number of seconds the CPU has been stuck due to the soft lockup
*
* On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can
* live-lock the system by making many CPUs target the same DSQ to the point
@@ -4684,6 +4827,7 @@ static void scx_clear_softlockup(void)
/**
* scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress
+ * @bypass: true for bypass, false for unbypass
*
* Bypassing guarantees that all runnable tasks make forward progress without
* trusting the BPF scheduler. We can't grab any mutexes or rwsems as they might
@@ -4744,10 +4888,9 @@ static void scx_ops_bypass(bool bypass)
*/
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
- struct rq_flags rf;
struct task_struct *p, *n;
- rq_lock(rq, &rf);
+ raw_spin_rq_lock(rq);
if (bypass) {
WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING);
@@ -4763,7 +4906,7 @@ static void scx_ops_bypass(bool bypass)
* sees scx_rq_bypassing() before moving tasks to SCX.
*/
if (!scx_enabled()) {
- rq_unlock_irqrestore(rq, &rf);
+ raw_spin_rq_unlock(rq);
continue;
}
@@ -4783,10 +4926,11 @@ static void scx_ops_bypass(bool bypass)
sched_enq_and_set_task(&ctx);
}
- rq_unlock(rq, &rf);
-
/* resched to restore ticks and idle state */
- resched_cpu(cpu);
+ if (cpu_online(cpu) || cpu == smp_processor_id())
+ resched_curr(rq);
+
+ raw_spin_rq_unlock(rq);
}
atomic_dec(&scx_ops_breather_depth);
@@ -4852,7 +4996,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
struct task_struct *p;
struct rhashtable_iter rht_iter;
struct scx_dispatch_q *dsq;
- int i, kind;
+ int i, kind, cpu;
kind = atomic_read(&scx_exit_kind);
while (true) {
@@ -4935,12 +5079,22 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
scx_task_iter_stop(&sti);
percpu_up_write(&scx_fork_rwsem);
+ /*
+ * Invalidate all the rq clocks to prevent getting outdated
+ * rq clocks from a previous scx scheduler.
+ */
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+ scx_rq_clock_invalidate(rq);
+ }
+
/* no task is on scx, turn off all the switches and flush in-progress calls */
static_branch_disable(&__scx_ops_enabled);
for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
static_branch_disable(&scx_has_op[i]);
static_branch_disable(&scx_ops_enq_last);
static_branch_disable(&scx_ops_enq_exiting);
+ static_branch_disable(&scx_ops_enq_migration_disabled);
static_branch_disable(&scx_ops_cpu_preempt);
static_branch_disable(&scx_builtin_idle_enabled);
synchronize_rcu();
@@ -5159,9 +5313,10 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK,
p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK,
ops_state >> SCX_OPSS_QSEQ_SHIFT);
- dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s dsq_vtime=%llu",
- p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf,
- p->scx.dsq_vtime);
+ dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s",
+ p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf);
+ dump_line(s, " dsq_vtime=%llu slice=%llu weight=%u",
+ p->scx.dsq_vtime, p->scx.slice, p->scx.weight);
dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr));
if (SCX_HAS_OP(dump_task)) {
@@ -5352,7 +5507,7 @@ static struct kthread_worker *scx_create_rt_helper(const char *name)
{
struct kthread_worker *helper;
- helper = kthread_create_worker(0, name);
+ helper = kthread_run_worker(0, name);
if (helper)
sched_set_fifo(helper->task);
return helper;
@@ -5549,6 +5704,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
if (ops->flags & SCX_OPS_ENQ_EXITING)
static_branch_enable(&scx_ops_enq_exiting);
+ if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED)
+ static_branch_enable(&scx_ops_enq_migration_disabled);
if (scx_ops.cpu_acquire || scx_ops.cpu_release)
static_branch_enable(&scx_ops_cpu_preempt);
@@ -6236,6 +6393,15 @@ void __init init_sched_ext_class(void)
__bpf_kfunc_start_defs();
+static bool check_builtin_idle_enabled(void)
+{
+ if (static_branch_likely(&scx_builtin_idle_enabled))
+ return true;
+
+ scx_ops_error("built-in idle tracking is disabled");
+ return false;
+}
+
/**
* scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu()
* @p: task_struct to select a CPU for
@@ -6253,10 +6419,8 @@ __bpf_kfunc_start_defs();
__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
u64 wake_flags, bool *is_idle)
{
- if (!static_branch_likely(&scx_builtin_idle_enabled)) {
- scx_ops_error("built-in idle tracking is disabled");
+ if (!check_builtin_idle_enabled())
goto prev_cpu;
- }
if (!scx_kf_allowed(SCX_KF_SELECT_CPU))
goto prev_cpu;
@@ -6340,9 +6504,7 @@ __bpf_kfunc_start_defs();
* ops.select_cpu(), and ops.dispatch().
*
* When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch
- * and @p must match the task being enqueued. Also, %SCX_DSQ_LOCAL_ON can't be
- * used to target the local DSQ of a CPU other than the enqueueing one. Use
- * ops.select_cpu() to be on the target CPU in the first place.
+ * and @p must match the task being enqueued.
*
* When called from ops.select_cpu(), @enq_flags and @dsp_id are stored, and @p
* will be directly inserted into the corresponding dispatch queue after
@@ -7013,7 +7175,7 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id,
return -ENOENT;
INIT_LIST_HEAD(&kit->cursor.node);
- kit->cursor.flags |= SCX_DSQ_LNODE_ITER_CURSOR | flags;
+ kit->cursor.flags = SCX_DSQ_LNODE_ITER_CURSOR | flags;
kit->cursor.priv = READ_ONCE(kit->dsq->seq);
return 0;
@@ -7181,7 +7343,7 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
}
/**
- * scx_bpf_dump - Generate extra debug dump specific to the BPF scheduler
+ * scx_bpf_dump_bstr - Generate extra debug dump specific to the BPF scheduler
* @fmt: format string
* @data: format string parameters packaged using ___bpf_fill() macro
* @data__sz: @data len, must end in '__sz' for the verifier
@@ -7273,7 +7435,6 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu)
* scx_bpf_cpuperf_set - Set the relative performance target of a CPU
* @cpu: CPU of interest
* @perf: target performance level [0, %SCX_CPUPERF_ONE]
- * @flags: %SCX_CPUPERF_* flags
*
* Set the target performance level of @cpu to @perf. @perf is in linear
* relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the
@@ -7350,10 +7511,8 @@ __bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask)
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
{
- if (!static_branch_likely(&scx_builtin_idle_enabled)) {
- scx_ops_error("built-in idle tracking is disabled");
+ if (!check_builtin_idle_enabled())
return cpu_none_mask;
- }
#ifdef CONFIG_SMP
return idle_masks.cpu;
@@ -7371,10 +7530,8 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
*/
__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
{
- if (!static_branch_likely(&scx_builtin_idle_enabled)) {
- scx_ops_error("built-in idle tracking is disabled");
+ if (!check_builtin_idle_enabled())
return cpu_none_mask;
- }
#ifdef CONFIG_SMP
if (sched_smt_active())
@@ -7389,6 +7546,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
/**
* scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to
* either the percpu, or SMT idle-tracking cpumask.
+ * @idle_mask: &cpumask to use
*/
__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
{
@@ -7412,10 +7570,8 @@ __bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
*/
__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
{
- if (!static_branch_likely(&scx_builtin_idle_enabled)) {
- scx_ops_error("built-in idle tracking is disabled");
+ if (!check_builtin_idle_enabled())
return false;
- }
if (ops_cpu_valid(cpu, NULL))
return test_and_clear_cpu_idle(cpu);
@@ -7445,10 +7601,8 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
u64 flags)
{
- if (!static_branch_likely(&scx_builtin_idle_enabled)) {
- scx_ops_error("built-in idle tracking is disabled");
+ if (!check_builtin_idle_enabled())
return -EBUSY;
- }
return scx_pick_idle_cpu(cpus_allowed, flags);
}
@@ -7543,6 +7697,68 @@ out:
}
#endif
+/**
+ * scx_bpf_now - Returns a high-performance monotonically non-decreasing
+ * clock for the current CPU. The clock returned is in nanoseconds.
+ *
+ * It provides the following properties:
+ *
+ * 1) High performance: Many BPF schedulers call bpf_ktime_get_ns() frequently
+ * to account for execution time and track tasks' runtime properties.
+ * Unfortunately, in some hardware platforms, bpf_ktime_get_ns() -- which
+ * eventually reads a hardware timestamp counter -- is neither performant nor
+ * scalable. scx_bpf_now() aims to provide a high-performance clock by
+ * using the rq clock in the scheduler core whenever possible.
+ *
+ * 2) High enough resolution for the BPF scheduler use cases: In most BPF
+ * scheduler use cases, the required clock resolution is lower than the most
+ * accurate hardware clock (e.g., rdtsc in x86). scx_bpf_now() basically
+ * uses the rq clock in the scheduler core whenever it is valid. It considers
+ * that the rq clock is valid from the time the rq clock is updated
+ * (update_rq_clock) until the rq is unlocked (rq_unpin_lock).
+ *
+ * 3) Monotonically non-decreasing clock for the same CPU: scx_bpf_now()
+ * guarantees the clock never goes backward when comparing them in the same
+ * CPU. On the other hand, when comparing clocks in different CPUs, there
+ * is no such guarantee -- the clock can go backward. It provides a
+ * monotonically *non-decreasing* clock, so two scx_bpf_now() calls made on
+ * the same CPU during the same period of rq-clock validity return the same
+ * value.
+ */
+__bpf_kfunc u64 scx_bpf_now(void)
+{
+ struct rq *rq;
+ u64 clock;
+
+ preempt_disable();
+
+ rq = this_rq();
+ if (smp_load_acquire(&rq->scx.flags) & SCX_RQ_CLK_VALID) {
+ /*
+ * If the rq clock is valid, use the cached rq clock.
+ *
+ * Note that scx_bpf_now() is re-entrant between a process
+ * context and an interrupt context (e.g., timer interrupt).
+ * However, we don't need to consider the race between them
+ * because such race is not observable from a caller.
+ */
+ clock = READ_ONCE(rq->scx.clock);
+ } else {
+ /*
+ * Otherwise, return a fresh rq clock.
+ *
+ * The rq clock is updated outside of the rq lock.
+ * In this case, keep the updated rq clock invalid so the next
+ * kfunc call outside the rq lock gets a fresh rq clock.
+ */
+ clock = sched_clock_cpu(cpu_of(rq));
+ }
+
+ preempt_enable();
+
+ return clock;
+}
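A minimal BPF-side sketch of using the new kfunc for runtime accounting in place of bpf_ktime_get_ns() (callback names and the per-CPU map are illustrative; a real scheduler would typically also scale the delta by task weight):

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(max_entries, 1);
	__type(key, u32);
	__type(value, u64);
} running_at SEC(".maps");

void BPF_STRUCT_OPS(example_running, struct task_struct *p)
{
	u32 idx = 0;
	u64 *ts = bpf_map_lookup_elem(&running_at, &idx);

	if (ts)
		*ts = scx_bpf_now();
}

void BPF_STRUCT_OPS(example_stopping, struct task_struct *p, bool runnable)
{
	u32 idx = 0;
	u64 *ts = bpf_map_lookup_elem(&running_at, &idx);

	if (ts)
		/* same-CPU delta: scx_bpf_now() never goes backward on a CPU */
		p->scx.dsq_vtime += scx_bpf_now() - *ts;
}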
+
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(scx_kfunc_ids_any)
@@ -7574,6 +7790,7 @@ BTF_ID_FLAGS(func, scx_bpf_cpu_rq)
#ifdef CONFIG_CGROUP_SCHED
BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE)
#endif
+BTF_ID_FLAGS(func, scx_bpf_now)
BTF_KFUNCS_END(scx_kfunc_ids_any)
static const struct btf_kfunc_id_set scx_kfunc_set_any = {
diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
index b1675bb59fc46..1079b56b0f7ae 100644
--- a/kernel/sched/ext.h
+++ b/kernel/sched/ext.h
@@ -57,15 +57,15 @@ static inline void init_sched_ext_class(void) {}
#endif /* CONFIG_SCHED_CLASS_EXT */
#if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP)
-void __scx_update_idle(struct rq *rq, bool idle);
+void __scx_update_idle(struct rq *rq, bool idle, bool do_notify);
-static inline void scx_update_idle(struct rq *rq, bool idle)
+static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify)
{
if (scx_enabled())
- __scx_update_idle(rq, idle);
+ __scx_update_idle(rq, idle, do_notify);
}
#else
-static inline void scx_update_idle(struct rq *rq, bool idle) {}
+static inline void scx_update_idle(struct rq *rq, bool idle, bool do_notify) {}
#endif
#ifdef CONFIG_CGROUP_SCHED
@@ -73,7 +73,7 @@ static inline void scx_update_idle(struct rq *rq, bool idle) {}
int scx_tg_online(struct task_group *tg);
void scx_tg_offline(struct task_group *tg);
int scx_cgroup_can_attach(struct cgroup_taskset *tset);
-void scx_move_task(struct task_struct *p);
+void scx_cgroup_move_task(struct task_struct *p);
void scx_cgroup_finish_attach(void);
void scx_cgroup_cancel_attach(struct cgroup_taskset *tset);
void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight);
@@ -82,7 +82,7 @@ void scx_group_set_idle(struct task_group *tg, bool idle);
static inline int scx_tg_online(struct task_group *tg) { return 0; }
static inline void scx_tg_offline(struct task_group *tg) {}
static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; }
-static inline void scx_move_task(struct task_struct *p) {}
+static inline void scx_cgroup_move_task(struct task_struct *p) {}
static inline void scx_cgroup_finish_attach(void) {}
static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {}
static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3e9ca38512dee..1c0ef435a7aae 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -37,6 +37,7 @@
#include <linux/sched/cputime.h>
#include <linux/sched/isolation.h>
#include <linux/sched/nohz.h>
+#include <linux/sched/prio.h>
#include <linux/cpuidle.h>
#include <linux/interrupt.h>
@@ -51,6 +52,8 @@
#include <asm/switch_to.h>
+#include <uapi/linux/sched/types.h>
+
#include "sched.h"
#include "stats.h"
#include "autogroup.h"
@@ -130,7 +133,7 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536;
#endif
#ifdef CONFIG_SYSCTL
-static struct ctl_table sched_fair_sysctls[] = {
+static const struct ctl_table sched_fair_sysctls[] = {
#ifdef CONFIG_CFS_BANDWIDTH
{
.procname = "sched_cfs_bandwidth_slice_us",
@@ -523,7 +526,7 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
* Scheduling class tree data structure manipulation methods:
*/
-static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
+static inline __maybe_unused u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
s64 delta = (s64)(vruntime - max_vruntime);
if (delta > 0)
@@ -532,7 +535,7 @@ static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
return max_vruntime;
}
-static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
+static inline __maybe_unused u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
s64 delta = (s64)(vruntime - min_vruntime);
if (delta < 0)
@@ -689,21 +692,16 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq)
*
* XXX could add max_slice to the augmented data to track this.
*/
-static s64 entity_lag(u64 avruntime, struct sched_entity *se)
+static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
s64 vlag, limit;
- vlag = avruntime - se->vruntime;
- limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
-
- return clamp(vlag, -limit, limit);
-}
-
-static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se)
-{
SCHED_WARN_ON(!se->on_rq);
- se->vlag = entity_lag(avg_vruntime(cfs_rq), se);
+ vlag = avg_vruntime(cfs_rq) - se->vruntime;
+ limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se);
+
+ se->vlag = clamp(vlag, -limit, limit);
}
/*
@@ -915,7 +913,7 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq)
* We can safely skip eligibility check if there is only one entity
* in this cfs_rq, saving some cycles.
*/
- if (cfs_rq->nr_running == 1)
+ if (cfs_rq->nr_queued == 1)
return curr && curr->on_rq ? curr : se;
if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr)))
@@ -1250,7 +1248,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
account_cfs_rq_runtime(cfs_rq, delta_exec);
- if (cfs_rq->nr_running == 1)
+ if (cfs_rq->nr_queued == 1)
return;
if (resched || did_preempt_short(cfs_rq, curr)) {
@@ -2131,7 +2129,7 @@ static void update_numa_stats(struct task_numa_env *env,
ns->load += cpu_load(rq);
ns->runnable += cpu_runnable(rq);
ns->util += cpu_util_cfs(cpu);
- ns->nr_running += rq->cfs.h_nr_running;
+ ns->nr_running += rq->cfs.h_nr_runnable;
ns->compute_capacity += capacity_of(cpu);
if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) {
@@ -3682,9 +3680,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
list_add(&se->group_node, &rq->cfs_tasks);
}
#endif
- cfs_rq->nr_running++;
- if (se_is_idle(se))
- cfs_rq->idle_nr_running++;
+ cfs_rq->nr_queued++;
}
static void
@@ -3697,9 +3693,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
list_del_init(&se->group_node);
}
#endif
- cfs_rq->nr_running--;
- if (se_is_idle(se))
- cfs_rq->idle_nr_running--;
+ cfs_rq->nr_queued--;
}
/*
@@ -3774,137 +3768,32 @@ static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
#endif
-static void reweight_eevdf(struct sched_entity *se, u64 avruntime,
- unsigned long weight)
-{
- unsigned long old_weight = se->load.weight;
- s64 vlag, vslice;
-
- /*
- * VRUNTIME
- * --------
- *
- * COROLLARY #1: The virtual runtime of the entity needs to be
- * adjusted if re-weight at !0-lag point.
- *
- * Proof: For contradiction assume this is not true, so we can
- * re-weight without changing vruntime at !0-lag point.
- *
- * Weight VRuntime Avg-VRuntime
- * before w v V
- * after w' v' V'
- *
- * Since lag needs to be preserved through re-weight:
- *
- * lag = (V - v)*w = (V'- v')*w', where v = v'
- * ==> V' = (V - v)*w/w' + v (1)
- *
- * Let W be the total weight of the entities before reweight,
- * since V' is the new weighted average of entities:
- *
- * V' = (WV + w'v - wv) / (W + w' - w) (2)
- *
- * by using (1) & (2) we obtain:
- *
- * (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
- * ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
- * ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
- * ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
- *
- * Since we are doing at !0-lag point which means V != v, we
- * can simplify (3):
- *
- * ==> W / (W + w' - w) = w / w'
- * ==> Ww' = Ww + ww' - ww
- * ==> W * (w' - w) = w * (w' - w)
- * ==> W = w (re-weight indicates w' != w)
- *
- * So the cfs_rq contains only one entity, hence vruntime of
- * the entity @v should always equal to the cfs_rq's weighted
- * average vruntime @V, which means we will always re-weight
- * at 0-lag point, thus breach assumption. Proof completed.
- *
- *
- * COROLLARY #2: Re-weight does NOT affect weighted average
- * vruntime of all the entities.
- *
- * Proof: According to corollary #1, Eq. (1) should be:
- *
- * (V - v)*w = (V' - v')*w'
- * ==> v' = V' - (V - v)*w/w' (4)
- *
- * According to the weighted average formula, we have:
- *
- * V' = (WV - wv + w'v') / (W - w + w')
- * = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
- * = (WV - wv + w'V' - Vw + wv) / (W - w + w')
- * = (WV + w'V' - Vw) / (W - w + w')
- *
- * ==> V'*(W - w + w') = WV + w'V' - Vw
- * ==> V' * (W - w) = (W - w) * V (5)
- *
- * If the entity is the only one in the cfs_rq, then reweight
- * always occurs at 0-lag point, so V won't change. Or else
- * there are other entities, hence W != w, then Eq. (5) turns
- * into V' = V. So V won't change in either case, proof done.
- *
- *
- * So according to corollary #1 & #2, the effect of re-weight
- * on vruntime should be:
- *
- * v' = V' - (V - v) * w / w' (4)
- * = V - (V - v) * w / w'
- * = V - vl * w / w'
- * = V - vl'
- */
- if (avruntime != se->vruntime) {
- vlag = entity_lag(avruntime, se);
- vlag = div_s64(vlag * old_weight, weight);
- se->vruntime = avruntime - vlag;
- }
-
- /*
- * DEADLINE
- * --------
- *
- * When the weight changes, the virtual time slope changes and
- * we should adjust the relative virtual deadline accordingly.
- *
- * d' = v' + (d - v)*w/w'
- * = V' - (V - v)*w/w' + (d - v)*w/w'
- * = V - (V - v)*w/w' + (d - v)*w/w'
- * = V + (d - V)*w/w'
- */
- vslice = (s64)(se->deadline - avruntime);
- vslice = div_s64(vslice * old_weight, weight);
- se->deadline = avruntime + vslice;
-}
+static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags);
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
bool curr = cfs_rq->curr == se;
- u64 avruntime;
if (se->on_rq) {
/* commit outstanding execution time */
update_curr(cfs_rq);
- avruntime = avg_vruntime(cfs_rq);
+ update_entity_lag(cfs_rq, se);
+ se->deadline -= se->vruntime;
+ se->rel_deadline = 1;
if (!curr)
__dequeue_entity(cfs_rq, se);
update_load_sub(&cfs_rq->load, se->load.weight);
}
dequeue_load_avg(cfs_rq, se);
- if (se->on_rq) {
- reweight_eevdf(se, avruntime, weight);
- } else {
- /*
- * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
- * we need to scale se->vlag when w_i changes.
- */
- se->vlag = div_s64(se->vlag * se->load.weight, weight);
- }
+ /*
+ * Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
+ * we need to scale se->vlag when w_i changes.
+ */
+ se->vlag = div_s64(se->vlag * se->load.weight, weight);
+ if (se->rel_deadline)
+ se->deadline = div_s64(se->deadline * se->load.weight, weight);
update_load_set(&se->load, weight);
@@ -3919,6 +3808,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
enqueue_load_avg(cfs_rq, se);
if (se->on_rq) {
update_load_add(&cfs_rq->load, se->load.weight);
+ place_entity(cfs_rq, se, 0);
if (!curr)
__enqueue_entity(cfs_rq, se);
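With reweight_eevdf() removed, the reweight path above instead stores the
deadline relative to vruntime (se->deadline -= se->vruntime, rel_deadline = 1),
scales both se->vlag and that relative deadline by old_weight/new_weight, and
relies on place_entity() (changed further below) to rebuild the absolute values
on enqueue. A rough sketch of that sequence, using a stand-in struct and
ignoring lag clamping and the weighted-average details:

/* Sketch only; "ent" stands in for the relevant sched_entity fields. */
struct ent {
	long long vruntime, deadline, vlag;
	unsigned long weight;
	int rel_deadline;
};

static void sketch_reweight(struct ent *e, long long V, unsigned long new_weight)
{
	/* dequeue side: remember lag, make the deadline relative */
	e->vlag = V - e->vruntime;		/* update_entity_lag() analogue */
	e->deadline -= e->vruntime;
	e->rel_deadline = 1;

	/* both quantities scale by old_weight / new_weight */
	e->vlag = e->vlag * (long long)e->weight / (long long)new_weight;
	e->deadline = e->deadline * (long long)e->weight / (long long)new_weight;
	e->weight = new_weight;

	/* enqueue side: place_entity() analogue rebuilds absolute values */
	e->vruntime = V - e->vlag;
	e->deadline += e->vruntime;
	e->rel_deadline = 0;
}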
@@ -4065,7 +3955,11 @@ static void update_cfs_group(struct sched_entity *se)
struct cfs_rq *gcfs_rq = group_cfs_rq(se);
long shares;
- if (!gcfs_rq)
+ /*
+ * When a group becomes empty, preserve its weight. This matters for
+ * DELAY_DEQUEUE.
+ */
+ if (!gcfs_rq || !gcfs_rq->load.weight)
return;
if (throttled_hierarchy(gcfs_rq))
@@ -5233,7 +5127,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
{
- return !cfs_rq->nr_running;
+ return !cfs_rq->nr_queued;
}
#define UPDATE_TG 0x0
@@ -5271,6 +5165,22 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
#endif /* CONFIG_SMP */
+void __setparam_fair(struct task_struct *p, const struct sched_attr *attr)
+{
+ struct sched_entity *se = &p->se;
+
+ p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+ if (attr->sched_runtime) {
+ se->custom_slice = 1;
+ se->slice = clamp_t(u64, attr->sched_runtime,
+ NSEC_PER_MSEC/10, /* HZ=1000 * 10 */
+ NSEC_PER_MSEC*100); /* HZ=100 / 10 */
+ } else {
+ se->custom_slice = 0;
+ se->slice = sysctl_sched_base_slice;
+ }
+}
+
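__setparam_fair() above is what gives a SCHED_NORMAL task a custom base slice
when sched_attr::sched_runtime is non-zero, clamped to [0.1ms, 100ms]. An
assumed userspace usage sketch (struct sched_attr is declared by hand since
glibc ships no wrapper; the syscall itself is sched_setattr()):

/* Assumed usage sketch: request a ~3ms slice for a SCHED_OTHER task. */
#define _GNU_SOURCE
#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;		/* for fair tasks: requested slice, in ns */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy = SCHED_OTHER;
	attr.sched_runtime = 3 * 1000 * 1000;	/* 3ms; kernel clamps to [0.1ms, 100ms] */

	if (syscall(SYS_sched_setattr, 0, &attr, 0))
		perror("sched_setattr");
	return 0;
}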
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
@@ -5289,7 +5199,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*
* EEVDF: placement strategy #1 / #2
*/
- if (sched_feat(PLACE_LAG) && cfs_rq->nr_running && se->vlag) {
+ if (sched_feat(PLACE_LAG) && cfs_rq->nr_queued && se->vlag) {
struct sched_entity *curr = cfs_rq->curr;
unsigned long load;
@@ -5359,7 +5269,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
se->vruntime = vruntime - lag;
- if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) {
+ if (se->rel_deadline) {
se->deadline += se->vruntime;
se->rel_deadline = 0;
return;
@@ -5382,8 +5292,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq);
-static inline bool cfs_bandwidth_used(void);
-
static void
requeue_delayed_entity(struct sched_entity *se);
@@ -5405,7 +5313,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* When enqueuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
* - For group_entity, update its runnable_weight to reflect the new
- * h_nr_running of its group cfs_rq.
+ * h_nr_runnable of its group cfs_rq.
* - For group_entity, update its weight to reflect the new share of
* its group cfs_rq
* - Add its new weight to cfs_rq->load.weight
@@ -5438,7 +5346,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
- if (cfs_rq->nr_running == 1) {
+ if (cfs_rq->nr_queued == 1) {
check_enqueue_throttle(cfs_rq);
if (!throttled_hierarchy(cfs_rq)) {
list_add_leaf_cfs_rq(cfs_rq);
@@ -5477,10 +5385,19 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
static void set_delayed(struct sched_entity *se)
{
se->sched_delayed = 1;
+
+ /*
+ * Delayed se of cfs_rq have no tasks queued on them.
+ * Do not adjust h_nr_runnable since dequeue_entities()
+ * will account it for blocked tasks.
+ */
+ if (!entity_is_task(se))
+ return;
+
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- cfs_rq->h_nr_delayed++;
+ cfs_rq->h_nr_runnable--;
if (cfs_rq_throttled(cfs_rq))
break;
}
@@ -5489,10 +5406,20 @@ static void set_delayed(struct sched_entity *se)
static void clear_delayed(struct sched_entity *se)
{
se->sched_delayed = 0;
+
+ /*
+ * Delayed se of cfs_rq have no tasks queued on them.
+ * Do not adjust h_nr_runnable since a dequeue has
+ * already accounted for it or an enqueue of a task
+ * below it will account for it in enqueue_task_fair().
+ */
+ if (!entity_is_task(se))
+ return;
+
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- cfs_rq->h_nr_delayed--;
+ cfs_rq->h_nr_runnable++;
if (cfs_rq_throttled(cfs_rq))
break;
}
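The helpers above keep the hierarchical counters consistent with the new
naming: a delay-dequeued task stays in h_nr_queued but drops out of
h_nr_runnable, and group entities are skipped because the surrounding
enqueue/dequeue paths already account for them. The intended relationship,
written as a sketch rather than an actual kernel assertion:

/* Sketch of the counter relationship the rename is meant to express. */
static inline int counters_consistent(unsigned int h_nr_queued,
				      unsigned int h_nr_runnable,
				      unsigned int nr_delayed_tasks)
{
	/* every delayed task remains queued but no longer counts as runnable */
	return h_nr_runnable + nr_delayed_tasks == h_nr_queued;
}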
@@ -5509,6 +5436,7 @@ static bool
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
bool sleep = flags & DEQUEUE_SLEEP;
+ int action = UPDATE_TG;
update_curr(cfs_rq);
clear_buddies(cfs_rq, se);
@@ -5534,7 +5462,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
}
}
- int action = UPDATE_TG;
if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
action |= DO_DETACH;
@@ -5542,7 +5469,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
* When dequeuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
* - For group_entity, update its runnable_weight to reflect the new
- * h_nr_running of its group cfs_rq.
+ * h_nr_runnable of its group cfs_rq.
* - Subtract its previous weight from cfs_rq->load.weight.
* - For group entity, update its weight to reflect the new share
* of its group cfs_rq.
@@ -5580,7 +5507,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (flags & DEQUEUE_DELAYED)
finish_delayed_dequeue_entity(se);
- if (cfs_rq->nr_running == 0)
+ if (cfs_rq->nr_queued == 0)
update_idle_cfs_rq_clock_pelt(cfs_rq);
return true;
@@ -5642,17 +5569,19 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags);
static struct sched_entity *
pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
{
+ struct sched_entity *se;
+
/*
- * Enabling NEXT_BUDDY will affect latency but not fairness.
+ * Picking the ->next buddy will affect latency but not fairness.
*/
- if (sched_feat(NEXT_BUDDY) &&
+ if (sched_feat(PICK_BUDDY) &&
cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) {
/* ->next will never be delayed */
SCHED_WARN_ON(cfs_rq->next->sched_delayed);
return cfs_rq->next;
}
- struct sched_entity *se = pick_eevdf(cfs_rq);
+ se = pick_eevdf(cfs_rq);
if (se->sched_delayed) {
dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
/*
@@ -5928,7 +5857,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
list_del_leaf_cfs_rq(cfs_rq);
SCHED_WARN_ON(cfs_rq->throttled_clock_self);
- if (cfs_rq->nr_running)
+ if (cfs_rq->nr_queued)
cfs_rq->throttled_clock_self = rq_clock(rq);
}
cfs_rq->throttle_count++;
@@ -5941,8 +5870,8 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- long task_delta, idle_task_delta, delayed_delta, dequeue = 1;
- long rq_h_nr_running = rq->cfs.h_nr_running;
+ long queued_delta, runnable_delta, idle_delta, dequeue = 1;
+ long rq_h_nr_queued = rq->cfs.h_nr_queued;
raw_spin_lock(&cfs_b->lock);
/* This will start the period timer if necessary */
@@ -5972,9 +5901,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
rcu_read_unlock();
- task_delta = cfs_rq->h_nr_running;
- idle_task_delta = cfs_rq->idle_h_nr_running;
- delayed_delta = cfs_rq->h_nr_delayed;
+ queued_delta = cfs_rq->h_nr_queued;
+ runnable_delta = cfs_rq->h_nr_runnable;
+ idle_delta = cfs_rq->h_nr_idle;
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
int flags;
@@ -5994,11 +5923,11 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
dequeue_entity(qcfs_rq, se, flags);
if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_task_delta = cfs_rq->h_nr_running;
+ idle_delta = cfs_rq->h_nr_queued;
- qcfs_rq->h_nr_running -= task_delta;
- qcfs_rq->idle_h_nr_running -= idle_task_delta;
- qcfs_rq->h_nr_delayed -= delayed_delta;
+ qcfs_rq->h_nr_queued -= queued_delta;
+ qcfs_rq->h_nr_runnable -= runnable_delta;
+ qcfs_rq->h_nr_idle -= idle_delta;
if (qcfs_rq->load.weight) {
/* Avoid re-evaluating load for this entity: */
@@ -6017,18 +5946,18 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
se_update_runnable(se);
if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_task_delta = cfs_rq->h_nr_running;
+ idle_delta = cfs_rq->h_nr_queued;
- qcfs_rq->h_nr_running -= task_delta;
- qcfs_rq->idle_h_nr_running -= idle_task_delta;
- qcfs_rq->h_nr_delayed -= delayed_delta;
+ qcfs_rq->h_nr_queued -= queued_delta;
+ qcfs_rq->h_nr_runnable -= runnable_delta;
+ qcfs_rq->h_nr_idle -= idle_delta;
}
/* At this point se is NULL and we are at root level*/
- sub_nr_running(rq, task_delta);
+ sub_nr_running(rq, queued_delta);
/* Stop the fair server if throttling resulted in no runnable tasks */
- if (rq_h_nr_running && !rq->cfs.h_nr_running)
+ if (rq_h_nr_queued && !rq->cfs.h_nr_queued)
dl_server_stop(&rq->fair_server);
done:
/*
@@ -6037,7 +5966,7 @@ done:
*/
cfs_rq->throttled = 1;
SCHED_WARN_ON(cfs_rq->throttled_clock);
- if (cfs_rq->nr_running)
+ if (cfs_rq->nr_queued)
cfs_rq->throttled_clock = rq_clock(rq);
return true;
}
@@ -6047,8 +5976,8 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
- long task_delta, idle_task_delta, delayed_delta;
- long rq_h_nr_running = rq->cfs.h_nr_running;
+ long queued_delta, runnable_delta, idle_delta;
+ long rq_h_nr_queued = rq->cfs.h_nr_queued;
se = cfs_rq->tg->se[cpu_of(rq)];
@@ -6081,9 +6010,9 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
goto unthrottle_throttle;
}
- task_delta = cfs_rq->h_nr_running;
- idle_task_delta = cfs_rq->idle_h_nr_running;
- delayed_delta = cfs_rq->h_nr_delayed;
+ queued_delta = cfs_rq->h_nr_queued;
+ runnable_delta = cfs_rq->h_nr_runnable;
+ idle_delta = cfs_rq->h_nr_idle;
for_each_sched_entity(se) {
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
@@ -6097,11 +6026,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_task_delta = cfs_rq->h_nr_running;
+ idle_delta = cfs_rq->h_nr_queued;
- qcfs_rq->h_nr_running += task_delta;
- qcfs_rq->idle_h_nr_running += idle_task_delta;
- qcfs_rq->h_nr_delayed += delayed_delta;
+ qcfs_rq->h_nr_queued += queued_delta;
+ qcfs_rq->h_nr_runnable += runnable_delta;
+ qcfs_rq->h_nr_idle += idle_delta;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(qcfs_rq))
@@ -6115,11 +6044,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
se_update_runnable(se);
if (cfs_rq_is_idle(group_cfs_rq(se)))
- idle_task_delta = cfs_rq->h_nr_running;
+ idle_delta = cfs_rq->h_nr_queued;
- qcfs_rq->h_nr_running += task_delta;
- qcfs_rq->idle_h_nr_running += idle_task_delta;
- qcfs_rq->h_nr_delayed += delayed_delta;
+ qcfs_rq->h_nr_queued += queued_delta;
+ qcfs_rq->h_nr_runnable += runnable_delta;
+ qcfs_rq->h_nr_idle += idle_delta;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(qcfs_rq))
@@ -6127,17 +6056,17 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
}
/* Start the fair server if un-throttling resulted in new runnable tasks */
- if (!rq_h_nr_running && rq->cfs.h_nr_running)
+ if (!rq_h_nr_queued && rq->cfs.h_nr_queued)
dl_server_start(&rq->fair_server);
/* At this point se is NULL and we are at root level*/
- add_nr_running(rq, task_delta);
+ add_nr_running(rq, queued_delta);
unthrottle_throttle:
assert_list_leaf_cfs_rq(rq);
/* Determine whether we need to wake up potentially idle CPU: */
- if (rq->curr == rq->idle && rq->cfs.nr_running)
+ if (rq->curr == rq->idle && rq->cfs.nr_queued)
resched_curr(rq);
}
@@ -6438,7 +6367,7 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
if (!cfs_bandwidth_used())
return;
- if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
+ if (!cfs_rq->runtime_enabled || cfs_rq->nr_queued)
return;
__return_cfs_rq_runtime(cfs_rq);
@@ -6709,6 +6638,10 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
lockdep_assert_rq_held(rq);
+ /* Do not unthrottle for an active CPU */
+ if (cpumask_test_cpu(cpu_of(rq), cpu_active_mask))
+ return;
+
/*
* The rq clock has already been updated in the
* set_rq_offline(), so we should skip updating
@@ -6724,18 +6657,20 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
continue;
/*
- * clock_task is not advancing so we just need to make sure
- * there's some valid quota amount
- */
- cfs_rq->runtime_remaining = 1;
- /*
* Offline rq is schedulable till CPU is completely disabled
* in take_cpu_down(), so we prevent new cfs throttling here.
*/
cfs_rq->runtime_enabled = 0;
- if (cfs_rq_throttled(cfs_rq))
- unthrottle_cfs_rq(cfs_rq);
+ if (!cfs_rq_throttled(cfs_rq))
+ continue;
+
+ /*
+ * clock_task is not advancing so we just need to make sure
+ * there's some valid quota amount
+ */
+ cfs_rq->runtime_remaining = 1;
+ unthrottle_cfs_rq(cfs_rq);
}
rcu_read_unlock();
@@ -6784,11 +6719,6 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p)
#else /* CONFIG_CFS_BANDWIDTH */
-static inline bool cfs_bandwidth_used(void)
-{
- return false;
-}
-
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
@@ -6846,7 +6776,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
SCHED_WARN_ON(task_rq(p) != rq);
- if (rq->cfs.h_nr_running > 1) {
+ if (rq->cfs.h_nr_queued > 1) {
u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
u64 slice = se->slice;
s64 delta = slice - ran;
@@ -6934,7 +6864,7 @@ static inline void check_update_overutilized_status(struct rq *rq) { }
/* Runqueue only has SCHED_IDLE tasks enqueued */
static int sched_idle_rq(struct rq *rq)
{
- return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
+ return unlikely(rq->nr_running == rq->cfs.h_nr_idle &&
rq->nr_running);
}
@@ -6961,14 +6891,14 @@ requeue_delayed_entity(struct sched_entity *se)
if (sched_feat(DELAY_ZERO)) {
update_entity_lag(cfs_rq, se);
if (se->vlag > 0) {
- cfs_rq->nr_running--;
+ cfs_rq->nr_queued--;
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->vlag = 0;
place_entity(cfs_rq, se, 0);
if (se != cfs_rq->curr)
__enqueue_entity(cfs_rq, se);
- cfs_rq->nr_running++;
+ cfs_rq->nr_queued++;
}
}
@@ -6986,10 +6916,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
- int idle_h_nr_running = task_has_idle_policy(p);
- int h_nr_delayed = 0;
+ int h_nr_idle = task_has_idle_policy(p);
+ int h_nr_runnable = 1;
int task_new = !(flags & ENQUEUE_WAKEUP);
- int rq_h_nr_running = rq->cfs.h_nr_running;
+ int rq_h_nr_queued = rq->cfs.h_nr_queued;
u64 slice = 0;
/*
@@ -7014,8 +6944,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
if (p->in_iowait)
cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
- if (task_new)
- h_nr_delayed = !!se->sched_delayed;
+ if (task_new && se->sched_delayed)
+ h_nr_runnable = 0;
for_each_sched_entity(se) {
if (se->on_rq) {
@@ -7037,12 +6967,12 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
enqueue_entity(cfs_rq, se, flags);
slice = cfs_rq_min_slice(cfs_rq);
- cfs_rq->h_nr_running++;
- cfs_rq->idle_h_nr_running += idle_h_nr_running;
- cfs_rq->h_nr_delayed += h_nr_delayed;
+ cfs_rq->h_nr_runnable += h_nr_runnable;
+ cfs_rq->h_nr_queued++;
+ cfs_rq->h_nr_idle += h_nr_idle;
if (cfs_rq_is_idle(cfs_rq))
- idle_h_nr_running = 1;
+ h_nr_idle = 1;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
@@ -7061,19 +6991,19 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
se->slice = slice;
slice = cfs_rq_min_slice(cfs_rq);
- cfs_rq->h_nr_running++;
- cfs_rq->idle_h_nr_running += idle_h_nr_running;
- cfs_rq->h_nr_delayed += h_nr_delayed;
+ cfs_rq->h_nr_runnable += h_nr_runnable;
+ cfs_rq->h_nr_queued++;
+ cfs_rq->h_nr_idle += h_nr_idle;
if (cfs_rq_is_idle(cfs_rq))
- idle_h_nr_running = 1;
+ h_nr_idle = 1;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto enqueue_throttle;
}
- if (!rq_h_nr_running && rq->cfs.h_nr_running) {
+ if (!rq_h_nr_queued && rq->cfs.h_nr_queued) {
/* Account for idle runtime */
if (!rq->nr_running)
dl_server_update_idle_time(rq, rq->curr);
@@ -7120,22 +7050,22 @@ static void set_next_buddy(struct sched_entity *se);
static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
{
bool was_sched_idle = sched_idle_rq(rq);
- int rq_h_nr_running = rq->cfs.h_nr_running;
+ int rq_h_nr_queued = rq->cfs.h_nr_queued;
bool task_sleep = flags & DEQUEUE_SLEEP;
bool task_delayed = flags & DEQUEUE_DELAYED;
struct task_struct *p = NULL;
- int idle_h_nr_running = 0;
- int h_nr_running = 0;
- int h_nr_delayed = 0;
+ int h_nr_idle = 0;
+ int h_nr_queued = 0;
+ int h_nr_runnable = 0;
struct cfs_rq *cfs_rq;
u64 slice = 0;
if (entity_is_task(se)) {
p = task_of(se);
- h_nr_running = 1;
- idle_h_nr_running = task_has_idle_policy(p);
- if (!task_sleep && !task_delayed)
- h_nr_delayed = !!se->sched_delayed;
+ h_nr_queued = 1;
+ h_nr_idle = task_has_idle_policy(p);
+ if (task_sleep || task_delayed || !se->sched_delayed)
+ h_nr_runnable = 1;
} else {
cfs_rq = group_cfs_rq(se);
slice = cfs_rq_min_slice(cfs_rq);
@@ -7151,12 +7081,12 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
break;
}
- cfs_rq->h_nr_running -= h_nr_running;
- cfs_rq->idle_h_nr_running -= idle_h_nr_running;
- cfs_rq->h_nr_delayed -= h_nr_delayed;
+ cfs_rq->h_nr_runnable -= h_nr_runnable;
+ cfs_rq->h_nr_queued -= h_nr_queued;
+ cfs_rq->h_nr_idle -= h_nr_idle;
if (cfs_rq_is_idle(cfs_rq))
- idle_h_nr_running = h_nr_running;
+ h_nr_idle = h_nr_queued;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
@@ -7190,21 +7120,21 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
se->slice = slice;
slice = cfs_rq_min_slice(cfs_rq);
- cfs_rq->h_nr_running -= h_nr_running;
- cfs_rq->idle_h_nr_running -= idle_h_nr_running;
- cfs_rq->h_nr_delayed -= h_nr_delayed;
+ cfs_rq->h_nr_runnable -= h_nr_runnable;
+ cfs_rq->h_nr_queued -= h_nr_queued;
+ cfs_rq->h_nr_idle -= h_nr_idle;
if (cfs_rq_is_idle(cfs_rq))
- idle_h_nr_running = h_nr_running;
+ h_nr_idle = h_nr_queued;
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
return 0;
}
- sub_nr_running(rq, h_nr_running);
+ sub_nr_running(rq, h_nr_queued);
- if (rq_h_nr_running && !rq->cfs.h_nr_running)
+ if (rq_h_nr_queued && !rq->cfs.h_nr_queued)
dl_server_stop(&rq->fair_server);
/* balance early to pull high priority tasks */
@@ -8893,7 +8823,7 @@ static struct task_struct *pick_task_fair(struct rq *rq)
again:
cfs_rq = &rq->cfs;
- if (!cfs_rq->nr_running)
+ if (!cfs_rq->nr_queued)
return NULL;
do {
@@ -9010,7 +8940,7 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_stru
static bool fair_server_has_tasks(struct sched_dl_entity *dl_se)
{
- return !!dl_se->rq->cfs.nr_running;
+ return !!dl_se->rq->cfs.nr_queued;
}
static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se)
@@ -9341,43 +9271,43 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
#ifdef CONFIG_NUMA_BALANCING
/*
- * Returns 1, if task migration degrades locality
- * Returns 0, if task migration improves locality i.e migration preferred.
- * Returns -1, if task migration is not affected by locality.
+ * Returns a positive value, if task migration degrades locality.
+ * Returns 0, if task migration is not affected by locality.
+ * Returns a negative value, if task migration improves locality, i.e. migration is preferred.
*/
-static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
+static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
{
struct numa_group *numa_group = rcu_dereference(p->numa_group);
unsigned long src_weight, dst_weight;
int src_nid, dst_nid, dist;
if (!static_branch_likely(&sched_numa_balancing))
- return -1;
+ return 0;
if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
- return -1;
+ return 0;
src_nid = cpu_to_node(env->src_cpu);
dst_nid = cpu_to_node(env->dst_cpu);
if (src_nid == dst_nid)
- return -1;
+ return 0;
/* Migrating away from the preferred node is always bad. */
if (src_nid == p->numa_preferred_nid) {
if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
return 1;
else
- return -1;
+ return 0;
}
/* Encourage migration to the preferred node. */
if (dst_nid == p->numa_preferred_nid)
- return 0;
+ return -1;
/* Leaving a core idle is often worse than degrading locality. */
if (env->idle == CPU_IDLE)
- return -1;
+ return 0;
dist = node_distance(src_nid, dst_nid);
if (numa_group) {
@@ -9388,37 +9318,77 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
dst_weight = task_weight(p, dst_nid, dist);
}
- return dst_weight < src_weight;
+ return src_weight - dst_weight;
}
#else
-static inline int migrate_degrades_locality(struct task_struct *p,
+static inline long migrate_degrades_locality(struct task_struct *p,
struct lb_env *env)
{
- return -1;
+ return 0;
}
#endif
/*
+ * Check whether the task is ineligible on the destination cpu
+ *
+ * When the PLACE_LAG scheduling feature is enabled and
+ * dst_cfs_rq->nr_queued is greater than 1, if the task
+ * is ineligible, it will also be ineligible when
+ * it is migrated to the destination cpu.
+ */
+static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_cpu)
+{
+ struct cfs_rq *dst_cfs_rq;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ dst_cfs_rq = task_group(p)->cfs_rq[dest_cpu];
+#else
+ dst_cfs_rq = &cpu_rq(dest_cpu)->cfs;
+#endif
+ if (sched_feat(PLACE_LAG) && dst_cfs_rq->nr_queued &&
+ !entity_eligible(task_cfs_rq(p), &p->se))
+ return 1;
+
+ return 0;
+}
+
+/*
* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
*/
static
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
- int tsk_cache_hot;
+ long degrades, hot;
lockdep_assert_rq_held(env->src_rq);
+ if (p->sched_task_hot)
+ p->sched_task_hot = 0;
/*
* We do not migrate tasks that are:
- * 1) throttled_lb_pair, or
- * 2) cannot be migrated to this CPU due to cpus_ptr, or
- * 3) running (obviously), or
- * 4) are cache-hot on their current CPU.
+ * 1) delayed dequeued unless we migrate load, or
+ * 2) throttled_lb_pair, or
+ * 3) cannot be migrated to this CPU due to cpus_ptr, or
+ * 4) running (obviously), or
+ * 5) are cache-hot on their current CPU.
*/
+ if ((p->se.sched_delayed) && (env->migration_type != migrate_load))
+ return 0;
+
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
return 0;
+ /*
+ * We want to prioritize the migration of eligible tasks.
+ * Ineligible tasks are soft-limited: they are only allowed
+ * to migrate when nr_balance_failed is non-zero, so that
+ * load balancing does not try very hard to balance the load.
+ */
+ if (!env->sd->nr_balance_failed &&
+ task_is_ineligible_on_dst_cpu(p, env->dst_cpu))
+ return 0;
+
/* Disregard percpu kthreads; they are where they need to be. */
if (kthread_is_per_cpu(p))
return 0;
@@ -9474,16 +9444,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (env->flags & LBF_ACTIVE_LB)
return 1;
- tsk_cache_hot = migrate_degrades_locality(p, env);
- if (tsk_cache_hot == -1)
- tsk_cache_hot = task_hot(p, env);
+ degrades = migrate_degrades_locality(p, env);
+ if (!degrades)
+ hot = task_hot(p, env);
+ else
+ hot = degrades > 0;
- if (tsk_cache_hot <= 0 ||
- env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
- if (tsk_cache_hot == 1) {
- schedstat_inc(env->sd->lb_hot_gained[env->idle]);
- schedstat_inc(p->stats.nr_forced_migrations);
- }
+ if (!hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
+ if (hot)
+ p->sched_task_hot = 1;
return 1;
}
@@ -9498,6 +9467,12 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
{
lockdep_assert_rq_held(env->src_rq);
+ if (p->sched_task_hot) {
+ p->sched_task_hot = 0;
+ schedstat_inc(env->sd->lb_hot_gained[env->idle]);
+ schedstat_inc(p->stats.nr_forced_migrations);
+ }
+
deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
set_task_cpu(p, env->dst_cpu);
}
@@ -9658,6 +9633,9 @@ static int detach_tasks(struct lb_env *env)
continue;
next:
+ if (p->sched_task_hot)
+ schedstat_inc(p->stats.nr_failed_migrations_hot);
+
list_move(&p->se.group_node, tasks);
}
@@ -9800,7 +9778,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
update_tg_load_avg(cfs_rq);
- if (cfs_rq->nr_running == 0)
+ if (cfs_rq->nr_queued == 0)
update_idle_cfs_rq_clock_pelt(cfs_rq);
if (cfs_rq == &rq->cfs)
@@ -10332,7 +10310,7 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
* When there is more than 1 task, the group_overloaded case already
* takes care of cpu with reduced capacity
*/
- if (rq->cfs.h_nr_running != 1)
+ if (rq->cfs.h_nr_runnable != 1)
return false;
return check_cpu_capacity(rq, sd);
@@ -10354,7 +10332,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
bool *sg_overloaded,
bool *sg_overutilized)
{
- int i, nr_running, local_group;
+ int i, nr_running, local_group, sd_flags = env->sd->flags;
+ bool balancing_at_rd = !env->sd->parent;
memset(sgs, 0, sizeof(*sgs));
@@ -10367,21 +10346,14 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_load += load;
sgs->group_util += cpu_util_cfs(i);
sgs->group_runnable += cpu_runnable(rq);
- sgs->sum_h_nr_running += rq->cfs.h_nr_running;
+ sgs->sum_h_nr_running += rq->cfs.h_nr_runnable;
nr_running = rq->nr_running;
sgs->sum_nr_running += nr_running;
- if (nr_running > 1)
- *sg_overloaded = 1;
-
if (cpu_overutilized(i))
*sg_overutilized = 1;
-#ifdef CONFIG_NUMA_BALANCING
- sgs->nr_numa_running += rq->nr_numa_running;
- sgs->nr_preferred_running += rq->nr_preferred_running;
-#endif
/*
* No need to call idle_cpu() if nr_running is not 0
*/
@@ -10391,10 +10363,21 @@ static inline void update_sg_lb_stats(struct lb_env *env,
continue;
}
+ /* Overload indicator is only updated at root domain */
+ if (balancing_at_rd && nr_running > 1)
+ *sg_overloaded = 1;
+
+#ifdef CONFIG_NUMA_BALANCING
+ /* Only fbq_classify_group() uses this to classify NUMA groups */
+ if (sd_flags & SD_NUMA) {
+ sgs->nr_numa_running += rq->nr_numa_running;
+ sgs->nr_preferred_running += rq->nr_preferred_running;
+ }
+#endif
if (local_group)
continue;
- if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
+ if (sd_flags & SD_ASYM_CPUCAPACITY) {
/* Check for a misfit task on the cpu */
if (sgs->group_misfit_task_load < rq->misfit_task_load) {
sgs->group_misfit_task_load = rq->misfit_task_load;
@@ -10682,7 +10665,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
sgs->group_util += cpu_util_without(i, p);
sgs->group_runnable += cpu_runnable_without(rq, p);
local = task_running_on_cpu(i, p);
- sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
+ sgs->sum_h_nr_running += rq->cfs.h_nr_runnable - local;
nr_running = rq->nr_running - local;
sgs->sum_nr_running += nr_running;
@@ -11464,7 +11447,7 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env,
if (rt > env->fbq_type)
continue;
- nr_running = rq->cfs.h_nr_running;
+ nr_running = rq->cfs.h_nr_runnable;
if (!nr_running)
continue;
@@ -11623,7 +11606,7 @@ static int need_active_balance(struct lb_env *env)
* available on dst_cpu.
*/
if (env->idle &&
- (env->src_rq->cfs.h_nr_running == 1)) {
+ (env->src_rq->cfs.h_nr_runnable == 1)) {
if ((check_cpu_capacity(env->src_rq, sd)) &&
(capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
return 1;
@@ -11703,6 +11686,28 @@ static int should_we_balance(struct lb_env *env)
return group_balance_cpu(sg) == env->dst_cpu;
}
+static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd,
+ enum cpu_idle_type idle)
+{
+ if (!schedstat_enabled())
+ return;
+
+ switch (env->migration_type) {
+ case migrate_load:
+ __schedstat_add(sd->lb_imbalance_load[idle], env->imbalance);
+ break;
+ case migrate_util:
+ __schedstat_add(sd->lb_imbalance_util[idle], env->imbalance);
+ break;
+ case migrate_task:
+ __schedstat_add(sd->lb_imbalance_task[idle], env->imbalance);
+ break;
+ case migrate_misfit:
+ __schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance);
+ break;
+ }
+}
+
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
@@ -11753,7 +11758,7 @@ redo:
WARN_ON_ONCE(busiest == env.dst_rq);
- schedstat_add(sd->lb_imbalance[idle], env.imbalance);
+ update_lb_imbalance_stat(&env, sd, idle);
env.src_cpu = busiest->cpu;
env.src_rq = busiest;
@@ -12251,16 +12256,13 @@ static inline int on_null_domain(struct rq *rq)
* - When one of the busy CPUs notices that there may be an idle rebalancing
* needed, they will kick the idle load balancer, which then does idle
* load balancing for all the idle CPUs.
- *
- * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED is not set
- * anywhere yet.
*/
static inline int find_new_ilb(void)
{
const struct cpumask *hk_mask;
int ilb_cpu;
- hk_mask = housekeeping_cpumask(HK_TYPE_MISC);
+ hk_mask = housekeeping_cpumask(HK_TYPE_KERNEL_NOISE);
for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) {
@@ -12278,7 +12280,8 @@ static inline int find_new_ilb(void)
* Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU
* SMP function call (IPI).
*
- * We pick the first idle CPU in the HK_TYPE_MISC housekeeping set (if there is one).
+ * We pick the first idle CPU in the HK_TYPE_KERNEL_NOISE housekeeping set
+ * (if there is one).
*/
static void kick_ilb(unsigned int flags)
{
@@ -12366,7 +12369,7 @@ static void nohz_balancer_kick(struct rq *rq)
* If there's a runnable CFS task and the current CPU has reduced
* capacity, kick the ILB to see if there's a better CPU to run on:
*/
- if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
+ if (rq->cfs.h_nr_runnable >= 1 && check_cpu_capacity(rq, sd)) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
@@ -12498,10 +12501,6 @@ void nohz_balance_enter_idle(int cpu)
if (!cpu_active(cpu))
return;
- /* Spare idle load balancing on CPUs that don't want to be disturbed: */
- if (!housekeeping_cpu(cpu, HK_TYPE_SCHED))
- return;
-
/*
* Can be set safely without rq->lock held
* If a clear happens, it will have evaluated last additions because
@@ -12721,13 +12720,6 @@ static void nohz_newidle_balance(struct rq *this_rq)
{
int this_cpu = this_rq->cpu;
- /*
- * This CPU doesn't want to be disturbed by scheduler
- * housekeeping
- */
- if (!housekeeping_cpu(this_cpu, HK_TYPE_SCHED))
- return;
-
/* Will wake up very soon. No time for doing anything else*/
if (this_rq->avg_idle < sysctl_sched_migration_cost)
return;
@@ -12864,11 +12856,11 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
* have been enqueued in the meantime. Since we're not going idle,
* pretend we pulled a task.
*/
- if (this_rq->cfs.h_nr_running && !pulled_task)
+ if (this_rq->cfs.h_nr_queued && !pulled_task)
pulled_task = 1;
/* Is there a task of a high priority class? */
- if (this_rq->nr_running != this_rq->cfs.h_nr_running)
+ if (this_rq->nr_running != this_rq->cfs.h_nr_queued)
pulled_task = -1;
out:
@@ -12889,9 +12881,9 @@ out:
/*
* This softirq handler is triggered via SCHED_SOFTIRQ from two places:
*
- * - directly from the local scheduler_tick() for periodic load balancing
+ * - directly from the local sched_tick() for periodic load balancing
*
- * - indirectly from a remote scheduler_tick() for NOHZ idle balancing
+ * - indirectly from a remote sched_tick() for NOHZ idle balancing
* through the SMP cross-call nohz_csd_func()
*/
static __latent_entropy void sched_balance_softirq(void)
@@ -12982,7 +12974,7 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
* MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
* if we need to give up the CPU.
*/
- if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 &&
+ if (rq->core->core_forceidle_count && rq->cfs.nr_queued == 1 &&
__entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
resched_curr(rq);
}
@@ -13126,7 +13118,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
if (!task_on_rq_queued(p))
return;
- if (rq->cfs.nr_running == 1)
+ if (rq->cfs.nr_queued == 1)
return;
/*
@@ -13536,7 +13528,7 @@ int sched_group_set_idle(struct task_group *tg, long idle)
for_each_possible_cpu(i) {
struct rq *rq = cpu_rq(i);
struct sched_entity *se = tg->se[i];
- struct cfs_rq *parent_cfs_rq, *grp_cfs_rq = tg->cfs_rq[i];
+ struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i];
bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
long idle_task_delta;
struct rq_flags rf;
@@ -13547,16 +13539,8 @@ int sched_group_set_idle(struct task_group *tg, long idle)
if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
goto next_cpu;
- if (se->on_rq) {
- parent_cfs_rq = cfs_rq_of(se);
- if (cfs_rq_is_idle(grp_cfs_rq))
- parent_cfs_rq->idle_nr_running++;
- else
- parent_cfs_rq->idle_nr_running--;
- }
-
- idle_task_delta = grp_cfs_rq->h_nr_running -
- grp_cfs_rq->idle_h_nr_running;
+ idle_task_delta = grp_cfs_rq->h_nr_queued -
+ grp_cfs_rq->h_nr_idle;
if (!cfs_rq_is_idle(grp_cfs_rq))
idle_task_delta *= -1;
@@ -13566,7 +13550,7 @@ int sched_group_set_idle(struct task_group *tg, long idle)
if (!se->on_rq)
break;
- cfs_rq->idle_h_nr_running += idle_task_delta;
+ cfs_rq->h_nr_idle += idle_task_delta;
/* Already accounted at parent level and above. */
if (cfs_rq_is_idle(cfs_rq))
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index a3d331dd2d8ff..3c12d9f93331d 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -32,6 +32,15 @@ SCHED_FEAT(PREEMPT_SHORT, true)
SCHED_FEAT(NEXT_BUDDY, false)
/*
+ * Allow completely ignoring cfs_rq->next, which can be set from various
+ * places:
+ * - NEXT_BUDDY (wakeup preemption)
+ * - yield_to_task()
+ * - cgroup dequeue / pick
+ */
+SCHED_FEAT(PICK_BUDDY, true)
+
+/*
* Consider buddies to be cache hot, decreases the likeliness of a
* cache buddy being migrated away, increases cache locality.
*/
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 621696269584b..2c85c86b455f7 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -452,19 +452,20 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags)
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
dl_server_update_idle_time(rq, prev);
- scx_update_idle(rq, false);
+ scx_update_idle(rq, false, true);
}
static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
{
update_idle_core(rq);
- scx_update_idle(rq, true);
+ scx_update_idle(rq, true, true);
schedstat_inc(rq->sched_goidle);
next->se.exec_start = rq_clock_task(rq);
}
struct task_struct *pick_task_idle(struct rq *rq)
{
+ scx_update_idle(rq, true, false);
return rq->idle;
}
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 5891e715f00d0..81bc8b329ef17 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -9,15 +9,9 @@
*/
enum hk_flags {
- HK_FLAG_TIMER = BIT(HK_TYPE_TIMER),
- HK_FLAG_RCU = BIT(HK_TYPE_RCU),
- HK_FLAG_MISC = BIT(HK_TYPE_MISC),
- HK_FLAG_SCHED = BIT(HK_TYPE_SCHED),
- HK_FLAG_TICK = BIT(HK_TYPE_TICK),
HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN),
- HK_FLAG_WQ = BIT(HK_TYPE_WQ),
HK_FLAG_MANAGED_IRQ = BIT(HK_TYPE_MANAGED_IRQ),
- HK_FLAG_KTHREAD = BIT(HK_TYPE_KTHREAD),
+ HK_FLAG_KERNEL_NOISE = BIT(HK_TYPE_KERNEL_NOISE),
};
DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
@@ -97,7 +91,7 @@ void __init housekeeping_init(void)
static_branch_enable(&housekeeping_overridden);
- if (housekeeping.flags & HK_FLAG_TICK)
+ if (housekeeping.flags & HK_FLAG_KERNEL_NOISE)
sched_tick_offload_init();
for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
@@ -121,7 +115,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
unsigned int first_cpu;
int err = 0;
- if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) {
+ if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE)) {
if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) {
pr_warn("Housekeeping: nohz unsupported."
" Build with CONFIG_NO_HZ_FULL\n");
@@ -177,7 +171,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags)
housekeeping_setup_type(type, housekeeping_staging);
}
- if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK))
+ if ((flags & HK_FLAG_KERNEL_NOISE) && !(housekeeping.flags & HK_FLAG_KERNEL_NOISE))
tick_nohz_full_setup(non_housekeeping_mask);
housekeeping.flags |= flags;
@@ -195,8 +189,7 @@ static int __init housekeeping_nohz_full_setup(char *str)
{
unsigned long flags;
- flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU |
- HK_FLAG_MISC | HK_FLAG_KTHREAD;
+ flags = HK_FLAG_KERNEL_NOISE;
return housekeeping_setup(str, flags);
}
@@ -210,9 +203,12 @@ static int __init housekeeping_isolcpus_setup(char *str)
int len;
while (isalpha(*str)) {
+ /*
+ * isolcpus=nohz is equivalent to nohz_full.
+ */
if (!strncmp(str, "nohz,", 5)) {
str += 5;
- flags |= HK_FLAG_TICK;
+ flags |= HK_FLAG_KERNEL_NOISE;
continue;
}
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index fee75cc2c47b6..7a8534a2deffd 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -275,7 +275,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
*
* group: [ see update_cfs_group() ]
* se_weight() = tg->weight * grq->load_avg / tg->load_avg
- * se_runnable() = grq->h_nr_running
+ * se_runnable() = grq->h_nr_runnable
*
* runnable_sum = se_runnable() * runnable = grq->runnable_sum
* runnable_avg = runnable_sum
@@ -321,7 +321,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
{
if (___update_load_sum(now, &cfs_rq->avg,
scale_load_down(cfs_rq->load.weight),
- cfs_rq->h_nr_running - cfs_rq->h_nr_delayed,
+ cfs_rq->h_nr_runnable,
cfs_rq->curr != NULL)) {
___update_load_avg(&cfs_rq->avg, 1);
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 84dad1511d1e4..bb56805e3d476 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -998,7 +998,7 @@ void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_st
s64 delta;
u64 irq;
- if (static_branch_likely(&psi_disabled))
+ if (static_branch_likely(&psi_disabled) || !irqtime_enabled())
return;
if (!curr->pid)
@@ -1240,6 +1240,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
if (static_branch_likely(&psi_disabled))
return -EOPNOTSUPP;
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ if (!irqtime_enabled() && res == PSI_IRQ)
+ return -EOPNOTSUPP;
+#endif
+
/* Update averages before reporting them */
mutex_lock(&group->avgs_lock);
now = sched_clock();
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index bd66a46b06aca..4b8e33c615b12 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -26,7 +26,7 @@ static int sched_rt_handler(const struct ctl_table *table, int write, void *buff
size_t *lenp, loff_t *ppos);
static int sched_rr_handler(const struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos);
-static struct ctl_table sched_rt_sysctls[] = {
+static const struct ctl_table sched_rt_sysctls[] = {
{
.procname = "sched_rt_period_us",
.data = &sysctl_sched_rt_period,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c5d67a43fe524..c8512a9fb0229 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -362,7 +362,7 @@ extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
extern bool __checkparam_dl(const struct sched_attr *attr);
extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
-extern int dl_bw_check_overflow(int cpu);
+extern int dl_bw_deactivate(int cpu);
extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec);
/*
* SCHED_DEADLINE supports servers (nested scheduling) with the following
@@ -572,7 +572,7 @@ extern void sched_online_group(struct task_group *tg,
extern void sched_destroy_group(struct task_group *tg);
extern void sched_release_group(struct task_group *tg);
-extern void sched_move_task(struct task_struct *tsk);
+extern void sched_move_task(struct task_struct *tsk, bool for_autogroup);
#ifdef CONFIG_FAIR_GROUP_SCHED
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
@@ -650,11 +650,10 @@ struct balance_callback {
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
- unsigned int nr_running;
- unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */
- unsigned int idle_nr_running; /* SCHED_IDLE */
- unsigned int idle_h_nr_running; /* SCHED_IDLE */
- unsigned int h_nr_delayed;
+ unsigned int nr_queued;
+ unsigned int h_nr_queued; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int h_nr_runnable; /* SCHED_{NORMAL,BATCH,IDLE} */
+ unsigned int h_nr_idle; /* SCHED_IDLE */
s64 avg_vruntime;
u64 avg_load;
@@ -760,6 +759,7 @@ enum scx_rq_flags {
SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */
SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */
SCX_RQ_BYPASSING = 1 << 4,
+ SCX_RQ_CLK_VALID = 1 << 5, /* RQ clock is fresh and valid */
SCX_RQ_IN_WAKEUP = 1 << 16,
SCX_RQ_IN_BALANCE = 1 << 17,
@@ -772,9 +772,10 @@ struct scx_rq {
unsigned long ops_qseq;
u64 extra_enq_flags; /* see move_task_to_local_dsq() */
u32 nr_running;
- u32 flags;
u32 cpuperf_target; /* [0, SCHED_CAPACITY_SCALE] */
bool cpu_released;
+ u32 flags;
+ u64 clock; /* current per-rq clock -- see scx_bpf_now() */
cpumask_var_t cpus_to_kick;
cpumask_var_t cpus_to_kick_if_idle;
cpumask_var_t cpus_to_preempt;
@@ -904,11 +905,8 @@ struct dl_rq {
static inline void se_update_runnable(struct sched_entity *se)
{
- if (!entity_is_task(se)) {
- struct cfs_rq *cfs_rq = se->my_q;
-
- se->runnable_weight = cfs_rq->h_nr_running - cfs_rq->h_nr_delayed;
- }
+ if (!entity_is_task(se))
+ se->runnable_weight = se->my_q->h_nr_runnable;
}
static inline long se_runnable(struct sched_entity *se)
@@ -1726,6 +1724,38 @@ struct rq_flags {
extern struct balance_callback balance_push_callback;
+#ifdef CONFIG_SCHED_CLASS_EXT
+extern const struct sched_class ext_sched_class;
+
+DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); /* SCX BPF scheduler loaded */
+DECLARE_STATIC_KEY_FALSE(__scx_switched_all); /* all fair class tasks on SCX */
+
+#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled)
+#define scx_switched_all() static_branch_unlikely(&__scx_switched_all)
+
+static inline void scx_rq_clock_update(struct rq *rq, u64 clock)
+{
+ if (!scx_enabled())
+ return;
+ WRITE_ONCE(rq->scx.clock, clock);
+ smp_store_release(&rq->scx.flags, rq->scx.flags | SCX_RQ_CLK_VALID);
+}
+
+static inline void scx_rq_clock_invalidate(struct rq *rq)
+{
+ if (!scx_enabled())
+ return;
+ WRITE_ONCE(rq->scx.flags, rq->scx.flags & ~SCX_RQ_CLK_VALID);
+}
+
+#else /* !CONFIG_SCHED_CLASS_EXT */
+#define scx_enabled() false
+#define scx_switched_all() false
+
+static inline void scx_rq_clock_update(struct rq *rq, u64 clock) {}
+static inline void scx_rq_clock_invalidate(struct rq *rq) {}
+#endif /* !CONFIG_SCHED_CLASS_EXT */
+
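scx_rq_clock_update() publishes the clock with WRITE_ONCE() and only then sets
SCX_RQ_CLK_VALID via smp_store_release(), so a reader that observes the flag
with an acquire load is guaranteed to also see the clock value. A hypothetical
reader pairing with it (the real consumer is scx_bpf_now(), which is not part
of this hunk) might look like:

/* Hypothetical reader; name and fallback handling are illustrative only. */
static inline u64 scx_rq_clock_read(struct rq *rq, u64 fallback)
{
	/* acquire pairs with the smp_store_release() in scx_rq_clock_update() */
	u32 flags = smp_load_acquire(&rq->scx.flags);

	if (flags & SCX_RQ_CLK_VALID)
		return READ_ONCE(rq->scx.clock);

	/* clock was invalidated, e.g. after rq_unpin_lock(); use the fallback */
	return fallback;
}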
/*
* Lockdep annotation that avoids accidental unlocks; it's like a
* sticky/continuous lockdep_assert_held().
@@ -1755,7 +1785,7 @@ static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)
if (rq->clock_update_flags > RQCF_ACT_SKIP)
rf->clock_update_flags = RQCF_UPDATED;
#endif
-
+ scx_rq_clock_invalidate(rq);
lockdep_unpin_lock(__rq_lockp(rq), rf->cookie);
}
@@ -2280,7 +2310,7 @@ static inline int task_on_cpu(struct rq *rq, struct task_struct *p)
static inline int task_on_rq_queued(struct task_struct *p)
{
- return p->on_rq == TASK_ON_RQ_QUEUED;
+ return READ_ONCE(p->on_rq) == TASK_ON_RQ_QUEUED;
}
static inline int task_on_rq_migrating(struct task_struct *p)
@@ -2514,19 +2544,6 @@ extern const struct sched_class rt_sched_class;
extern const struct sched_class fair_sched_class;
extern const struct sched_class idle_sched_class;
-#ifdef CONFIG_SCHED_CLASS_EXT
-extern const struct sched_class ext_sched_class;
-
-DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); /* SCX BPF scheduler loaded */
-DECLARE_STATIC_KEY_FALSE(__scx_switched_all); /* all fair class tasks on SCX */
-
-#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled)
-#define scx_switched_all() static_branch_unlikely(&__scx_switched_all)
-#else /* !CONFIG_SCHED_CLASS_EXT */
-#define scx_enabled() false
-#define scx_switched_all() false
-#endif /* !CONFIG_SCHED_CLASS_EXT */
-
/*
* Iterate only active classes. SCX can take over all fair tasks or be
* completely disabled. If the former, skip fair. If the latter, skip SCX.
@@ -2574,7 +2591,7 @@ static inline bool sched_rt_runnable(struct rq *rq)
static inline bool sched_fair_runnable(struct rq *rq)
{
- return rq->cfs.nr_running > 0;
+ return rq->cfs.nr_queued > 0;
}
extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
@@ -3242,6 +3259,12 @@ struct irqtime {
};
DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
+DECLARE_STATIC_KEY_FALSE(sched_clock_irqtime);
+
+static inline int irqtime_enabled(void)
+{
+ return static_branch_likely(&sched_clock_irqtime);
+}
/*
* Returns the irqtime minus the softirq time computed by ksoftirqd.
@@ -3262,6 +3285,13 @@ static inline u64 irq_time_read(int cpu)
return total;
}
+#else
+
+static inline int irqtime_enabled(void)
+{
+ return 0;
+}
+
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
#ifdef CONFIG_CPU_FREQ
@@ -3509,6 +3539,8 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned
#endif /* !CONFIG_HAVE_SCHED_AVG_IRQ */
+extern void __setparam_fair(struct task_struct *p, const struct sched_attr *attr);
+
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus)))
@@ -3666,10 +3698,28 @@ static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm)
{
struct cpumask *cidmask = mm_cidmask(mm);
struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
- int cid = __this_cpu_read(pcpu_cid->recent_cid);
+ int cid, max_nr_cid, allowed_max_nr_cid;
+ /*
+ * After shrinking the number of threads or reducing the number
+ * of allowed cpus, reduce the value of max_nr_cid so expansion
+ * of cid allocation will preserve cache locality if the number
+ * of threads or allowed cpus increases again.
+ */
+ max_nr_cid = atomic_read(&mm->max_nr_cid);
+ while ((allowed_max_nr_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed),
+ atomic_read(&mm->mm_users))),
+ max_nr_cid > allowed_max_nr_cid) {
+ /* atomic_try_cmpxchg loads previous mm->max_nr_cid into max_nr_cid. */
+ if (atomic_try_cmpxchg(&mm->max_nr_cid, &max_nr_cid, allowed_max_nr_cid)) {
+ max_nr_cid = allowed_max_nr_cid;
+ break;
+ }
+ }
/* Try to re-use recent cid. This improves cache locality. */
- if (!mm_cid_is_unset(cid) && !cpumask_test_and_set_cpu(cid, cidmask))
+ cid = __this_cpu_read(pcpu_cid->recent_cid);
+ if (!mm_cid_is_unset(cid) && cid < max_nr_cid &&
+ !cpumask_test_and_set_cpu(cid, cidmask))
return cid;
/*
* Expand cid allocation if the maximum number of concurrency
@@ -3677,8 +3727,9 @@ static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm)
* and number of threads. Expanding cid allocation as much as
* possible improves cache locality.
*/
- cid = atomic_read(&mm->max_nr_cid);
+ cid = max_nr_cid;
while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) {
+ /* atomic_try_cmpxchg loads previous mm->max_nr_cid into cid. */
if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1))
continue;
if (!cpumask_test_and_set_cpu(cid, cidmask))
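The shrink loop added above leans on the comma operator: allowed_max_nr_cid is
recomputed on every pass, the loop only runs while the stored max_nr_cid
exceeds it, and atomic_try_cmpxchg() either shrinks mm->max_nr_cid or reloads
the current value into max_nr_cid for a retry. The same logic written out
without the comma operator, as a readability sketch rather than a proposed
change:

/* Expanded equivalent of the shrink loop; sketch only. */
max_nr_cid = atomic_read(&mm->max_nr_cid);
for (;;) {
	allowed_max_nr_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed),
				   atomic_read(&mm->mm_users));
	if (max_nr_cid <= allowed_max_nr_cid)
		break;				/* nothing to shrink */
	if (atomic_try_cmpxchg(&mm->max_nr_cid, &max_nr_cid, allowed_max_nr_cid)) {
		max_nr_cid = allowed_max_nr_cid;
		break;				/* shrink succeeded */
	}
	/* lost the race: max_nr_cid was reloaded, re-evaluate and retry */
}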
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index eb0cdcd4d9212..4346fd81c31fd 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -103,7 +103,7 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p,
* Bump this up when changing the output format or the meaning of an existing
* format, so that tools can adapt (or abort)
*/
-#define SCHEDSTAT_VERSION 16
+#define SCHEDSTAT_VERSION 17
static int show_schedstat(struct seq_file *seq, void *v)
{
@@ -138,14 +138,17 @@ static int show_schedstat(struct seq_file *seq, void *v)
for_each_domain(cpu, sd) {
enum cpu_idle_type itype;
- seq_printf(seq, "domain%d %*pb", dcount++,
+ seq_printf(seq, "domain%d %s %*pb", dcount++, sd->name,
cpumask_pr_args(sched_domain_span(sd)));
for (itype = 0; itype < CPU_MAX_IDLE_TYPES; itype++) {
- seq_printf(seq, " %u %u %u %u %u %u %u %u",
+ seq_printf(seq, " %u %u %u %u %u %u %u %u %u %u %u",
sd->lb_count[itype],
sd->lb_balanced[itype],
sd->lb_failed[itype],
- sd->lb_imbalance[itype],
+ sd->lb_imbalance_load[itype],
+ sd->lb_imbalance_util[itype],
+ sd->lb_imbalance_task[itype],
+ sd->lb_imbalance_misfit[itype],
sd->lb_gained[itype],
sd->lb_hot_gained[itype],
sd->lb_nobusyq[itype],
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 8ee0add5a48a8..19cdbe96f93de 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -138,6 +138,10 @@ static inline void psi_enqueue(struct task_struct *p, int flags)
if (flags & ENQUEUE_RESTORE)
return;
+ /* psi_sched_switch() will handle the flags */
+ if (task_on_cpu(task_rq(p), p))
+ return;
+
if (p->se.sched_delayed) {
/* CPU migration of "sleeping" task */
SCHED_WARN_ON(!(flags & ENQUEUE_MIGRATED));
@@ -244,7 +248,10 @@ static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t)
delta = rq_clock(rq) - t->sched_info.last_queued;
t->sched_info.last_queued = 0;
t->sched_info.run_delay += delta;
-
+ if (delta > t->sched_info.max_run_delay)
+ t->sched_info.max_run_delay = delta;
+ if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay))
+ t->sched_info.min_run_delay = delta;
rq_sched_info_dequeue(rq, delta);
}
@@ -266,6 +273,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
t->sched_info.run_delay += delta;
t->sched_info.last_arrival = now;
t->sched_info.pcount++;
+ if (delta > t->sched_info.max_run_delay)
+ t->sched_info.max_run_delay = delta;
+ if (delta && (!t->sched_info.min_run_delay || delta < t->sched_info.min_run_delay))
+ t->sched_info.min_run_delay = delta;
rq_sched_info_arrive(rq, delta);
}
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index ff0e5ab4e37cb..456d339be98fb 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -300,20 +300,10 @@ static void __setscheduler_params(struct task_struct *p,
p->policy = policy;
- if (dl_policy(policy)) {
+ if (dl_policy(policy))
__setparam_dl(p, attr);
- } else if (fair_policy(policy)) {
- p->static_prio = NICE_TO_PRIO(attr->sched_nice);
- if (attr->sched_runtime) {
- p->se.custom_slice = 1;
- p->se.slice = clamp_t(u64, attr->sched_runtime,
- NSEC_PER_MSEC/10, /* HZ=1000 * 10 */
- NSEC_PER_MSEC*100); /* HZ=100 / 10 */
- } else {
- p->se.custom_slice = 0;
- p->se.slice = sysctl_sched_base_slice;
- }
- }
+ else if (fair_policy(policy))
+ __setparam_fair(p, attr);
/* rt-policy tasks do not have a timerslack */
if (rt_or_dl_task_policy(p)) {
@@ -1140,6 +1130,13 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
return 0;
/*
+ * The special/sugov task isn't part of regular bandwidth/admission
+ * control so let userspace change affinities.
+ */
+ if (dl_entity_is_special(&p->dl))
+ return 0;
+
+ /*
* Since bandwidth control happens on root_domain basis,
* if admission test is enabled, we only admit -deadline
* tasks allowed to run on all the CPUs in the task's
@@ -1433,7 +1430,7 @@ int __sched yield_to(struct task_struct *p, bool preempt)
struct rq *rq, *p_rq;
int yielded = 0;
- scoped_guard (irqsave) {
+ scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
rq = this_rq();
again:
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 9748a4c8d6685..c49aea8c10254 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -312,7 +312,7 @@ static int sched_energy_aware_handler(const struct ctl_table *table, int write,
return ret;
}
-static struct ctl_table sched_energy_aware_sysctls[] = {
+static const struct ctl_table sched_energy_aware_sysctls[] = {
{
.procname = "sched_energy_aware",
.data = &sysctl_sched_energy_aware,
@@ -1635,9 +1635,7 @@ sd_init(struct sched_domain_topology_level *tl,
.max_newidle_lb_cost = 0,
.last_decay_max_lb_cost = jiffies,
.child = child,
-#ifdef CONFIG_SCHED_DEBUG
.name = tl->name,
-#endif
};
sd_span = sched_domain_span(sd);
@@ -2338,10 +2336,8 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
if (!cpumask_subset(sched_domain_span(child),
sched_domain_span(sd))) {
pr_err("BUG: arch topology borken\n");
-#ifdef CONFIG_SCHED_DEBUG
pr_err(" the %s domain not a subset of the %s domain\n",
child->name, sd->name);
-#endif
/* Fixup, ensure @sd has at least @child CPUs. */
cpumask_or(sched_domain_span(sd),
sched_domain_span(sd),
@@ -2721,9 +2717,11 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
/*
* This domain won't be destroyed and as such
- * its dl_bw->total_bw needs to be cleared. It
- * will be recomputed in function
- * update_tasks_root_domain().
+ * its dl_bw->total_bw needs to be cleared.
+ * Tasks contribution will be then recomputed
+ * in function dl_update_tasks_root_domain(),
+ * dl_servers contribution in function
+ * dl_restore_server_root_domain().
*/
rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
dl_clear_root_domain(rd);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 385d48293a5fa..7bbb408431ebc 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -749,6 +749,15 @@ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog,
if (WARN_ON_ONCE(!fprog))
return false;
+ /* Our single exception to filtering. */
+#ifdef __NR_uretprobe
+#ifdef SECCOMP_ARCH_COMPAT
+ if (sd->arch == SECCOMP_ARCH_NATIVE)
+#endif
+ if (sd->nr == __NR_uretprobe)
+ return true;
+#endif
+
for (pc = 0; pc < fprog->len; pc++) {
struct sock_filter *insn = &fprog->filter[pc];
u16 code = insn->code;
@@ -1023,6 +1032,9 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
*/
static const int mode1_syscalls[] = {
__NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
+#ifdef __NR_uretprobe
+ __NR_uretprobe,
+#endif
-1, /* negative terminated */
};
@@ -2450,7 +2462,7 @@ static int seccomp_actions_logged_handler(const struct ctl_table *ro_table, int
return ret;
}
-static struct ctl_table seccomp_sysctl_table[] = {
+static const struct ctl_table seccomp_sysctl_table[] = {
{
.procname = "actions_avail",
.data = (void *) &seccomp_actions_avail,
diff --git a/kernel/signal.c b/kernel/signal.c
index 989b1cc9116a2..875e97f6205a2 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2007,11 +2007,22 @@ void posixtimer_send_sigqueue(struct k_itimer *tmr)
if (!list_empty(&q->list)) {
/*
- * If task group is exiting with the signal already pending,
- * wait for __exit_signal() to do its job. Otherwise if
- * ignored, it's not supposed to be queued. Try to survive.
+ * The signal was ignored and blocked. The timer
+ * expiry queued it because blocked signals are
+ * queued independent of the ignored state.
+ *
+ * The unblocking set SIGPENDING, but the signal
+ * was not yet dequeued from the pending list.
+ * So prepare_signal() sees unblocked and ignored,
+ * which ends up here. Leave it queued like a
+ * regular signal.
+ *
+ * The same happens when the task group is exiting
+ * and the signal is already queued.
+ * prepare_signal() treats SIGNAL_GROUP_EXIT as
+ * ignored independent of its queued state. This
+ * gets cleaned up in __exit_signal().
*/
- WARN_ON_ONCE(!(t->signal->flags & SIGNAL_GROUP_EXIT));
goto out;
}
@@ -2046,17 +2057,25 @@ void posixtimer_send_sigqueue(struct k_itimer *tmr)
goto out;
}
- /* This should never happen and leaks a reference count */
- if (WARN_ON_ONCE(!hlist_unhashed(&tmr->ignored_list)))
- hlist_del_init(&tmr->ignored_list);
-
if (unlikely(!list_empty(&q->list))) {
/* This holds a reference count already */
result = TRACE_SIGNAL_ALREADY_PENDING;
goto out;
}
- posixtimer_sigqueue_getref(q);
+ /*
+ * If the signal is on the ignore list, it got blocked after it was
+ * ignored earlier. But nothing lifted the ignore. Move it back to
+ * the pending list to be consistent with the regular signal
+ * handling. This already holds a reference count.
+ *
+ * If it's not on the ignore list acquire a reference count.
+ */
+ if (likely(hlist_unhashed(&tmr->ignored_list)))
+ posixtimer_sigqueue_getref(q);
+ else
+ hlist_del_init(&tmr->ignored_list);
+
posixtimer_queue_sigqueue(q, t, tmr->it_pid_type);
result = TRACE_SIGNAL_DELIVERED;
out:
@@ -4931,7 +4950,7 @@ static inline void siginfo_buildtime_checks(void)
}
#if defined(CONFIG_SYSCTL)
-static struct ctl_table signal_debug_table[] = {
+static const struct ctl_table signal_debug_table[] = {
#ifdef CONFIG_SYSCTL_EXCEPTION_TRACE
{
.procname = "exception-trace",
diff --git a/kernel/smp.c b/kernel/smp.c
index 27dc31a146a35..974f3a3962e8d 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -170,9 +170,9 @@ static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func);
static DEFINE_PER_CPU(void *, cur_csd_info);
static ulong csd_lock_timeout = 5000; /* CSD lock timeout in milliseconds. */
-module_param(csd_lock_timeout, ulong, 0444);
+module_param(csd_lock_timeout, ulong, 0644);
static int panic_on_ipistall; /* CSD panic timeout in milliseconds, 300000 for five minutes. */
-module_param(panic_on_ipistall, int, 0444);
+module_param(panic_on_ipistall, int, 0644);
static atomic_t csd_bug_count = ATOMIC_INIT(0);
@@ -815,7 +815,8 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
WARN_ON_ONCE(!in_task());
/* Check if we need local execution. */
- if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask))
+ if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask) &&
+ (!cond_func || cond_func(this_cpu, info)))
run_local = true;
/* Check if we need remote execution, i.e., any CPU excluding this one. */
@@ -868,7 +869,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
send_call_function_ipi_mask(cfd->cpumask_ipi);
}
- if (run_local && (!cond_func || cond_func(this_cpu, info))) {
+ if (run_local) {
unsigned long flags;
local_irq_save(flags);
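The smp.c change above applies cond_func to the local CPU at the same point where remote CPUs are filtered, so run_local is only set when the predicate accepts this CPU instead of being re-tested at execution time. A self-contained sketch of that shape follows; names are illustrative, and the real code batches the remote CPUs into per-CPU csd entries and sends a single IPI mask.

#include <stdbool.h>

typedef bool (*cond_fn)(int cpu, void *info);
typedef void (*call_fn)(void *info);

static void sketch_call_many_cond(const bool *mask, int ncpus, int this_cpu,
				  cond_fn cond, call_fn fn, void *info)
{
	for (int cpu = 0; cpu < ncpus; cpu++) {
		if (!mask[cpu])
			continue;
		if (cond && !cond(cpu, info))
			continue;	/* filtered out, locally and remotely */
		if (cpu == this_cpu)
			fn(info);	/* local execution */
		/* else: queue a csd for @cpu and send one IPI mask later */
	}
}
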
diff --git a/kernel/stackleak.c b/kernel/stackleak.c
index 39fd620a7db6f..bb65321761b43 100644
--- a/kernel/stackleak.c
+++ b/kernel/stackleak.c
@@ -15,6 +15,7 @@
#ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE
#include <linux/jump_label.h>
+#include <linux/string_choices.h>
#include <linux/sysctl.h>
#include <linux/init.h>
@@ -41,10 +42,10 @@ static int stack_erasing_sysctl(const struct ctl_table *table, int write,
static_branch_enable(&stack_erasing_bypass);
pr_warn("stackleak: kernel stack erasing is %s\n",
- state ? "enabled" : "disabled");
+ str_enabled_disabled(state));
return ret;
}
-static struct ctl_table stackleak_sysctls[] = {
+static const struct ctl_table stackleak_sysctls[] = {
{
.procname = "stack_erasing",
.data = NULL,
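For reference, the helper pulled in from <linux/string_choices.h> is equivalent to the open-coded ternary it replaces; a minimal sketch of its behavior (the kernel header provides the real definition):

#include <stdbool.h>

static inline const char *sketch_str_enabled_disabled(bool v)
{
	return v ? "enabled" : "disabled";
}
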
diff --git a/kernel/static_call_inline.c b/kernel/static_call_inline.c
index 5259cda486d05..bb7d066a7c397 100644
--- a/kernel/static_call_inline.c
+++ b/kernel/static_call_inline.c
@@ -15,7 +15,7 @@ extern struct static_call_site __start_static_call_sites[],
extern struct static_call_tramp_key __start_static_call_tramp_key[],
__stop_static_call_tramp_key[];
-static int static_call_initialized;
+int static_call_initialized;
/*
* Must be called before early_initcall() to be effective.
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index da821ce258ea7..8896d844d738f 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -250,8 +250,8 @@ static int multi_cpu_stop(void *data)
* be detected and reported on their side.
*/
touch_nmi_watchdog();
+ rcu_momentary_eqs();
}
- rcu_momentary_eqs();
} while (curstate != MULTI_STOP_EXIT);
local_irq_restore(flags);
diff --git a/kernel/sys.c b/kernel/sys.c
index c4c701c6f0b4d..cb366ff8703af 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -75,6 +75,8 @@
#include <asm/io.h>
#include <asm/unistd.h>
+#include <trace/events/task.h>
+
#include "uid16.h"
#ifndef SET_UNALIGN_CTL
@@ -2810,6 +2812,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = arch_lock_shadow_stack_status(me, arg2);
break;
default:
+ trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5);
error = -EINVAL;
break;
}
diff --git a/kernel/sysctl-test.c b/kernel/sysctl-test.c
index 3ac98bb7fb822..eb2842bd05577 100644
--- a/kernel/sysctl-test.c
+++ b/kernel/sysctl-test.c
@@ -374,7 +374,7 @@ static void sysctl_test_register_sysctl_sz_invalid_extra_value(
struct kunit *test)
{
unsigned char data = 0;
- struct ctl_table table_foo[] = {
+ const struct ctl_table table_foo[] = {
{
.procname = "foo",
.data = &data,
@@ -386,7 +386,7 @@ static void sysctl_test_register_sysctl_sz_invalid_extra_value(
},
};
- struct ctl_table table_bar[] = {
+ const struct ctl_table table_bar[] = {
{
.procname = "bar",
.data = &data,
@@ -398,7 +398,7 @@ static void sysctl_test_register_sysctl_sz_invalid_extra_value(
},
};
- struct ctl_table table_qux[] = {
+ const struct ctl_table table_qux[] = {
{
.procname = "qux",
.data = &data,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5c9202cb8f59f..cb57da499ebb1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1609,7 +1609,7 @@ int proc_do_static_key(const struct ctl_table *table, int write,
return ret;
}
-static struct ctl_table kern_table[] = {
+static const struct ctl_table kern_table[] = {
{
.procname = "panic",
.data = &panic_timeout,
@@ -1804,15 +1804,6 @@ static struct ctl_table kern_table[] = {
},
#endif
{
- .procname = "pid_max",
- .data = &pid_max,
- .maxlen = sizeof (int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &pid_max_min,
- .extra2 = &pid_max_max,
- },
- {
.procname = "panic_on_oops",
.data = &panic_on_oops,
.maxlen = sizeof(int),
@@ -2030,7 +2021,7 @@ static struct ctl_table kern_table[] = {
#endif
};
-static struct ctl_table vm_table[] = {
+static const struct ctl_table vm_table[] = {
{
.procname = "overcommit_memory",
.data = &sysctl_overcommit_memory,
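The same constification is applied to every sysctl table touched in this series (seccomp, signal, stackleak, the kern_table and vm_table above, timer): the registration path no longer needs to modify the table, so it can be const-qualified and placed in read-only data. Below is a minimal sketch of the resulting pattern; the names are illustrative, and it assumes register_sysctl() accepts a const table, as it does after this series.

#include <linux/sysctl.h>
#include <linux/init.h>

static int example_knob;	/* illustrative variable backing the knob */

static const struct ctl_table example_table[] = {
	{
		.procname	= "example_knob",
		.data		= &example_knob,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
};

static int __init example_sysctl_init(void)
{
	register_sysctl("kernel", example_table);	/* assumed const-aware */
	return 0;
}
late_initcall(example_sysctl_init);
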
diff --git a/kernel/task_work.c b/kernel/task_work.c
index c969f1f26be58..d1efec571a4a4 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -55,26 +55,14 @@ int task_work_add(struct task_struct *task, struct callback_head *work,
enum task_work_notify_mode notify)
{
struct callback_head *head;
- int flags = notify & TWA_FLAGS;
- notify &= ~TWA_FLAGS;
if (notify == TWA_NMI_CURRENT) {
if (WARN_ON_ONCE(task != current))
return -EINVAL;
if (!IS_ENABLED(CONFIG_IRQ_WORK))
return -EINVAL;
} else {
- /*
- * Record the work call stack in order to print it in KASAN
- * reports.
- *
- * Note that stack allocation can fail if TWAF_NO_ALLOC flag
- * is set and new page is needed to expand the stack buffer.
- */
- if (flags & TWAF_NO_ALLOC)
- kasan_record_aux_stack_noalloc(work);
- else
- kasan_record_aux_stack(work);
+ kasan_record_aux_stack(work);
}
head = READ_ONCE(task->task_works);
diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c
index 62e73444ffe45..38dae590b29f5 100644
--- a/kernel/time/clocksource-wdtest.c
+++ b/kernel/time/clocksource-wdtest.c
@@ -137,7 +137,8 @@ static int wdtest_func(void *arg)
udelay(1);
j2 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime);
pr_info("--- tsc-like times: %lu - %lu = %lu.\n", j2, j1, j2 - j1);
- WARN_ON_ONCE(time_before(j2, j1 + NSEC_PER_USEC));
+ WARN_ONCE(time_before(j2, j1 + NSEC_PER_USEC),
+ "Expected at least 1000ns, got %lu.\n", j2 - j1);
/* Verify tsc-like stability with various numbers of errors injected. */
max_retries = clocksource_get_max_watchdog_retry();
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 7304d7cf47f2d..2a7802ec480cc 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -373,16 +373,18 @@ void clocksource_verify_percpu(struct clocksource *cs)
cpumask_clear(&cpus_ahead);
cpumask_clear(&cpus_behind);
cpus_read_lock();
- preempt_disable();
+ migrate_disable();
clocksource_verify_choose_cpus();
if (cpumask_empty(&cpus_chosen)) {
- preempt_enable();
+ migrate_enable();
cpus_read_unlock();
pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name);
return;
}
testcpu = smp_processor_id();
- pr_warn("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", cs->name, testcpu, cpumask_pr_args(&cpus_chosen));
+ pr_info("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n",
+ cs->name, testcpu, cpumask_pr_args(&cpus_chosen));
+ preempt_disable();
for_each_cpu(cpu, &cpus_chosen) {
if (cpu == testcpu)
continue;
@@ -402,6 +404,7 @@ void clocksource_verify_percpu(struct clocksource *cs)
cs_nsec_min = cs_nsec;
}
preempt_enable();
+ migrate_enable();
cpus_read_unlock();
if (!cpumask_empty(&cpus_ahead))
pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n",
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 80fe3749d2db1..deb1aa32814e3 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -58,6 +58,8 @@
#define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT)
#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD)
+static void retrigger_next_event(void *arg);
+
/*
* The timer bases:
*
@@ -111,7 +113,8 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
.clockid = CLOCK_TAI,
.get_time = &ktime_get_clocktai,
},
- }
+ },
+ .csd = CSD_INIT(retrigger_next_event, NULL)
};
static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
@@ -124,6 +127,14 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
[CLOCK_TAI] = HRTIMER_BASE_TAI,
};
+static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base)
+{
+ if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return true;
+ else
+ return likely(base->online);
+}
+
/*
* Functions and macros which are different for UP/SMP systems are kept in a
* single place
@@ -145,11 +156,6 @@ static struct hrtimer_cpu_base migration_cpu_base = {
#define migration_base migration_cpu_base.clock_base[0]
-static inline bool is_migration_base(struct hrtimer_clock_base *base)
-{
- return base == &migration_base;
-}
-
/*
* We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
* means that all timers which are tied to this base via timer->base are
@@ -183,27 +189,54 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
}
/*
- * We do not migrate the timer when it is expiring before the next
- * event on the target cpu. When high resolution is enabled, we cannot
- * reprogram the target cpu hardware and we would cause it to fire
- * late. To keep it simple, we handle the high resolution enabled and
- * disabled case similar.
+ * Check if the elected target is suitable considering its next
+ * event and the hotplug state of the current CPU.
+ *
+ * If the elected target is remote and its next event is after the timer
+ * to queue, then a remote reprogram is necessary. However there is no
+ * guarantee the IPI handling the operation would arrive in time to meet
+ * the high resolution deadline. In this case the local CPU becomes a
+ * preferred target, unless it is offline.
+ *
+ * High and low resolution modes are handled the same way for simplicity.
*
* Called with cpu_base->lock of target cpu held.
*/
-static int
-hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
+static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base,
+ struct hrtimer_cpu_base *new_cpu_base,
+ struct hrtimer_cpu_base *this_cpu_base)
{
ktime_t expires;
+ /*
+ * The local CPU clockevent can be reprogrammed. Also get_target_base()
+ * guarantees it is online.
+ */
+ if (new_cpu_base == this_cpu_base)
+ return true;
+
+ /*
+ * The offline local CPU can't be the default target if the
+ * next remote target event is after this timer. Keep the
+	 * elected new base. An IPI will be issued to reprogram
+ * it as a last resort.
+ */
+ if (!hrtimer_base_is_online(this_cpu_base))
+ return true;
+
expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset);
- return expires < new_base->cpu_base->expires_next;
+
+ return expires >= new_base->cpu_base->expires_next;
}
-static inline
-struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
- int pinned)
+static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned)
{
+ if (!hrtimer_base_is_online(base)) {
+ int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER));
+
+ return &per_cpu(hrtimer_bases, cpu);
+ }
+
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
if (static_branch_likely(&timers_migration_enabled) && !pinned)
return &per_cpu(hrtimer_bases, get_nohz_timer_target());
@@ -254,8 +287,8 @@ again:
raw_spin_unlock(&base->cpu_base->lock);
raw_spin_lock(&new_base->cpu_base->lock);
- if (new_cpu_base != this_cpu_base &&
- hrtimer_check_target(timer, new_base)) {
+ if (!hrtimer_suitable_target(timer, new_base, new_cpu_base,
+ this_cpu_base)) {
raw_spin_unlock(&new_base->cpu_base->lock);
raw_spin_lock(&base->cpu_base->lock);
new_cpu_base = this_cpu_base;
@@ -264,8 +297,7 @@ again:
}
WRITE_ONCE(timer->base, new_base);
} else {
- if (new_cpu_base != this_cpu_base &&
- hrtimer_check_target(timer, new_base)) {
+ if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) {
new_cpu_base = this_cpu_base;
goto again;
}
@@ -275,11 +307,6 @@ again:
#else /* CONFIG_SMP */
-static inline bool is_migration_base(struct hrtimer_clock_base *base)
-{
- return false;
-}
-
static inline struct hrtimer_clock_base *
lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
__acquires(&timer->base->cpu_base->lock)
@@ -716,8 +743,6 @@ static inline int hrtimer_is_hres_enabled(void)
return hrtimer_hres_enabled;
}
-static void retrigger_next_event(void *arg);
-
/*
* Switch to high resolution mode
*/
@@ -1067,11 +1092,10 @@ EXPORT_SYMBOL_GPL(hrtimer_forward);
* The timer is inserted in expiry order. Insertion into the
* red black tree is O(log(n)). Must hold the base lock.
*
- * Returns 1 when the new timer is the leftmost timer in the tree.
+ * Returns true when the new timer is the leftmost timer in the tree.
*/
-static int enqueue_hrtimer(struct hrtimer *timer,
- struct hrtimer_clock_base *base,
- enum hrtimer_mode mode)
+static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base,
+ enum hrtimer_mode mode)
{
debug_activate(timer, mode);
WARN_ON_ONCE(!base->cpu_base->online);
@@ -1206,6 +1230,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
u64 delta_ns, const enum hrtimer_mode mode,
struct hrtimer_clock_base *base)
{
+ struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases);
struct hrtimer_clock_base *new_base;
bool force_local, first;
@@ -1217,10 +1242,16 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
* and enforce reprogramming after it is queued no matter whether
* it is the new first expiring timer again or not.
*/
- force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
+ force_local = base->cpu_base == this_cpu_base;
force_local &= base->cpu_base->next_timer == timer;
/*
+	 * Don't force local queuing if this enqueue happens on an unplugged
+ * CPU after hrtimer_cpu_dying() has been invoked.
+ */
+ force_local &= this_cpu_base->online;
+
+ /*
* Remove an active timer from the queue. In case it is not queued
* on the current CPU, make sure that remove_hrtimer() updates the
* remote data correctly.
@@ -1249,8 +1280,27 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
}
first = enqueue_hrtimer(timer, new_base, mode);
- if (!force_local)
- return first;
+ if (!force_local) {
+ /*
+ * If the current CPU base is online, then the timer is
+ * never queued on a remote CPU if it would be the first
+ * expiring timer there.
+ */
+ if (hrtimer_base_is_online(this_cpu_base))
+ return first;
+
+ /*
+ * Timer was enqueued remote because the current base is
+ * already offline. If the timer is the first to expire,
+ * kick the remote CPU to reprogram the clock event.
+ */
+ if (first) {
+ struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base;
+
+ smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd);
+ }
+ return 0;
+ }
/*
* Timer was forced to stay on the current CPU to avoid
@@ -1371,6 +1421,18 @@ static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base,
}
}
+#ifdef CONFIG_SMP
+static __always_inline bool is_migration_base(struct hrtimer_clock_base *base)
+{
+ return base == &migration_base;
+}
+#else
+static __always_inline bool is_migration_base(struct hrtimer_clock_base *base)
+{
+ return false;
+}
+#endif
+
/*
* This function is called on PREEMPT_RT kernels when the fast path
* deletion of a timer failed because the timer callback function was
@@ -2202,6 +2264,15 @@ int hrtimers_prepare_cpu(unsigned int cpu)
}
cpu_base->cpu = cpu;
+ hrtimer_cpu_base_init_expiry_lock(cpu_base);
+ return 0;
+}
+
+int hrtimers_cpu_starting(unsigned int cpu)
+{
+ struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+
+ /* Clear out any left over state from a CPU down operation */
cpu_base->active_bases = 0;
cpu_base->hres_active = 0;
cpu_base->hang_detected = 0;
@@ -2210,7 +2281,6 @@ int hrtimers_prepare_cpu(unsigned int cpu)
cpu_base->expires_next = KTIME_MAX;
cpu_base->softirq_expires_next = KTIME_MAX;
cpu_base->online = 1;
- hrtimer_cpu_base_init_expiry_lock(cpu_base);
return 0;
}
@@ -2286,5 +2356,6 @@ int hrtimers_cpu_dying(unsigned int dying_cpu)
void __init hrtimers_init(void)
{
hrtimers_prepare_cpu(smp_processor_id());
+ hrtimers_cpu_starting(smp_processor_id());
open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq);
}
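hrtimers_prepare_cpu() is reduced to one-time setup (clock base pointers, the expiry lock) while the new hrtimers_cpu_starting() re-initializes the per-CPU state on the CPU itself every time it comes online, clearing leftovers from a previous offline cycle; the boot CPU simply calls both from hrtimers_init() as shown. The sketch below illustrates how such a prepare/online callback pair is commonly registered through the CPU hotplug API. It is illustrative only: this patch wires hrtimers_cpu_starting() through the static hotplug state table rather than a dynamically allocated state, and the names here are made up.

#include <linux/cpuhotplug.h>

static int example_prepare(unsigned int cpu)
{
	/* Runs on a control CPU before @cpu is brought up: allocate once. */
	return 0;
}

static int example_online(unsigned int cpu)
{
	/* Runs on @cpu itself once it is up: reset per-CPU state. */
	return 0;
}

static int __init example_hp_init(void)
{
	int ret;

	ret = cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "example:prepare",
				example_prepare, NULL);
	if (ret < 0)
		return ret;

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "example:online",
				example_online, NULL);
	return ret < 0 ? ret : 0;
}
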
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 881a9ce96af77..1b675aee99a98 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -538,7 +538,7 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
* When the reference count reaches zero, the timer is scheduled
* for RCU removal after the grace period.
*
- * Holding rcu_read_lock() accross the lookup ensures that
+ * Holding rcu_read_lock() across the lookup ensures that
* the timer cannot be freed.
*
* The lookup validates locklessly that timr::it_signal ==
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index ed58eebb4e8f4..0207868c8b4d2 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -1020,6 +1020,8 @@ static inline ktime_t tick_get_next_period(void)
/**
* tick_broadcast_setup_oneshot - setup the broadcast device
+ * @bc: the broadcast device
+ * @from_periodic: true if called from periodic mode
*/
static void tick_broadcast_setup_oneshot(struct clock_event_device *bc,
bool from_periodic)
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 3d128825d3437..1e67d076f1955 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -485,91 +485,30 @@ u64 notrace ktime_get_tai_fast_ns(void)
}
EXPORT_SYMBOL_GPL(ktime_get_tai_fast_ns);
-static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono)
+/**
+ * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime.
+ *
+ * See ktime_get_mono_fast_ns() for documentation of the time stamp ordering.
+ */
+u64 ktime_get_real_fast_ns(void)
{
+ struct tk_fast *tkf = &tk_fast_mono;
struct tk_read_base *tkr;
- u64 basem, baser, delta;
+ u64 baser, delta;
unsigned int seq;
do {
seq = raw_read_seqcount_latch(&tkf->seq);
tkr = tkf->base + (seq & 0x01);
- basem = ktime_to_ns(tkr->base);
baser = ktime_to_ns(tkr->base_real);
delta = timekeeping_get_ns(tkr);
} while (raw_read_seqcount_latch_retry(&tkf->seq, seq));
- if (mono)
- *mono = basem + delta;
return baser + delta;
}
-
-/**
- * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime.
- *
- * See ktime_get_mono_fast_ns() for documentation of the time stamp ordering.
- */
-u64 ktime_get_real_fast_ns(void)
-{
- return __ktime_get_real_fast(&tk_fast_mono, NULL);
-}
EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns);
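ktime_get_real_fast_ns() keeps the same latch-based retry loop but stops computing the monotonic value it never returned, since the only consumer of that side channel (ktime_get_fast_timestamps(), removed below) is gone. It remains the lock-free accessor intended for contexts that must not take the timekeeper seqlock; a tiny illustrative caller:

#include <linux/timekeeping.h>

/* Illustrative only: record a CLOCK_REALTIME timestamp from a context
 * where blocking on the timekeeper lock is not an option. */
static void example_record_timestamp(u64 *slot)
{
	*slot = ktime_get_real_fast_ns();
}
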
/**
- * ktime_get_fast_timestamps: - NMI safe timestamps
- * @snapshot: Pointer to timestamp storage
- *
- * Stores clock monotonic, boottime and realtime timestamps.
- *
- * Boot time is a racy access on 32bit systems if the sleep time injection
- * happens late during resume and not in timekeeping_resume(). That could
- * be avoided by expanding struct tk_read_base with boot offset for 32bit
- * and adding more overhead to the update. As this is a hard to observe
- * once per resume event which can be filtered with reasonable effort using
- * the accurate mono/real timestamps, it's probably not worth the trouble.
- *
- * Aside of that it might be possible on 32 and 64 bit to observe the
- * following when the sleep time injection happens late:
- *
- * CPU 0 CPU 1
- * timekeeping_resume()
- * ktime_get_fast_timestamps()
- * mono, real = __ktime_get_real_fast()
- * inject_sleep_time()
- * update boot offset
- * boot = mono + bootoffset;
- *
- * That means that boot time already has the sleep time adjustment, but
- * real time does not. On the next readout both are in sync again.
- *
- * Preventing this for 64bit is not really feasible without destroying the
- * careful cache layout of the timekeeper because the sequence count and
- * struct tk_read_base would then need two cache lines instead of one.
- *
- * Access to the time keeper clock source is disabled across the innermost
- * steps of suspend/resume. The accessors still work, but the timestamps
- * are frozen until time keeping is resumed which happens very early.
- *
- * For regular suspend/resume there is no observable difference vs. sched
- * clock, but it might affect some of the nasty low level debug printks.
- *
- * OTOH, access to sched clock is not guaranteed across suspend/resume on
- * all systems either so it depends on the hardware in use.
- *
- * If that turns out to be a real problem then this could be mitigated by
- * using sched clock in a similar way as during early boot. But it's not as
- * trivial as on early boot because it needs some careful protection
- * against the clock monotonic timestamp jumping backwards on resume.
- */
-void ktime_get_fast_timestamps(struct ktime_timestamps *snapshot)
-{
- struct timekeeper *tk = &tk_core.timekeeper;
-
- snapshot->real = __ktime_get_real_fast(&tk_fast_mono, &snapshot->mono);
- snapshot->boot = snapshot->mono + ktime_to_ns(data_race(tk->offs_boot));
-}
-
-/**
* halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
* @tk: Timekeeper to snapshot.
*
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index a5860bf6d16f9..c8f776dc6ee08 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -301,7 +301,7 @@ static int timer_migration_handler(const struct ctl_table *table, int write,
return ret;
}
-static struct ctl_table timer_sysctl[] = {
+static const struct ctl_table timer_sysctl[] = {
{
.procname = "timer_migration",
.data = &sysctl_timer_migration,
@@ -956,33 +956,29 @@ static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
{
int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL;
- struct timer_base *base;
-
- base = per_cpu_ptr(&timer_bases[index], cpu);
/*
* If the timer is deferrable and NO_HZ_COMMON is set then we need
* to use the deferrable base.
*/
if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
- base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
- return base;
+ index = BASE_DEF;
+
+ return per_cpu_ptr(&timer_bases[index], cpu);
}
static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
{
int index = tflags & TIMER_PINNED ? BASE_LOCAL : BASE_GLOBAL;
- struct timer_base *base;
-
- base = this_cpu_ptr(&timer_bases[index]);
/*
* If the timer is deferrable and NO_HZ_COMMON is set then we need
* to use the deferrable base.
*/
if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
- base = this_cpu_ptr(&timer_bases[BASE_DEF]);
- return base;
+ index = BASE_DEF;
+
+ return this_cpu_ptr(&timer_bases[index]);
}
static inline struct timer_base *get_timer_base(u32 tflags)
diff --git a/kernel/time/timer_migration.c b/kernel/time/timer_migration.c
index 8d57f7686bb03..2f6330831f084 100644
--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -534,8 +534,13 @@ static void __walk_groups(up_f up, struct tmigr_walk *data,
break;
child = group;
- group = group->parent;
+ /*
+ * Pairs with the store release on group connection
+ * to make sure group initialization is visible.
+ */
+ group = READ_ONCE(group->parent);
data->childmask = child->groupmask;
+ WARN_ON_ONCE(!data->childmask);
} while (group);
}
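The walk above now loads group->parent with READ_ONCE(), pairing with the smp_store_release() added to tmigr_connect_child_parent() later in this patch, so a CPU walking the hierarchy while it is being extended never observes a parent whose groupmask and child accounting are still being initialized. A generic sketch of this publish/consume pattern (not the tmigr structures; READ_ONCE()/smp_store_release() are the usual kernel barrier helpers):

struct example_node {
	struct example_node *parent;
	unsigned long groupmask;	/* must be valid before parent is seen */
};

static void example_publish(struct example_node *child,
			    struct example_node *parent)
{
	parent->groupmask = 1UL;			/* initialize first ... */
	smp_store_release(&child->parent, parent);	/* ... then publish */
}

static unsigned long example_walk_up(struct example_node *node)
{
	unsigned long depth = 0;

	/* Pairs with the release above: parent's fields are fully visible. */
	while ((node = READ_ONCE(node->parent)))
		depth++;
	return depth;
}
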
@@ -564,7 +569,7 @@ static struct tmigr_event *tmigr_next_groupevt(struct tmigr_group *group)
while ((node = timerqueue_getnext(&group->events))) {
evt = container_of(node, struct tmigr_event, nextevt);
- if (!evt->ignore) {
+ if (!READ_ONCE(evt->ignore)) {
WRITE_ONCE(group->next_expiry, evt->nextevt.expires);
return evt;
}
@@ -660,7 +665,7 @@ static bool tmigr_active_up(struct tmigr_group *group,
* lock is held while updating the ignore flag in idle path. So this
* state change will not be lost.
*/
- group->groupevt.ignore = true;
+ WRITE_ONCE(group->groupevt.ignore, true);
return walk_done;
}
@@ -721,6 +726,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
union tmigr_state childstate, groupstate;
bool remote = data->remote;
bool walk_done = false;
+ bool ignore;
u64 nextexp;
if (child) {
@@ -739,11 +745,19 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
nextexp = child->next_expiry;
evt = &child->groupevt;
- evt->ignore = (nextexp == KTIME_MAX) ? true : false;
+ /*
+ * This can race with concurrent idle exit (activate).
+ * If the current writer wins, a useless remote expiration may
+ * be scheduled. If the activate wins, the event is properly
+ * ignored.
+ */
+ ignore = (nextexp == KTIME_MAX) ? true : false;
+ WRITE_ONCE(evt->ignore, ignore);
} else {
nextexp = data->nextexp;
first_childevt = evt = data->evt;
+ ignore = evt->ignore;
/*
* Walking the hierarchy is required in any case when a
@@ -769,7 +783,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
* first event information of the group is updated properly and
* also handled properly, so skip this fast return path.
*/
- if (evt->ignore && !remote && group->parent)
+ if (ignore && !remote && group->parent)
return true;
raw_spin_lock(&group->lock);
@@ -783,7 +797,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
* queue when the expiry time changed only or when it could be ignored.
*/
if (timerqueue_node_queued(&evt->nextevt)) {
- if ((evt->nextevt.expires == nextexp) && !evt->ignore) {
+ if ((evt->nextevt.expires == nextexp) && !ignore) {
/* Make sure not to miss a new CPU event with the same expiry */
evt->cpu = first_childevt->cpu;
goto check_toplvl;
@@ -793,7 +807,7 @@ bool tmigr_update_events(struct tmigr_group *group, struct tmigr_group *child,
WRITE_ONCE(group->next_expiry, KTIME_MAX);
}
- if (evt->ignore) {
+ if (ignore) {
/*
* When the next child event could be ignored (nextexp is
* KTIME_MAX) and there was no remote timer handling before or
@@ -1487,6 +1501,21 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
s.seq = 0;
atomic_set(&group->migr_state, s.state);
+ /*
+ * If this is a new top-level, prepare its groupmask in advance.
+ * This avoids accidents where yet another new top-level is
+ * created in the future and made visible before the current groupmask.
+ */
+ if (list_empty(&tmigr_level_list[lvl])) {
+ group->groupmask = BIT(0);
+ /*
+ * The previous top level has prepared its groupmask already,
+ * simply account it as the first child.
+ */
+ if (lvl > 0)
+ group->num_children = 1;
+ }
+
timerqueue_init_head(&group->events);
timerqueue_init(&group->groupevt.nextevt);
group->groupevt.nextevt.expires = KTIME_MAX;
@@ -1550,8 +1579,25 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
raw_spin_lock_irq(&child->lock);
raw_spin_lock_nested(&parent->lock, SINGLE_DEPTH_NESTING);
- child->parent = parent;
- child->groupmask = BIT(parent->num_children++);
+ if (activate) {
+ /*
+ * @child is the old top and @parent the new one. In this
+ * case groupmask is pre-initialized and @child already
+ * accounted, along with its new sibling corresponding to the
+ * CPU going up.
+ */
+ WARN_ON_ONCE(child->groupmask != BIT(0) || parent->num_children != 2);
+ } else {
+ /* Adding @child for the CPU going up to @parent. */
+ child->groupmask = BIT(parent->num_children++);
+ }
+
+ /*
+ * Make sure parent initialization is visible before publishing it to a
+ * racing CPU entering/exiting idle. This RELEASE barrier enforces an
+ * address dependency that pairs with the READ_ONCE() in __walk_groups().
+ */
+ smp_store_release(&child->parent, parent);
raw_spin_unlock(&parent->lock);
raw_spin_unlock_irq(&child->lock);
@@ -1624,13 +1670,14 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
* be different from tmigr_hierarchy_levels, contains only a
* single group.
*/
- if (group->parent || i == tmigr_hierarchy_levels ||
- (list_empty(&tmigr_level_list[i]) &&
- list_is_singular(&tmigr_level_list[i - 1])))
+ if (group->parent || list_is_singular(&tmigr_level_list[i - 1]))
break;
} while (i < tmigr_hierarchy_levels);
+ /* Assert single root */
+ WARN_ON_ONCE(!err && !group->parent && !list_is_singular(&tmigr_level_list[top]));
+
while (i > 0) {
group = stack[--i];
@@ -1672,7 +1719,12 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
WARN_ON_ONCE(top == 0);
lvllist = &tmigr_level_list[top];
- if (group->num_children == 1 && list_is_singular(lvllist)) {
+
+ /*
+ * Newly created root level should have accounted the upcoming
+ * CPU's child group and pre-accounted the old root.
+ */
+ if (group->num_children == 2 && list_is_singular(lvllist)) {
/*
* The target CPU must never do the prepare work, except
* on early boot when the boot CPU is the target. Otherwise
diff --git a/kernel/time/timer_migration.h b/kernel/time/timer_migration.h
index 154accc7a543c..ae19f70f8170f 100644
--- a/kernel/time/timer_migration.h
+++ b/kernel/time/timer_migration.h
@@ -110,22 +110,19 @@ struct tmigr_cpu {
* union tmigr_state - state of tmigr_group
* @state: Combined version of the state - only used for atomic
* read/cmpxchg function
- * @struct: Split version of the state - only use the struct members to
+ * &anon struct: Split version of the state - only use the struct members to
* update information to stay independent of endianness
+ * @active: Contains each mask bit of the active children
+ * @migrator: Contains mask of the child which is migrator
+ * @seq: Sequence counter needs to be increased when an update
+ * to the tmigr_state is done. It prevents a race when
+ * updates in the child groups are propagated in changed
+ * order. Detailed information about the scenario is
+ *			given in the documentation at the beginning of
+ * timer_migration.c.
*/
union tmigr_state {
u32 state;
- /**
- * struct - split state of tmigr_group
- * @active: Contains each mask bit of the active children
- * @migrator: Contains mask of the child which is migrator
- * @seq: Sequence counter needs to be increased when an update
- * to the tmigr_state is done. It prevents a race when
- * updates in the child groups are propagated in changed
- * order. Detailed information about the scenario is
- * given in the documentation at the begin of
- * timer_migration.c.
- */
struct {
u8 active;
u8 migrator;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 74c2b1d43bb98..d570b8b9c0a9b 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -31,9 +31,14 @@ config HAVE_FUNCTION_GRAPH_TRACER
help
See Documentation/trace/ftrace-design.rst
-config HAVE_FUNCTION_GRAPH_RETVAL
+config HAVE_FUNCTION_GRAPH_FREGS
bool
+config HAVE_FTRACE_GRAPH_FUNC
+ bool
+ help
+ True if ftrace_graph_func() is defined.
+
config HAVE_DYNAMIC_FTRACE
bool
help
@@ -57,6 +62,12 @@ config HAVE_DYNAMIC_FTRACE_WITH_ARGS
This allows for use of ftrace_regs_get_argument() and
ftrace_regs_get_stack_pointer().
+config HAVE_FTRACE_REGS_HAVING_PT_REGS
+ bool
+ help
+	  If this is set, ftrace_regs contains pt_regs, so it can be
+	  converted to pt_regs without allocating memory.
+
config HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
bool
help
@@ -232,7 +243,7 @@ config FUNCTION_GRAPH_TRACER
config FUNCTION_GRAPH_RETVAL
bool "Kernel Function Graph Return Value"
- depends on HAVE_FUNCTION_GRAPH_RETVAL
+ depends on HAVE_FUNCTION_GRAPH_FREGS
depends on FUNCTION_GRAPH_TRACER
default n
help
@@ -296,10 +307,9 @@ config DYNAMIC_FTRACE_WITH_ARGS
config FPROBE
bool "Kernel Function Probe (fprobe)"
- depends on FUNCTION_TRACER
- depends on DYNAMIC_FTRACE_WITH_REGS
- depends on HAVE_RETHOOK
- select RETHOOK
+ depends on HAVE_FUNCTION_GRAPH_FREGS && HAVE_FTRACE_GRAPH_FUNC
+ depends on DYNAMIC_FTRACE_WITH_ARGS
+ select FUNCTION_GRAPH_TRACER
default n
help
This option enables kernel function probe (fprobe) based on ftrace.
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 8fd292d34d898..3679a6d189346 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -617,8 +617,9 @@ err:
return ret;
}
-static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
- struct block_device *bdev, char __user *arg)
+int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+ struct block_device *bdev,
+ char __user *arg)
{
struct blk_user_trace_setup buts;
int ret;
@@ -627,29 +628,18 @@ static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
if (ret)
return -EFAULT;
+ mutex_lock(&q->debugfs_mutex);
ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
+ mutex_unlock(&q->debugfs_mutex);
if (ret)
return ret;
if (copy_to_user(arg, &buts, sizeof(buts))) {
- __blk_trace_remove(q);
+ blk_trace_remove(q);
return -EFAULT;
}
return 0;
}
-
-int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
- struct block_device *bdev,
- char __user *arg)
-{
- int ret;
-
- mutex_lock(&q->debugfs_mutex);
- ret = __blk_trace_setup(q, name, dev, bdev, arg);
- mutex_unlock(&q->debugfs_mutex);
-
- return ret;
-}
EXPORT_SYMBOL_GPL(blk_trace_setup);
#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
@@ -673,12 +663,14 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
.pid = cbuts.pid,
};
+ mutex_lock(&q->debugfs_mutex);
ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
+ mutex_unlock(&q->debugfs_mutex);
if (ret)
return ret;
if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) {
- __blk_trace_remove(q);
+ blk_trace_remove(q);
return -EFAULT;
}
@@ -732,12 +724,10 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
int ret, start = 0;
char b[BDEVNAME_SIZE];
- mutex_lock(&q->debugfs_mutex);
-
switch (cmd) {
case BLKTRACESETUP:
snprintf(b, sizeof(b), "%pg", bdev);
- ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
+ ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
break;
#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
case BLKTRACESETUP32:
@@ -749,17 +739,15 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
start = 1;
fallthrough;
case BLKTRACESTOP:
- ret = __blk_trace_startstop(q, start);
+ ret = blk_trace_startstop(q, start);
break;
case BLKTRACETEARDOWN:
- ret = __blk_trace_remove(q);
+ ret = blk_trace_remove(q);
break;
default:
ret = -ENOTTY;
break;
}
-
- mutex_unlock(&q->debugfs_mutex);
return ret;
}
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 1b8db5aee9d38..adc947587eb81 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -357,17 +357,6 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = {
.arg3_type = ARG_CONST_SIZE,
};
-static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
-{
- if (!capable(CAP_SYS_ADMIN))
- return NULL;
-
- pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!",
- current->comm, task_pid_nr(current));
-
- return &bpf_probe_write_user_proto;
-}
-
#define MAX_TRACE_PRINTK_VARARGS 3
#define BPF_TRACE_PRINTK_SIZE 1024
@@ -619,7 +608,8 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
static __always_inline u64
__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
- u64 flags, struct perf_sample_data *sd)
+ u64 flags, struct perf_raw_record *raw,
+ struct perf_sample_data *sd)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
unsigned int cpu = smp_processor_id();
@@ -644,6 +634,8 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
if (unlikely(event->oncpu != cpu))
return -EOPNOTSUPP;
+ perf_sample_save_raw_data(sd, event, raw);
+
return perf_event_output(event, sd, regs);
}
@@ -687,9 +679,8 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
}
perf_sample_data_init(sd, 0, 0);
- perf_sample_save_raw_data(sd, &raw);
- err = __bpf_perf_event_output(regs, map, flags, sd);
+ err = __bpf_perf_event_output(regs, map, flags, &raw, sd);
out:
this_cpu_dec(bpf_trace_nest_level);
preempt_enable();
@@ -748,9 +739,8 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
perf_fetch_caller_regs(regs);
perf_sample_data_init(sd, 0, 0);
- perf_sample_save_raw_data(sd, &raw);
- ret = __bpf_perf_event_output(regs, map, flags, sd);
+ ret = __bpf_perf_event_output(regs, map, flags, &raw, sd);
out:
this_cpu_dec(bpf_event_output_nest_level);
preempt_enable();
@@ -853,7 +843,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type, struct task_struc
if (unlikely(is_global_init(task)))
return -EPERM;
- if (irqs_disabled()) {
+ if (!preemptible()) {
/* Do an early check on signal validity. Otherwise,
* the error is lost in deferred irq_work.
*/
@@ -1444,6 +1434,8 @@ late_initcall(bpf_key_sig_kfuncs_init);
static const struct bpf_func_proto *
bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
+ const struct bpf_func_proto *func_proto;
+
switch (func_id) {
case BPF_FUNC_map_lookup_elem:
return &bpf_map_lookup_elem_proto;
@@ -1485,9 +1477,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_perf_event_read_proto;
case BPF_FUNC_get_prandom_u32:
return &bpf_get_prandom_u32_proto;
- case BPF_FUNC_probe_write_user:
- return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ?
- NULL : bpf_get_probe_write_proto();
case BPF_FUNC_probe_read_user:
return &bpf_probe_read_user_proto;
case BPF_FUNC_probe_read_kernel:
@@ -1566,7 +1555,22 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_trace_vprintk:
return bpf_get_trace_vprintk_proto();
default:
- return bpf_base_func_proto(func_id, prog);
+ break;
+ }
+
+ func_proto = bpf_base_func_proto(func_id, prog);
+ if (func_proto)
+ return func_proto;
+
+ if (!bpf_token_capable(prog->aux->token, CAP_SYS_ADMIN))
+ return NULL;
+
+ switch (func_id) {
+ case BPF_FUNC_probe_write_user:
+ return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ?
+ NULL : &bpf_probe_write_user_proto;
+ default:
+ return NULL;
}
}
@@ -2242,6 +2246,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
{
struct bpf_prog_array *old_array;
struct bpf_prog_array *new_array;
+ struct bpf_prog *prog = NULL;
int ret;
mutex_lock(&bpf_event_mutex);
@@ -2262,18 +2267,22 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
}
put:
- /*
- * It could be that the bpf_prog is not sleepable (and will be freed
- * via normal RCU), but is called from a point that supports sleepable
- * programs and uses tasks-trace-RCU.
- */
- synchronize_rcu_tasks_trace();
-
- bpf_prog_put(event->prog);
+ prog = event->prog;
event->prog = NULL;
unlock:
mutex_unlock(&bpf_event_mutex);
+
+ if (prog) {
+ /*
+ * It could be that the bpf_prog is not sleepable (and will be freed
+ * via normal RCU), but is called from a point that supports sleepable
+ * programs and uses tasks-trace-RCU.
+ */
+ synchronize_rcu_tasks_trace();
+
+ bpf_prog_put(prog);
+ }
}
int perf_event_query_prog_array(struct perf_event *event, void __user *info)
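perf_event_detach_bpf_prog() now only unlinks the program while holding bpf_event_mutex and defers the expensive synchronize_rcu_tasks_trace() plus the final bpf_prog_put() until after the mutex is dropped, so concurrent attach/detach callers no longer wait behind a tasks-trace grace period. A generic sketch of that unlink-under-the-lock, tear-down-after-unlock shape (names are illustrative):

#include <linux/mutex.h>

struct example_res { int users; };
struct example_obj { struct example_res *res; };

static DEFINE_MUTEX(example_lock);

static void example_wait_for_users(struct example_res *res) { /* may block long */ }
static void example_free(struct example_res *res) { }

static void example_detach(struct example_obj *obj)
{
	struct example_res *res;

	mutex_lock(&example_lock);
	res = obj->res;			/* unlink while holding the lock */
	obj->res = NULL;
	mutex_unlock(&example_lock);

	if (res) {
		example_wait_for_users(res);	/* grace period, lock already dropped */
		example_free(res);
	}
}
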
@@ -2584,6 +2593,20 @@ struct user_syms {
char *buf;
};
+#ifndef CONFIG_HAVE_FTRACE_REGS_HAVING_PT_REGS
+static DEFINE_PER_CPU(struct pt_regs, bpf_kprobe_multi_pt_regs);
+#define bpf_kprobe_multi_pt_regs_ptr() this_cpu_ptr(&bpf_kprobe_multi_pt_regs)
+#else
+#define bpf_kprobe_multi_pt_regs_ptr() (NULL)
+#endif
+
+static unsigned long ftrace_get_entry_ip(unsigned long fentry_ip)
+{
+ unsigned long ip = ftrace_get_symaddr(fentry_ip);
+
+ return ip ? : fentry_ip;
+}
+
static int copy_user_syms(struct user_syms *us, unsigned long __user *usyms, u32 cnt)
{
unsigned long __user usymbol;
@@ -2778,7 +2801,7 @@ static u64 bpf_kprobe_multi_entry_ip(struct bpf_run_ctx *ctx)
static int
kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
- unsigned long entry_ip, struct pt_regs *regs,
+ unsigned long entry_ip, struct ftrace_regs *fregs,
bool is_return, void *data)
{
struct bpf_kprobe_multi_run_ctx run_ctx = {
@@ -2790,16 +2813,18 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
.entry_ip = entry_ip,
};
struct bpf_run_ctx *old_run_ctx;
+ struct pt_regs *regs;
int err;
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
bpf_prog_inc_misses_counter(link->link.prog);
- err = 0;
+ err = 1;
goto out;
}
migrate_disable();
rcu_read_lock();
+ regs = ftrace_partial_regs(fregs, bpf_kprobe_multi_pt_regs_ptr());
old_run_ctx = bpf_set_run_ctx(&run_ctx.session_ctx.run_ctx);
err = bpf_prog_run(link->link.prog, regs);
bpf_reset_run_ctx(old_run_ctx);
@@ -2813,26 +2838,28 @@ kprobe_multi_link_prog_run(struct bpf_kprobe_multi_link *link,
static int
kprobe_multi_link_handler(struct fprobe *fp, unsigned long fentry_ip,
- unsigned long ret_ip, struct pt_regs *regs,
+ unsigned long ret_ip, struct ftrace_regs *fregs,
void *data)
{
struct bpf_kprobe_multi_link *link;
int err;
link = container_of(fp, struct bpf_kprobe_multi_link, fp);
- err = kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs, false, data);
+ err = kprobe_multi_link_prog_run(link, ftrace_get_entry_ip(fentry_ip),
+ fregs, false, data);
return is_kprobe_session(link->link.prog) ? err : 0;
}
static void
kprobe_multi_link_exit_handler(struct fprobe *fp, unsigned long fentry_ip,
- unsigned long ret_ip, struct pt_regs *regs,
+ unsigned long ret_ip, struct ftrace_regs *fregs,
void *data)
{
struct bpf_kprobe_multi_link *link;
link = container_of(fp, struct bpf_kprobe_multi_link, fp);
- kprobe_multi_link_prog_run(link, get_entry_ip(fentry_ip), regs, true, data);
+ kprobe_multi_link_prog_run(link, ftrace_get_entry_ip(fentry_ip),
+ fregs, true, data);
}
static int symbols_cmp_r(const void *a, const void *b, const void *priv)
diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c
index 0bf78517b5d4c..5dddfc2149f62 100644
--- a/kernel/trace/fgraph.c
+++ b/kernel/trace/fgraph.c
@@ -292,13 +292,15 @@ static inline unsigned long make_data_type_val(int idx, int size, int offset)
}
/* ftrace_graph_entry set to this to tell some archs to run function graph */
-static int entry_run(struct ftrace_graph_ent *trace, struct fgraph_ops *ops)
+static int entry_run(struct ftrace_graph_ent *trace, struct fgraph_ops *ops,
+ struct ftrace_regs *fregs)
{
return 0;
}
/* ftrace_graph_return set to this to tell some archs to run function graph */
-static void return_run(struct ftrace_graph_ret *trace, struct fgraph_ops *ops)
+static void return_run(struct ftrace_graph_ret *trace, struct fgraph_ops *ops,
+ struct ftrace_regs *fregs)
{
}
@@ -520,13 +522,15 @@ int __weak ftrace_disable_ftrace_graph_caller(void)
#endif
int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace,
- struct fgraph_ops *gops)
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
return 0;
}
static void ftrace_graph_ret_stub(struct ftrace_graph_ret *trace,
- struct fgraph_ops *gops)
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
}
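Every fgraph entry and return callback gains a third struct ftrace_regs * argument, as the updated stubs above show. A minimal sketch of an fgraph_ops user adapted to the new prototypes; only the callback signatures are taken from this patch, the rest (including how the ops is registered) is illustrative:

#include <linux/ftrace.h>

static int example_graph_entry(struct ftrace_graph_ent *trace,
			       struct fgraph_ops *gops,
			       struct ftrace_regs *fregs)
{
	/* Non-zero means: also trace this function's return. */
	return 1;
}

static void example_graph_return(struct ftrace_graph_ret *trace,
				 struct fgraph_ops *gops,
				 struct ftrace_regs *fregs)
{
	/* On HAVE_FUNCTION_GRAPH_FREGS archs, fregs carries the return value. */
}

static struct fgraph_ops example_gops = {
	.entryfunc	= example_graph_entry,
	.retfunc	= example_graph_return,
};
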
@@ -644,14 +648,20 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func,
#endif
/* If the caller does not use ftrace, call this function. */
-int function_graph_enter(unsigned long ret, unsigned long func,
- unsigned long frame_pointer, unsigned long *retp)
+int function_graph_enter_regs(unsigned long ret, unsigned long func,
+ unsigned long frame_pointer, unsigned long *retp,
+ struct ftrace_regs *fregs)
{
struct ftrace_graph_ent trace;
unsigned long bitmap = 0;
int offset;
+ int bit;
int i;
+ bit = ftrace_test_recursion_trylock(func, ret);
+ if (bit < 0)
+ return -EBUSY;
+
trace.func = func;
trace.depth = ++current->curr_ret_depth;
@@ -663,7 +673,7 @@ int function_graph_enter(unsigned long ret, unsigned long func,
if (static_branch_likely(&fgraph_do_direct)) {
int save_curr_ret_stack = current->curr_ret_stack;
- if (static_call(fgraph_func)(&trace, fgraph_direct_gops))
+ if (static_call(fgraph_func)(&trace, fgraph_direct_gops, fregs))
bitmap |= BIT(fgraph_direct_gops->idx);
else
/* Clear out any saved storage */
@@ -681,7 +691,7 @@ int function_graph_enter(unsigned long ret, unsigned long func,
save_curr_ret_stack = current->curr_ret_stack;
if (ftrace_ops_test(&gops->ops, func, NULL) &&
- gops->entryfunc(&trace, gops))
+ gops->entryfunc(&trace, gops, fregs))
bitmap |= BIT(i);
else
/* Clear out any saved storage */
@@ -697,12 +707,13 @@ int function_graph_enter(unsigned long ret, unsigned long func,
* flag, set that bit always.
*/
set_bitmap(current, offset, bitmap | BIT(0));
-
+ ftrace_test_recursion_unlock(bit);
return 0;
out_ret:
current->curr_ret_stack -= FGRAPH_FRAME_OFFSET + 1;
out:
current->curr_ret_depth--;
+ ftrace_test_recursion_unlock(bit);
return -EBUSY;
}
@@ -792,15 +803,12 @@ static struct notifier_block ftrace_suspend_notifier = {
.notifier_call = ftrace_suspend_notifier_call,
};
-/* fgraph_ret_regs is not defined without CONFIG_FUNCTION_GRAPH_RETVAL */
-struct fgraph_ret_regs;
-
/*
* Send the trace to the ring-buffer.
* @return the original return address.
*/
-static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs,
- unsigned long frame_pointer)
+static inline unsigned long
+__ftrace_return_to_handler(struct ftrace_regs *fregs, unsigned long frame_pointer)
{
struct ftrace_ret_stack *ret_stack;
struct ftrace_graph_ret trace;
@@ -818,9 +826,11 @@ static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs
return (unsigned long)panic;
}
- trace.rettime = trace_clock_local();
+ if (fregs)
+ ftrace_regs_set_instruction_pointer(fregs, ret);
+
#ifdef CONFIG_FUNCTION_GRAPH_RETVAL
- trace.retval = fgraph_ret_regs_return_value(ret_regs);
+ trace.retval = ftrace_regs_get_return_value(fregs);
#endif
bitmap = get_bitmap_bits(current, offset);
@@ -828,17 +838,17 @@ static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs
#ifdef CONFIG_HAVE_STATIC_CALL
if (static_branch_likely(&fgraph_do_direct)) {
if (test_bit(fgraph_direct_gops->idx, &bitmap))
- static_call(fgraph_retfunc)(&trace, fgraph_direct_gops);
+ static_call(fgraph_retfunc)(&trace, fgraph_direct_gops, fregs);
} else
#endif
{
for_each_set_bit(i, &bitmap, sizeof(bitmap) * BITS_PER_BYTE) {
- struct fgraph_ops *gops = fgraph_array[i];
+ struct fgraph_ops *gops = READ_ONCE(fgraph_array[i]);
if (gops == &fgraph_stub)
continue;
- gops->retfunc(&trace, gops);
+ gops->retfunc(&trace, gops, fregs);
}
}
@@ -855,14 +865,14 @@ static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs
}
/*
- * After all architecures have selected HAVE_FUNCTION_GRAPH_RETVAL, we can
- * leave only ftrace_return_to_handler(ret_regs).
+ * After all architectures have selected HAVE_FUNCTION_GRAPH_FREGS, we can
+ * leave only ftrace_return_to_handler(fregs).
*/
-#ifdef CONFIG_HAVE_FUNCTION_GRAPH_RETVAL
-unsigned long ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs)
+#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FREGS
+unsigned long ftrace_return_to_handler(struct ftrace_regs *fregs)
{
- return __ftrace_return_to_handler(ret_regs,
- fgraph_ret_regs_frame_pointer(ret_regs));
+ return __ftrace_return_to_handler(fregs,
+ ftrace_regs_get_frame_pointer(fregs));
}
#else
unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
@@ -1010,7 +1020,8 @@ void ftrace_graph_sleep_time_control(bool enable)
* Simply points to ftrace_stub, but with the proper protocol.
* Defined by the linker script in linux/vmlinux.lds.h
*/
-void ftrace_stub_graph(struct ftrace_graph_ret *trace, struct fgraph_ops *gops);
+void ftrace_stub_graph(struct ftrace_graph_ret *trace, struct fgraph_ops *gops,
+ struct ftrace_regs *fregs);
/* The callbacks that hook a function */
trace_func_graph_ret_t ftrace_graph_return = ftrace_stub_graph;
@@ -1174,7 +1185,8 @@ void ftrace_graph_exit_task(struct task_struct *t)
#ifdef CONFIG_DYNAMIC_FTRACE
static int fgraph_pid_func(struct ftrace_graph_ent *trace,
- struct fgraph_ops *gops)
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
struct trace_array *tr = gops->ops.private;
int pid;
@@ -1188,7 +1200,7 @@ static int fgraph_pid_func(struct ftrace_graph_ent *trace,
return 0;
}
- return gops->saved_func(trace, gops);
+ return gops->saved_func(trace, gops, fregs);
}
void fgraph_update_pid_func(void)
@@ -1215,7 +1227,7 @@ void fgraph_update_pid_func(void)
static int start_graph_tracing(void)
{
unsigned long **ret_stack_list;
- int ret;
+ int ret, cpu;
ret_stack_list = kcalloc(FTRACE_RETSTACK_ALLOC_SIZE,
sizeof(*ret_stack_list), GFP_KERNEL);
@@ -1223,6 +1235,12 @@ static int start_graph_tracing(void)
if (!ret_stack_list)
return -ENOMEM;
+ /* The cpu_boot init_task->ret_stack will never be freed */
+ for_each_online_cpu(cpu) {
+ if (!idle_task(cpu)->ret_stack)
+ ftrace_graph_init_idle_task(idle_task(cpu), cpu);
+ }
+
do {
ret = alloc_retstack_tasklist(ret_stack_list);
} while (ret == -EAGAIN);
diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c
index 9ff0182458408..33082c4e8154e 100644
--- a/kernel/trace/fprobe.c
+++ b/kernel/trace/fprobe.c
@@ -8,98 +8,224 @@
#include <linux/fprobe.h>
#include <linux/kallsyms.h>
#include <linux/kprobes.h>
-#include <linux/rethook.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/sort.h>
+#include <asm/fprobe.h>
+
#include "trace.h"
-struct fprobe_rethook_node {
- struct rethook_node node;
- unsigned long entry_ip;
- unsigned long entry_parent_ip;
- char data[];
-};
+#define FPROBE_IP_HASH_BITS 8
+#define FPROBE_IP_TABLE_SIZE (1 << FPROBE_IP_HASH_BITS)
-static inline void __fprobe_handler(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *ops, struct ftrace_regs *fregs)
-{
- struct fprobe_rethook_node *fpr;
- struct rethook_node *rh = NULL;
- struct fprobe *fp;
- void *entry_data = NULL;
- int ret = 0;
+#define FPROBE_HASH_BITS 6
+#define FPROBE_TABLE_SIZE (1 << FPROBE_HASH_BITS)
- fp = container_of(ops, struct fprobe, ops);
+#define SIZE_IN_LONG(x) ((x + sizeof(long) - 1) >> (sizeof(long) == 8 ? 3 : 2))
- if (fp->exit_handler) {
- rh = rethook_try_get(fp->rethook);
- if (!rh) {
- fp->nmissed++;
- return;
- }
- fpr = container_of(rh, struct fprobe_rethook_node, node);
- fpr->entry_ip = ip;
- fpr->entry_parent_ip = parent_ip;
- if (fp->entry_data_size)
- entry_data = fpr->data;
+/*
+ * fprobe_table: hold 'fprobe_hlist::hlist' for checking the fprobe still
+ * exists. The key is the address of fprobe instance.
+ * fprobe_ip_table: hold 'fprobe_hlist::array[*]' for searching the fprobe
+ * instance related to the funciton address. The key is the ftrace IP
+ * address.
+ *
+ * When unregistering the fprobe, fprobe_hlist::fp and fprobe_hlist::array[*].fp
+ * are set to NULL and deleted from both hash tables (by hlist_del_rcu()).
+ * After an RCU grace period, the fprobe_hlist itself will be released.
+ *
+ * fprobe_table and fprobe_ip_table can be accessed from either
+ * - Normal hlist traversal and RCU add/del while 'fprobe_mutex' is held.
+ * - RCU hlist traversal with preemption disabled
+ */
+static struct hlist_head fprobe_table[FPROBE_TABLE_SIZE];
+static struct hlist_head fprobe_ip_table[FPROBE_IP_TABLE_SIZE];
+static DEFINE_MUTEX(fprobe_mutex);
+
+/*
+ * Find first fprobe in the hlist. It will be iterated twice in the entry
+ * probe, once for correcting the total required size, the second time is
+ * calling back the user handlers.
+ * Thus the hlist in the fprobe_table must be sorted and new probe needs to
+ * be added *before* the first fprobe.
+ */
+static struct fprobe_hlist_node *find_first_fprobe_node(unsigned long ip)
+{
+ struct fprobe_hlist_node *node;
+ struct hlist_head *head;
+
+ head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)];
+ hlist_for_each_entry_rcu(node, head, hlist,
+ lockdep_is_held(&fprobe_mutex)) {
+ if (node->addr == ip)
+ return node;
}
+ return NULL;
+}
+NOKPROBE_SYMBOL(find_first_fprobe_node);
+
+/* Node insertion and deletion requires the fprobe_mutex */
+static void insert_fprobe_node(struct fprobe_hlist_node *node)
+{
+ unsigned long ip = node->addr;
+ struct fprobe_hlist_node *next;
+ struct hlist_head *head;
- if (fp->entry_handler)
- ret = fp->entry_handler(fp, ip, parent_ip, ftrace_get_regs(fregs), entry_data);
+ lockdep_assert_held(&fprobe_mutex);
- /* If entry_handler returns !0, nmissed is not counted. */
- if (rh) {
- if (ret)
- rethook_recycle(rh);
- else
- rethook_hook(rh, ftrace_get_regs(fregs), true);
+ next = find_first_fprobe_node(ip);
+ if (next) {
+ hlist_add_before_rcu(&node->hlist, &next->hlist);
+ return;
}
+ head = &fprobe_ip_table[hash_ptr((void *)ip, FPROBE_IP_HASH_BITS)];
+ hlist_add_head_rcu(&node->hlist, head);
}
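insert_fprobe_node() adds a new node immediately before the first existing node with the same address (or at the head of the bucket when there is none), which keeps all probes on one function contiguous: the entry handler can then start at find_first_fprobe_node() and walk with hlist_for_each_entry_from_rcu() until the address changes. A standalone sketch of that invariant on a plain singly linked list (illustrative only; the kernel code uses the RCU hlist variants):

#include <stddef.h>

struct ent {
	unsigned long key;
	struct ent *next;
};

static struct ent *find_first(struct ent *head, unsigned long key)
{
	for (struct ent *e = head; e; e = e->next)
		if (e->key == key)
			return e;
	return NULL;
}

static struct ent *insert(struct ent *head, struct ent *n)
{
	struct ent *first = find_first(head, n->key);
	struct ent **pp = &head;

	if (first) {
		while (*pp != first)
			pp = &(*pp)->next;
	}
	/* Before the first match, or at the list head when there is none. */
	n->next = *pp;
	*pp = n;
	return head;
}
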
-static void fprobe_handler(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *ops, struct ftrace_regs *fregs)
+/* Return true if there are synonyms */
+static bool delete_fprobe_node(struct fprobe_hlist_node *node)
{
- struct fprobe *fp;
- int bit;
+ lockdep_assert_held(&fprobe_mutex);
- fp = container_of(ops, struct fprobe, ops);
- if (fprobe_disabled(fp))
- return;
+ WRITE_ONCE(node->fp, NULL);
+ hlist_del_rcu(&node->hlist);
+ return !!find_first_fprobe_node(node->addr);
+}
- /* recursion detection has to go before any traceable function and
- * all functions before this point should be marked as notrace
- */
- bit = ftrace_test_recursion_trylock(ip, parent_ip);
- if (bit < 0) {
- fp->nmissed++;
- return;
+/* Check existence of the fprobe */
+static bool is_fprobe_still_exist(struct fprobe *fp)
+{
+ struct hlist_head *head;
+ struct fprobe_hlist *fph;
+
+ head = &fprobe_table[hash_ptr(fp, FPROBE_HASH_BITS)];
+ hlist_for_each_entry_rcu(fph, head, hlist,
+ lockdep_is_held(&fprobe_mutex)) {
+ if (fph->fp == fp)
+ return true;
}
- __fprobe_handler(ip, parent_ip, ops, fregs);
- ftrace_test_recursion_unlock(bit);
+ return false;
+}
+NOKPROBE_SYMBOL(is_fprobe_still_exist);
+
+static int add_fprobe_hash(struct fprobe *fp)
+{
+ struct fprobe_hlist *fph = fp->hlist_array;
+ struct hlist_head *head;
+
+ lockdep_assert_held(&fprobe_mutex);
+
+ if (WARN_ON_ONCE(!fph))
+ return -EINVAL;
+
+ if (is_fprobe_still_exist(fp))
+ return -EEXIST;
+
+ head = &fprobe_table[hash_ptr(fp, FPROBE_HASH_BITS)];
+ hlist_add_head_rcu(&fp->hlist_array->hlist, head);
+ return 0;
+}
+
+static int del_fprobe_hash(struct fprobe *fp)
+{
+ struct fprobe_hlist *fph = fp->hlist_array;
+
+ lockdep_assert_held(&fprobe_mutex);
+
+ if (WARN_ON_ONCE(!fph))
+ return -EINVAL;
+
+ if (!is_fprobe_still_exist(fp))
+ return -ENOENT;
+
+ fph->fp = NULL;
+ hlist_del_rcu(&fph->hlist);
+ return 0;
+}
+
+#ifdef ARCH_DEFINE_ENCODE_FPROBE_HEADER
+
+/* The arch should encode fprobe_header info into one unsigned long */
+#define FPROBE_HEADER_SIZE_IN_LONG 1
+
+static inline bool write_fprobe_header(unsigned long *stack,
+ struct fprobe *fp, unsigned int size_words)
+{
+ if (WARN_ON_ONCE(size_words > MAX_FPROBE_DATA_SIZE_WORD ||
+ !arch_fprobe_header_encodable(fp)))
+ return false;
+ *stack = arch_encode_fprobe_header(fp, size_words);
+ return true;
}
-NOKPROBE_SYMBOL(fprobe_handler);
-static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
- struct ftrace_ops *ops, struct ftrace_regs *fregs)
+static inline void read_fprobe_header(unsigned long *stack,
+ struct fprobe **fp, unsigned int *size_words)
{
+ *fp = arch_decode_fprobe_header_fp(*stack);
+ *size_words = arch_decode_fprobe_header_size(*stack);
+}
+
+#else
+
+/* Generic fprobe_header */
+struct __fprobe_header {
struct fprobe *fp;
- int bit;
+ unsigned long size_words;
+} __packed;
- fp = container_of(ops, struct fprobe, ops);
- if (fprobe_disabled(fp))
- return;
+#define FPROBE_HEADER_SIZE_IN_LONG SIZE_IN_LONG(sizeof(struct __fprobe_header))
- /* recursion detection has to go before any traceable function and
- * all functions called before this point should be marked as notrace
- */
- bit = ftrace_test_recursion_trylock(ip, parent_ip);
- if (bit < 0) {
- fp->nmissed++;
- return;
- }
+static inline bool write_fprobe_header(unsigned long *stack,
+ struct fprobe *fp, unsigned int size_words)
+{
+ struct __fprobe_header *fph = (struct __fprobe_header *)stack;
+
+ if (WARN_ON_ONCE(size_words > MAX_FPROBE_DATA_SIZE_WORD))
+ return false;
+
+ fph->fp = fp;
+ fph->size_words = size_words;
+ return true;
+}
+
+static inline void read_fprobe_header(unsigned long *stack,
+ struct fprobe **fp, unsigned int *size_words)
+{
+ struct __fprobe_header *fph = (struct __fprobe_header *)stack;
+
+ *fp = fph->fp;
+ *size_words = fph->size_words;
+}
+
+#endif
+
+/*
+ * fprobe shadow stack management:
+ * Since fprobe shares a single fgraph_ops, it needs to share the stack entry
+ * among the probes on the same function exit. Note that since a new probe can
+ * be registered before a target function returns, we cannot use the hash
+ * table to find the corresponding probes. Thus the probe address is stored on
+ * the shadow stack with its entry data size.
+ *
+ */
+static inline int __fprobe_handler(unsigned long ip, unsigned long parent_ip,
+ struct fprobe *fp, struct ftrace_regs *fregs,
+ void *data)
+{
+ if (!fp->entry_handler)
+ return 0;
+ return fp->entry_handler(fp, ip, parent_ip, fregs, data);
+}
+
+static inline int __fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
+ struct fprobe *fp, struct ftrace_regs *fregs,
+ void *data)
+{
+ int ret;
/*
* This user handler is shared with other kprobes and is not expected to be
* called recursively. So if any other kprobe handler is running, this will
@@ -108,44 +234,182 @@ static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip,
*/
if (unlikely(kprobe_running())) {
fp->nmissed++;
- goto recursion_unlock;
+ return 0;
}
kprobe_busy_begin();
- __fprobe_handler(ip, parent_ip, ops, fregs);
+ ret = __fprobe_handler(ip, parent_ip, fp, fregs, data);
kprobe_busy_end();
-
-recursion_unlock:
- ftrace_test_recursion_unlock(bit);
+ return ret;
}
-static void fprobe_exit_handler(struct rethook_node *rh, void *data,
- unsigned long ret_ip, struct pt_regs *regs)
+static int fprobe_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
- struct fprobe *fp = (struct fprobe *)data;
- struct fprobe_rethook_node *fpr;
- int bit;
+ struct fprobe_hlist_node *node, *first;
+ unsigned long *fgraph_data = NULL;
+ unsigned long func = trace->func;
+ unsigned long ret_ip;
+ int reserved_words;
+ struct fprobe *fp;
+ int used, ret;
- if (!fp || fprobe_disabled(fp))
- return;
+ if (WARN_ON_ONCE(!fregs))
+ return 0;
+
+ first = node = find_first_fprobe_node(func);
+ if (unlikely(!first))
+ return 0;
- fpr = container_of(rh, struct fprobe_rethook_node, node);
+ reserved_words = 0;
+ hlist_for_each_entry_from_rcu(node, hlist) {
+ if (node->addr != func)
+ break;
+ fp = READ_ONCE(node->fp);
+ if (!fp || !fp->exit_handler)
+ continue;
+ /*
+ * Since fprobe can be enabled until the next loop, we ignore the
+ * fprobe's disabled flag in this loop.
+ */
+ reserved_words +=
+ FPROBE_HEADER_SIZE_IN_LONG + SIZE_IN_LONG(fp->entry_data_size);
+ }
+ node = first;
+ if (reserved_words) {
+ fgraph_data = fgraph_reserve_data(gops->idx, reserved_words * sizeof(long));
+ if (unlikely(!fgraph_data)) {
+ hlist_for_each_entry_from_rcu(node, hlist) {
+ if (node->addr != func)
+ break;
+ fp = READ_ONCE(node->fp);
+ if (fp && !fprobe_disabled(fp))
+ fp->nmissed++;
+ }
+ return 0;
+ }
+ }
/*
- * we need to assure no calls to traceable functions in-between the
- * end of fprobe_handler and the beginning of fprobe_exit_handler.
+ * TODO: recursion detection has been done in the fgraph. Thus we need
+ * to add a callback to increment the missed counter.
*/
- bit = ftrace_test_recursion_trylock(fpr->entry_ip, fpr->entry_parent_ip);
- if (bit < 0) {
- fp->nmissed++;
+ ret_ip = ftrace_regs_get_return_address(fregs);
+ used = 0;
+ hlist_for_each_entry_from_rcu(node, hlist) {
+ int data_size;
+ void *data;
+
+ if (node->addr != func)
+ break;
+ fp = READ_ONCE(node->fp);
+ if (!fp || fprobe_disabled(fp))
+ continue;
+
+ data_size = fp->entry_data_size;
+ if (data_size && fp->exit_handler)
+ data = fgraph_data + used + FPROBE_HEADER_SIZE_IN_LONG;
+ else
+ data = NULL;
+
+ if (fprobe_shared_with_kprobes(fp))
+ ret = __fprobe_kprobe_handler(func, ret_ip, fp, fregs, data);
+ else
+ ret = __fprobe_handler(func, ret_ip, fp, fregs, data);
+
+ /* If entry_handler returns non-zero, nmissed is not incremented but the exit_handler is skipped. */
+ if (!ret && fp->exit_handler) {
+ int size_words = SIZE_IN_LONG(data_size);
+
+ if (write_fprobe_header(&fgraph_data[used], fp, size_words))
+ used += FPROBE_HEADER_SIZE_IN_LONG + size_words;
+ }
+ }
+ if (used < reserved_words)
+ memset(fgraph_data + used, 0, reserved_words - used);
+
+ /* If any exit_handler is set, data must be used. */
+ return used != 0;
+}
+NOKPROBE_SYMBOL(fprobe_entry);
+
+static void fprobe_return(struct ftrace_graph_ret *trace,
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
+{
+ unsigned long *fgraph_data = NULL;
+ unsigned long ret_ip;
+ struct fprobe *fp;
+ int size, curr;
+ int size_words;
+
+ fgraph_data = (unsigned long *)fgraph_retrieve_data(gops->idx, &size);
+ if (WARN_ON_ONCE(!fgraph_data))
return;
+ size_words = SIZE_IN_LONG(size);
+ ret_ip = ftrace_regs_get_instruction_pointer(fregs);
+
+ preempt_disable();
+
+ curr = 0;
+ while (size_words > curr) {
+ read_fprobe_header(&fgraph_data[curr], &fp, &size);
+ if (!fp)
+ break;
+ curr += FPROBE_HEADER_SIZE_IN_LONG;
+ if (is_fprobe_still_exist(fp) && !fprobe_disabled(fp)) {
+ if (WARN_ON_ONCE(curr + size > size_words))
+ break;
+ fp->exit_handler(fp, trace->func, ret_ip, fregs,
+ size ? fgraph_data + curr : NULL);
+ }
+ curr += size;
+ }
+ preempt_enable();
+}
+NOKPROBE_SYMBOL(fprobe_return);
+
+static struct fgraph_ops fprobe_graph_ops = {
+ .entryfunc = fprobe_entry,
+ .retfunc = fprobe_return,
+};
+static int fprobe_graph_active;
+
+/* Add @addrs to the ftrace filter and register fgraph if needed. */
+static int fprobe_graph_add_ips(unsigned long *addrs, int num)
+{
+ int ret;
+
+ lockdep_assert_held(&fprobe_mutex);
+
+ ret = ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 0, 0);
+ if (ret)
+ return ret;
+
+ if (!fprobe_graph_active) {
+ ret = register_ftrace_graph(&fprobe_graph_ops);
+ if (WARN_ON_ONCE(ret)) {
+ ftrace_free_filter(&fprobe_graph_ops.ops);
+ return ret;
+ }
}
+ fprobe_graph_active++;
+ return 0;
+}
- fp->exit_handler(fp, fpr->entry_ip, ret_ip, regs,
- fp->entry_data_size ? (void *)fpr->data : NULL);
- ftrace_test_recursion_unlock(bit);
+/* Remove @addrs from the ftrace filter and unregister fgraph if possible. */
+static void fprobe_graph_remove_ips(unsigned long *addrs, int num)
+{
+ lockdep_assert_held(&fprobe_mutex);
+
+ fprobe_graph_active--;
+ /* Q: should we unregister it? */
+ if (!fprobe_graph_active)
+ unregister_ftrace_graph(&fprobe_graph_ops);
+
+ if (num)
+ ftrace_set_filter_ips(&fprobe_graph_ops.ops, addrs, num, 1, 0);
}
-NOKPROBE_SYMBOL(fprobe_exit_handler);
static int symbols_cmp(const void *a, const void *b)
{
@@ -175,53 +439,97 @@ static unsigned long *get_ftrace_locations(const char **syms, int num)
return ERR_PTR(-ENOENT);
}
-static void fprobe_init(struct fprobe *fp)
-{
- fp->nmissed = 0;
- if (fprobe_shared_with_kprobes(fp))
- fp->ops.func = fprobe_kprobe_handler;
- else
- fp->ops.func = fprobe_handler;
- fp->ops.flags |= FTRACE_OPS_FL_SAVE_REGS;
-}
+struct filter_match_data {
+ const char *filter;
+ const char *notfilter;
+ size_t index;
+ size_t size;
+ unsigned long *addrs;
+};
-static int fprobe_init_rethook(struct fprobe *fp, int num)
+static int filter_match_callback(void *data, const char *name, unsigned long addr)
{
- int size;
+ struct filter_match_data *match = data;
- if (!fp->exit_handler) {
- fp->rethook = NULL;
+ if (!glob_match(match->filter, name) ||
+ (match->notfilter && glob_match(match->notfilter, name)))
return 0;
- }
- /* Initialize rethook if needed */
- if (fp->nr_maxactive)
- num = fp->nr_maxactive;
- else
- num *= num_possible_cpus() * 2;
- if (num <= 0)
- return -EINVAL;
+ if (!ftrace_location(addr))
+ return 0;
+
+ if (match->addrs)
+ match->addrs[match->index] = addr;
- size = sizeof(struct fprobe_rethook_node) + fp->entry_data_size;
+ match->index++;
+ return match->index == match->size;
+}
- /* Initialize rethook */
- fp->rethook = rethook_alloc((void *)fp, fprobe_exit_handler, size, num);
- if (IS_ERR(fp->rethook))
- return PTR_ERR(fp->rethook);
+/*
+ * Make IP list from the filter/no-filter glob patterns.
+ * Return the number of matched symbols, or -ENOENT.
+ */
+static int ip_list_from_filter(const char *filter, const char *notfilter,
+ unsigned long *addrs, size_t size)
+{
+ struct filter_match_data match = { .filter = filter, .notfilter = notfilter,
+ .index = 0, .size = size, .addrs = addrs};
+ int ret;
- return 0;
+ ret = kallsyms_on_each_symbol(filter_match_callback, &match);
+ if (ret < 0)
+ return ret;
+ ret = module_kallsyms_on_each_symbol(NULL, filter_match_callback, &match);
+ if (ret < 0)
+ return ret;
+
+ return match.index ?: -ENOENT;
}
static void fprobe_fail_cleanup(struct fprobe *fp)
{
- if (!IS_ERR_OR_NULL(fp->rethook)) {
- /* Don't need to cleanup rethook->handler because this is not used. */
- rethook_free(fp->rethook);
- fp->rethook = NULL;
+ kfree(fp->hlist_array);
+ fp->hlist_array = NULL;
+}
+
+/* Initialize the fprobe data structure. */
+static int fprobe_init(struct fprobe *fp, unsigned long *addrs, int num)
+{
+ struct fprobe_hlist *hlist_array;
+ unsigned long addr;
+ int size, i;
+
+ if (!fp || !addrs || num <= 0)
+ return -EINVAL;
+
+ size = ALIGN(fp->entry_data_size, sizeof(long));
+ if (size > MAX_FPROBE_DATA_SIZE)
+ return -E2BIG;
+ fp->entry_data_size = size;
+
+ hlist_array = kzalloc(struct_size(hlist_array, array, num), GFP_KERNEL);
+ if (!hlist_array)
+ return -ENOMEM;
+
+ fp->nmissed = 0;
+
+ hlist_array->size = num;
+ fp->hlist_array = hlist_array;
+ hlist_array->fp = fp;
+ for (i = 0; i < num; i++) {
+ hlist_array->array[i].fp = fp;
+ addr = ftrace_location(addrs[i]);
+ if (!addr) {
+ fprobe_fail_cleanup(fp);
+ return -ENOENT;
+ }
+ hlist_array->array[i].addr = addr;
}
- ftrace_free_filter(&fp->ops);
+ return 0;
}
+#define FPROBE_IPS_MAX INT_MAX
+
/**
* register_fprobe() - Register fprobe to ftrace by pattern.
* @fp: A fprobe data structure to be registered.
@@ -235,46 +543,24 @@ static void fprobe_fail_cleanup(struct fprobe *fp)
*/
int register_fprobe(struct fprobe *fp, const char *filter, const char *notfilter)
{
- struct ftrace_hash *hash;
- unsigned char *str;
- int ret, len;
+ unsigned long *addrs;
+ int ret;
if (!fp || !filter)
return -EINVAL;
- fprobe_init(fp);
-
- len = strlen(filter);
- str = kstrdup(filter, GFP_KERNEL);
- ret = ftrace_set_filter(&fp->ops, str, len, 0);
- kfree(str);
- if (ret)
+ ret = ip_list_from_filter(filter, notfilter, NULL, FPROBE_IPS_MAX);
+ if (ret < 0)
return ret;
- if (notfilter) {
- len = strlen(notfilter);
- str = kstrdup(notfilter, GFP_KERNEL);
- ret = ftrace_set_notrace(&fp->ops, str, len, 0);
- kfree(str);
- if (ret)
- goto out;
- }
-
- /* TODO:
- * correctly calculate the total number of filtered symbols
- * from both filter and notfilter.
- */
- hash = rcu_access_pointer(fp->ops.local_hash.filter_hash);
- if (WARN_ON_ONCE(!hash))
- goto out;
-
- ret = fprobe_init_rethook(fp, (int)hash->count);
- if (!ret)
- ret = register_ftrace_function(&fp->ops);
+ addrs = kcalloc(ret, sizeof(unsigned long), GFP_KERNEL);
+ if (!addrs)
+ return -ENOMEM;
+ ret = ip_list_from_filter(filter, notfilter, addrs, ret);
+ if (ret > 0)
+ ret = register_fprobe_ips(fp, addrs, ret);
-out:
- if (ret)
- fprobe_fail_cleanup(fp);
+ kfree(addrs);
return ret;
}
EXPORT_SYMBOL_GPL(register_fprobe);
@@ -282,7 +568,7 @@ EXPORT_SYMBOL_GPL(register_fprobe);
/**
* register_fprobe_ips() - Register fprobe to ftrace by address.
* @fp: A fprobe data structure to be registered.
- * @addrs: An array of target ftrace location addresses.
+ * @addrs: An array of target function addresses.
* @num: The number of entries of @addrs.
*
* Register @fp to ftrace for enabling the probe on the address given by @addrs.
@@ -294,23 +580,27 @@ EXPORT_SYMBOL_GPL(register_fprobe);
*/
int register_fprobe_ips(struct fprobe *fp, unsigned long *addrs, int num)
{
- int ret;
-
- if (!fp || !addrs || num <= 0)
- return -EINVAL;
-
- fprobe_init(fp);
+ struct fprobe_hlist *hlist_array;
+ int ret, i;
- ret = ftrace_set_filter_ips(&fp->ops, addrs, num, 0, 0);
+ ret = fprobe_init(fp, addrs, num);
if (ret)
return ret;
- ret = fprobe_init_rethook(fp, num);
- if (!ret)
- ret = register_ftrace_function(&fp->ops);
+ mutex_lock(&fprobe_mutex);
+
+ hlist_array = fp->hlist_array;
+ ret = fprobe_graph_add_ips(addrs, num);
+ if (!ret) {
+ add_fprobe_hash(fp);
+ for (i = 0; i < hlist_array->size; i++)
+ insert_fprobe_node(&hlist_array->array[i]);
+ }
+ mutex_unlock(&fprobe_mutex);
if (ret)
fprobe_fail_cleanup(fp);
+
return ret;
}
EXPORT_SYMBOL_GPL(register_fprobe_ips);
@@ -348,14 +638,13 @@ EXPORT_SYMBOL_GPL(register_fprobe_syms);
bool fprobe_is_registered(struct fprobe *fp)
{
- if (!fp || (fp->ops.saved_func != fprobe_handler &&
- fp->ops.saved_func != fprobe_kprobe_handler))
+ if (!fp || !fp->hlist_array)
return false;
return true;
}
/**
- * unregister_fprobe() - Unregister fprobe from ftrace
+ * unregister_fprobe() - Unregister fprobe.
* @fp: A fprobe data structure to be unregistered.
*
* Unregister fprobe (and remove ftrace hooks from the function entries).
@@ -364,23 +653,40 @@ bool fprobe_is_registered(struct fprobe *fp)
*/
int unregister_fprobe(struct fprobe *fp)
{
- int ret;
+ struct fprobe_hlist *hlist_array;
+ unsigned long *addrs = NULL;
+ int ret = 0, i, count;
- if (!fprobe_is_registered(fp))
- return -EINVAL;
+ mutex_lock(&fprobe_mutex);
+ if (!fp || !is_fprobe_still_exist(fp)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ hlist_array = fp->hlist_array;
+ addrs = kcalloc(hlist_array->size, sizeof(unsigned long), GFP_KERNEL);
+ if (!addrs) {
+ ret = -ENOMEM; /* TODO: Fallback to one-by-one loop */
+ goto out;
+ }
- if (!IS_ERR_OR_NULL(fp->rethook))
- rethook_stop(fp->rethook);
+ /* Remove non-synonym ips from table and hash */
+ count = 0;
+ for (i = 0; i < hlist_array->size; i++) {
+ if (!delete_fprobe_node(&hlist_array->array[i]))
+ addrs[count++] = hlist_array->array[i].addr;
+ }
+ del_fprobe_hash(fp);
- ret = unregister_ftrace_function(&fp->ops);
- if (ret < 0)
- return ret;
+ fprobe_graph_remove_ips(addrs, count);
- if (!IS_ERR_OR_NULL(fp->rethook))
- rethook_free(fp->rethook);
+ kfree_rcu(hlist_array, rcu);
+ fp->hlist_array = NULL;
- ftrace_free_filter(&fp->ops);
+out:
+ mutex_unlock(&fprobe_mutex);
+ kfree(addrs);
return ret;
}
EXPORT_SYMBOL_GPL(unregister_fprobe);
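For reference, a minimal sketch of how a client would use the reworked API above. The handler prototypes are inferred from the call sites in this diff (__fprobe_handler() and fprobe_return()); the target symbol "vfs_read" and the module wrapper are only illustrative assumptions, not part of this change.

#include <linux/fprobe.h>
#include <linux/module.h>

/* Entry handler, invoked from fprobe_entry(); a non-zero return skips the exit handler. */
static int sample_entry(struct fprobe *fp, unsigned long ip, unsigned long ret_ip,
			struct ftrace_regs *fregs, void *data)
{
	return 0;
}

/* Exit handler, replayed from the fgraph shadow stack in fprobe_return(). */
static void sample_exit(struct fprobe *fp, unsigned long ip, unsigned long ret_ip,
			struct ftrace_regs *fregs, void *data)
{
}

static struct fprobe sample_probe = {
	.entry_handler	= sample_entry,
	.exit_handler	= sample_exit,
};

static int __init sample_init(void)
{
	/* Glob filter on the function name; no notfilter. */
	return register_fprobe(&sample_probe, "vfs_read", NULL);
}

static void __exit sample_cleanup(void)
{
	unregister_fprobe(&sample_probe);
}

module_init(sample_init);
module_exit(sample_cleanup);
MODULE_LICENSE("GPL");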
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 9b17efb1a87dd..6b0c25761ccb1 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -536,24 +536,21 @@ static int function_stat_show(struct seq_file *m, void *v)
{
struct ftrace_profile *rec = v;
char str[KSYM_SYMBOL_LEN];
- int ret = 0;
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static struct trace_seq s;
unsigned long long avg;
unsigned long long stddev;
#endif
- mutex_lock(&ftrace_profile_lock);
+ guard(mutex)(&ftrace_profile_lock);
/* we raced with function_profile_reset() */
- if (unlikely(rec->counter == 0)) {
- ret = -EBUSY;
- goto out;
- }
+ if (unlikely(rec->counter == 0))
+ return -EBUSY;
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
avg = div64_ul(rec->time, rec->counter);
if (tracing_thresh && (avg < tracing_thresh))
- goto out;
+ return 0;
#endif
kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
@@ -590,10 +587,8 @@ static int function_stat_show(struct seq_file *m, void *v)
trace_print_seq(m, &s);
#endif
seq_putc(m, '\n');
-out:
- mutex_unlock(&ftrace_profile_lock);
- return ret;
+ return 0;
}
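The goto/unlock pairs removed throughout this file are replaced by the scope-based cleanup helpers from <linux/cleanup.h>. A minimal sketch of the pattern, with a made-up lock and error condition, assuming only that guard(mutex) behaves as defined in <linux/mutex.h>:

#include <linux/cleanup.h>
#include <linux/errno.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(example_lock);

static int example_op(bool fail)
{
	/* Takes example_lock here; it is released automatically on every return path. */
	guard(mutex)(&example_lock);

	if (fail)
		return -EBUSY;	/* no 'goto out' / mutex_unlock() needed */

	return 0;
}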
static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
@@ -789,27 +784,24 @@ function_profile_call(unsigned long ip, unsigned long parent_ip,
{
struct ftrace_profile_stat *stat;
struct ftrace_profile *rec;
- unsigned long flags;
if (!ftrace_profile_enabled)
return;
- local_irq_save(flags);
+ guard(preempt_notrace)();
stat = this_cpu_ptr(&ftrace_profile_stats);
if (!stat->hash || !ftrace_profile_enabled)
- goto out;
+ return;
rec = ftrace_find_profiled_func(stat, ip);
if (!rec) {
rec = ftrace_profile_alloc(stat, ip);
if (!rec)
- goto out;
+ return;
}
rec->counter++;
- out:
- local_irq_restore(flags);
}
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -827,7 +819,8 @@ struct profile_fgraph_data {
};
static int profile_graph_entry(struct ftrace_graph_ent *trace,
- struct fgraph_ops *gops)
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
struct profile_fgraph_data *profile_data;
@@ -849,26 +842,27 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace,
}
static void profile_graph_return(struct ftrace_graph_ret *trace,
- struct fgraph_ops *gops)
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
struct profile_fgraph_data *profile_data;
struct ftrace_profile_stat *stat;
unsigned long long calltime;
unsigned long long rettime = trace_clock_local();
struct ftrace_profile *rec;
- unsigned long flags;
int size;
- local_irq_save(flags);
+ guard(preempt_notrace)();
+
stat = this_cpu_ptr(&ftrace_profile_stats);
if (!stat->hash || !ftrace_profile_enabled)
- goto out;
+ return;
profile_data = fgraph_retrieve_data(gops->idx, &size);
/* If the calltime was zero'd ignore it */
if (!profile_data || !profile_data->calltime)
- goto out;
+ return;
calltime = rettime - profile_data->calltime;
@@ -896,22 +890,16 @@ static void profile_graph_return(struct ftrace_graph_ret *trace,
rec->time += calltime;
rec->time_squared += calltime * calltime;
}
-
- out:
- local_irq_restore(flags);
}
static struct fgraph_ops fprofiler_ops = {
- .ops = {
- .flags = FTRACE_OPS_FL_INITIALIZED,
- INIT_OPS_HASH(fprofiler_ops.ops)
- },
.entryfunc = &profile_graph_entry,
.retfunc = &profile_graph_return,
};
static int register_ftrace_profiler(void)
{
+ ftrace_ops_set_global_filter(&fprofiler_ops.ops);
return register_ftrace_graph(&fprofiler_ops);
}
@@ -922,12 +910,11 @@ static void unregister_ftrace_profiler(void)
#else
static struct ftrace_ops ftrace_profile_ops __read_mostly = {
.func = function_profile_call,
- .flags = FTRACE_OPS_FL_INITIALIZED,
- INIT_OPS_HASH(ftrace_profile_ops)
};
static int register_ftrace_profiler(void)
{
+ ftrace_ops_set_global_filter(&ftrace_profile_ops);
return register_ftrace_function(&ftrace_profile_ops);
}
@@ -950,20 +937,16 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
val = !!val;
- mutex_lock(&ftrace_profile_lock);
+ guard(mutex)(&ftrace_profile_lock);
if (ftrace_profile_enabled ^ val) {
if (val) {
ret = ftrace_profile_init();
- if (ret < 0) {
- cnt = ret;
- goto out;
- }
+ if (ret < 0)
+ return ret;
ret = register_ftrace_profiler();
- if (ret < 0) {
- cnt = ret;
- goto out;
- }
+ if (ret < 0)
+ return ret;
ftrace_profile_enabled = 1;
} else {
ftrace_profile_enabled = 0;
@@ -974,8 +957,6 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
unregister_ftrace_profiler();
}
}
- out:
- mutex_unlock(&ftrace_profile_lock);
*ppos += cnt;
@@ -1675,14 +1656,12 @@ unsigned long ftrace_location(unsigned long ip)
loc = ftrace_location_range(ip, ip);
if (!loc) {
if (!kallsyms_lookup_size_offset(ip, &size, &offset))
- goto out;
+ return 0;
/* map sym+0 to __fentry__ */
if (!offset)
loc = ftrace_location_range(ip, ip + size - 1);
}
-
-out:
return loc;
}
@@ -2077,7 +2056,7 @@ rollback:
continue;
if (rec == end)
- goto err_out;
+ return -EBUSY;
in_old = !!ftrace_lookup_ip(old_hash, rec->ip);
in_new = !!ftrace_lookup_ip(new_hash, rec->ip);
@@ -2090,7 +2069,6 @@ rollback:
rec->flags |= FTRACE_FL_IPMODIFY;
} while_for_each_ftrace_rec();
-err_out:
return -EBUSY;
}
@@ -3242,15 +3220,22 @@ static struct ftrace_hash *copy_hash(struct ftrace_hash *src)
 * The filter_hash update uses just the append_hash() function
* and the notrace_hash does not.
*/
-static int append_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash)
+static int append_hash(struct ftrace_hash **hash, struct ftrace_hash *new_hash,
+ int size_bits)
{
struct ftrace_func_entry *entry;
int size;
int i;
- /* An empty hash does everything */
- if (ftrace_hash_empty(*hash))
- return 0;
+ if (*hash) {
+ /* An empty hash does everything */
+ if (ftrace_hash_empty(*hash))
+ return 0;
+ } else {
+ *hash = alloc_ftrace_hash(size_bits);
+ if (!*hash)
+ return -ENOMEM;
+ }
/* If new_hash has everything make hash have everything */
if (ftrace_hash_empty(new_hash)) {
@@ -3314,16 +3299,18 @@ static int intersect_hash(struct ftrace_hash **hash, struct ftrace_hash *new_has
/* Return a new hash that has a union of all @ops->filter_hash entries */
static struct ftrace_hash *append_hashes(struct ftrace_ops *ops)
{
- struct ftrace_hash *new_hash;
+ struct ftrace_hash *new_hash = NULL;
struct ftrace_ops *subops;
+ int size_bits;
int ret;
- new_hash = alloc_ftrace_hash(ops->func_hash->filter_hash->size_bits);
- if (!new_hash)
- return NULL;
+ if (ops->func_hash->filter_hash)
+ size_bits = ops->func_hash->filter_hash->size_bits;
+ else
+ size_bits = FTRACE_HASH_DEFAULT_BITS;
list_for_each_entry(subops, &ops->subop_list, list) {
- ret = append_hash(&new_hash, subops->func_hash->filter_hash);
+ ret = append_hash(&new_hash, subops->func_hash->filter_hash, size_bits);
if (ret < 0) {
free_ftrace_hash(new_hash);
return NULL;
@@ -3332,7 +3319,8 @@ static struct ftrace_hash *append_hashes(struct ftrace_ops *ops)
if (ftrace_hash_empty(new_hash))
break;
}
- return new_hash;
+ /* Can't return NULL as that means this failed */
+ return new_hash ? : EMPTY_HASH;
}
 /* Make @ops trace everything except what all its subops do not trace */
@@ -3527,7 +3515,8 @@ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int
filter_hash = alloc_and_copy_ftrace_hash(size_bits, ops->func_hash->filter_hash);
if (!filter_hash)
return -ENOMEM;
- ret = append_hash(&filter_hash, subops->func_hash->filter_hash);
+ ret = append_hash(&filter_hash, subops->func_hash->filter_hash,
+ size_bits);
if (ret < 0) {
free_ftrace_hash(filter_hash);
return ret;
@@ -4930,23 +4919,6 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
return __ftrace_hash_move_and_update_ops(ops, orig_hash, hash, enable);
}
-static bool module_exists(const char *module)
-{
- /* All modules have the symbol __this_module */
- static const char this_mod[] = "__this_module";
- char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2];
- unsigned long val;
- int n;
-
- n = snprintf(modname, sizeof(modname), "%s:%s", module, this_mod);
-
- if (n > sizeof(modname) - 1)
- return false;
-
- val = module_kallsyms_lookup_name(modname);
- return val != 0;
-}
-
static int cache_mod(struct trace_array *tr,
const char *func, char *module, int enable)
{
@@ -4986,10 +4958,6 @@ static int cache_mod(struct trace_array *tr,
return ftrace_add_mod(tr, func, module, enable);
}
-static int
-ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
- int reset, int enable);
-
#ifdef CONFIG_MODULES
static void process_mod_list(struct list_head *head, struct ftrace_ops *ops,
char *mod, bool enable)
@@ -5619,20 +5587,15 @@ static DEFINE_MUTEX(ftrace_cmd_mutex);
__init int register_ftrace_command(struct ftrace_func_command *cmd)
{
struct ftrace_func_command *p;
- int ret = 0;
- mutex_lock(&ftrace_cmd_mutex);
+ guard(mutex)(&ftrace_cmd_mutex);
list_for_each_entry(p, &ftrace_commands, list) {
- if (strcmp(cmd->name, p->name) == 0) {
- ret = -EBUSY;
- goto out_unlock;
- }
+ if (strcmp(cmd->name, p->name) == 0)
+ return -EBUSY;
}
list_add(&cmd->list, &ftrace_commands);
- out_unlock:
- mutex_unlock(&ftrace_cmd_mutex);
- return ret;
+ return 0;
}
/*
@@ -5642,20 +5605,17 @@ __init int register_ftrace_command(struct ftrace_func_command *cmd)
__init int unregister_ftrace_command(struct ftrace_func_command *cmd)
{
struct ftrace_func_command *p, *n;
- int ret = -ENODEV;
- mutex_lock(&ftrace_cmd_mutex);
+ guard(mutex)(&ftrace_cmd_mutex);
+
list_for_each_entry_safe(p, n, &ftrace_commands, list) {
if (strcmp(cmd->name, p->name) == 0) {
- ret = 0;
list_del_init(&p->list);
- goto out_unlock;
+ return 0;
}
}
- out_unlock:
- mutex_unlock(&ftrace_cmd_mutex);
- return ret;
+ return -ENODEV;
}
static int ftrace_process_regex(struct ftrace_iterator *iter,
@@ -5665,7 +5625,7 @@ static int ftrace_process_regex(struct ftrace_iterator *iter,
struct trace_array *tr = iter->ops->private;
char *func, *command, *next = buff;
struct ftrace_func_command *p;
- int ret = -EINVAL;
+ int ret;
func = strsep(&next, ":");
@@ -5682,17 +5642,14 @@ static int ftrace_process_regex(struct ftrace_iterator *iter,
command = strsep(&next, ":");
- mutex_lock(&ftrace_cmd_mutex);
+ guard(mutex)(&ftrace_cmd_mutex);
+
list_for_each_entry(p, &ftrace_commands, list) {
- if (strcmp(p->name, command) == 0) {
- ret = p->func(tr, hash, func, command, next, enable);
- goto out_unlock;
- }
+ if (strcmp(p->name, command) == 0)
+ return p->func(tr, hash, func, command, next, enable);
}
- out_unlock:
- mutex_unlock(&ftrace_cmd_mutex);
- return ret;
+ return -EINVAL;
}
static ssize_t
@@ -5726,12 +5683,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
parser->idx, enable);
trace_parser_clear(parser);
if (ret < 0)
- goto out;
+ return ret;
}
- ret = read;
- out:
- return ret;
+ return read;
}
ssize_t
@@ -5763,6 +5718,9 @@ __ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
return -ENOENT;
free_hash_entry(hash, entry);
return 0;
+ } else if (__ftrace_lookup_ip(hash, ip) != NULL) {
+ /* Already exists */
+ return 0;
}
entry = add_hash_entry(hash, ip);
@@ -5792,7 +5750,7 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long *ips,
static int
ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
unsigned long *ips, unsigned int cnt,
- int remove, int reset, int enable)
+ int remove, int reset, int enable, char *mod)
{
struct ftrace_hash **orig_hash;
struct ftrace_hash *hash;
@@ -5818,7 +5776,15 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
goto out_regex_unlock;
}
- if (buf && !ftrace_match_records(hash, buf, len)) {
+ if (buf && !match_records(hash, buf, len, mod)) {
+ /* If this was for a module and nothing was enabled, flag it */
+ if (mod)
+ (*orig_hash)->flags |= FTRACE_HASH_FL_MOD;
+
+ /*
+ * Even if it is a mod, return error to let caller know
+ * nothing was added
+ */
ret = -EINVAL;
goto out_regex_unlock;
}
@@ -5843,7 +5809,7 @@ static int
ftrace_set_addr(struct ftrace_ops *ops, unsigned long *ips, unsigned int cnt,
int remove, int reset, int enable)
{
- return ftrace_set_hash(ops, NULL, 0, ips, cnt, remove, reset, enable);
+ return ftrace_set_hash(ops, NULL, 0, ips, cnt, remove, reset, enable, NULL);
}
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
@@ -6221,7 +6187,38 @@ static int
ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
int reset, int enable)
{
- return ftrace_set_hash(ops, buf, len, NULL, 0, 0, reset, enable);
+ char *mod = NULL, *func, *command, *next = buf;
+ char *tmp __free(kfree) = NULL;
+ struct trace_array *tr = ops->private;
+ int ret;
+
+ func = strsep(&next, ":");
+
+ /* This can also handle :mod: parsing */
+ if (next) {
+ if (!tr)
+ return -EINVAL;
+
+ command = strsep(&next, ":");
+ if (strcmp(command, "mod") != 0)
+ return -EINVAL;
+
+ mod = next;
+ len = command - func;
+ /* Save the original func as ftrace_set_hash() can modify it */
+ tmp = kstrdup(func, GFP_KERNEL);
+ }
+
+ ret = ftrace_set_hash(ops, func, len, NULL, 0, 0, reset, enable, mod);
+
+ if (tr && mod && ret < 0) {
+ /* Did tmp fail to allocate? */
+ if (!tmp)
+ return -ENOMEM;
+ ret = cache_mod(tr, tmp, mod, enable);
+ }
+
+ return ret;
}
/**
@@ -6385,6 +6382,14 @@ ftrace_set_early_filter(struct ftrace_ops *ops, char *buf, int enable)
ftrace_ops_init(ops);
+ /* The trace_array is needed for caching module function filters */
+ if (!ops->private) {
+ struct trace_array *tr = trace_get_global_array();
+
+ ops->private = tr;
+ ftrace_init_trace_array(tr);
+ }
+
while (buf) {
func = strsep(&buf, ",");
ftrace_set_regex(ops, func, strlen(func), 0, enable);
@@ -7818,9 +7823,14 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops)
void ftrace_init_trace_array(struct trace_array *tr)
{
+ if (tr->flags & TRACE_ARRAY_FL_MOD_INIT)
+ return;
+
INIT_LIST_HEAD(&tr->func_probes);
INIT_LIST_HEAD(&tr->mod_trace);
INIT_LIST_HEAD(&tr->mod_notrace);
+
+ tr->flags |= TRACE_ARRAY_FL_MOD_INIT;
}
#else
@@ -7849,7 +7859,8 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops)
__init void ftrace_init_global_array_ops(struct trace_array *tr)
{
tr->ops = &global_ops;
- tr->ops->private = tr;
+ if (!global_ops.private)
+ global_ops.private = tr;
ftrace_init_trace_array(tr);
init_array_fgraph_ops(tr, tr->ops);
}
@@ -8291,7 +8302,7 @@ pid_write(struct file *filp, const char __user *ubuf,
if (!cnt)
return 0;
- mutex_lock(&ftrace_lock);
+ guard(mutex)(&ftrace_lock);
switch (type) {
case TRACE_PIDS:
@@ -8307,14 +8318,13 @@ pid_write(struct file *filp, const char __user *ubuf,
lockdep_is_held(&ftrace_lock));
break;
default:
- ret = -EINVAL;
WARN_ON_ONCE(1);
- goto out;
+ return -EINVAL;
}
ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
if (ret < 0)
- goto out;
+ return ret;
switch (type) {
case TRACE_PIDS:
@@ -8343,11 +8353,8 @@ pid_write(struct file *filp, const char __user *ubuf,
ftrace_update_pid_func();
ftrace_startup_all(0);
- out:
- mutex_unlock(&ftrace_lock);
- if (ret > 0)
- *ppos += ret;
+ *ppos += ret;
return ret;
}
@@ -8750,17 +8757,17 @@ static int
ftrace_enable_sysctl(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
- int ret = -ENODEV;
+ int ret;
- mutex_lock(&ftrace_lock);
+ guard(mutex)(&ftrace_lock);
if (unlikely(ftrace_disabled))
- goto out;
+ return -ENODEV;
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
- goto out;
+ return ret;
if (ftrace_enabled) {
@@ -8774,8 +8781,7 @@ ftrace_enable_sysctl(const struct ctl_table *table, int write,
} else {
if (is_permanent_ops_registered()) {
ftrace_enabled = true;
- ret = -EBUSY;
- goto out;
+ return -EBUSY;
}
/* stopping ftrace calls (just send to ftrace_stub) */
@@ -8785,12 +8791,10 @@ ftrace_enable_sysctl(const struct ctl_table *table, int write,
}
last_ftrace_enabled = !!ftrace_enabled;
- out:
- mutex_unlock(&ftrace_lock);
- return ret;
+ return 0;
}
-static struct ctl_table ftrace_sysctls[] = {
+static const struct ctl_table ftrace_sysctls[] = {
{
.procname = "ftrace_enabled",
.data = &ftrace_enabled,
diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
index 4966e6bbdf6f3..c62b9b3cfb3d8 100644
--- a/kernel/trace/pid_list.c
+++ b/kernel/trace/pid_list.c
@@ -414,7 +414,7 @@ struct trace_pid_list *trace_pid_list_alloc(void)
int i;
 /* According to linux/thread.h, pids can be no bigger than 30 bits */
- WARN_ON_ONCE(pid_max > (1 << 30));
+ WARN_ON_ONCE(init_pid_ns.pid_max > (1 << 30));
pid_list = kzalloc(sizeof(*pid_list), GFP_KERNEL);
if (!pid_list)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7e257e855dd19..bb6089c2951e5 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1672,7 +1672,8 @@ static void *rb_range_buffer(struct ring_buffer_per_cpu *cpu_buffer, int idx)
* must be the same.
*/
static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu,
- struct trace_buffer *buffer, int nr_pages)
+ struct trace_buffer *buffer, int nr_pages,
+ unsigned long *subbuf_mask)
{
int subbuf_size = PAGE_SIZE;
struct buffer_data_page *subbuf;
@@ -1680,6 +1681,9 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu,
unsigned long buffers_end;
int i;
+ if (!subbuf_mask)
+ return false;
+
/* Check the meta magic and meta struct size */
if (meta->magic != RING_BUFFER_META_MAGIC ||
meta->struct_size != sizeof(*meta)) {
@@ -1712,6 +1716,8 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu,
subbuf = rb_subbufs_from_meta(meta);
+ bitmap_clear(subbuf_mask, 0, meta->nr_subbufs);
+
 /* Do the meta buffers and the subbufs themselves have correct data? */
for (i = 0; i < meta->nr_subbufs; i++) {
if (meta->buffers[i] < 0 ||
@@ -1725,6 +1731,12 @@ static bool rb_meta_valid(struct ring_buffer_meta *meta, int cpu,
return false;
}
+ if (test_bit(meta->buffers[i], subbuf_mask)) {
+ pr_info("Ring buffer boot meta [%d] array has duplicates\n", cpu);
+ return false;
+ }
+
+ set_bit(meta->buffers[i], subbuf_mask);
subbuf = (void *)subbuf + subbuf_size;
}
@@ -1838,6 +1850,11 @@ static void rb_meta_validate_events(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->cpu);
goto invalid;
}
+
+ /* If the buffer has content, update pages_touched */
+ if (ret)
+ local_inc(&cpu_buffer->pages_touched);
+
entries += ret;
entry_bytes += local_read(&head_page->page->commit);
local_set(&cpu_buffer->head_page->entries, ret);
@@ -1889,17 +1906,22 @@ static void rb_meta_init_text_addr(struct ring_buffer_meta *meta)
static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
{
struct ring_buffer_meta *meta;
+ unsigned long *subbuf_mask;
unsigned long delta;
void *subbuf;
int cpu;
int i;
+ /* Create a mask to test the subbuf array */
+ subbuf_mask = bitmap_alloc(nr_pages + 1, GFP_KERNEL);
+ /* If subbuf_mask fails to allocate, then rb_meta_valid() will return false */
+
for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
void *next_meta;
meta = rb_range_meta(buffer, nr_pages, cpu);
- if (rb_meta_valid(meta, cpu, buffer, nr_pages)) {
+ if (rb_meta_valid(meta, cpu, buffer, nr_pages, subbuf_mask)) {
/* Make the mappings match the current address */
subbuf = rb_subbufs_from_meta(meta);
delta = (unsigned long)subbuf - meta->first_buffer;
@@ -1943,6 +1965,7 @@ static void rb_range_meta_init(struct trace_buffer *buffer, int nr_pages)
subbuf += meta->subbuf_size;
}
}
+ bitmap_free(subbuf_mask);
}
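The duplicate check added to rb_meta_valid() is the usual bitmap idiom: clear the mask, then test-and-set one bit per index. A self-contained sketch of that idiom (the array, count and bound are invented for illustration):

#include <linux/bitmap.h>
#include <linux/slab.h>

/* Return true if every entry of @ids (each < @nr) appears at most once. */
static bool ids_are_unique(const int *ids, int count, int nr)
{
	unsigned long *mask = bitmap_alloc(nr, GFP_KERNEL);
	bool ok = true;
	int i;

	if (!mask)
		return false;

	bitmap_clear(mask, 0, nr);
	for (i = 0; i < count; i++) {
		if (test_bit(ids[i], mask)) {
			ok = false;	/* duplicate found */
			break;
		}
		set_bit(ids[i], mask);
	}
	bitmap_free(mask);
	return ok;
}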
static void *rbm_start(struct seq_file *m, loff_t *pos)
@@ -4398,8 +4421,13 @@ rb_reserve_next_event(struct trace_buffer *buffer,
int nr_loops = 0;
int add_ts_default;
- /* ring buffer does cmpxchg, make sure it is safe in NMI context */
- if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) &&
+ /*
+ * The ring buffer does cmpxchg as well as atomic64 operations
+ * (for which some archs fall back to locking), so make sure this
+ * is safe in NMI context.
+ */
+ if ((!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) ||
+ IS_ENABLED(CONFIG_GENERIC_ATOMIC64)) &&
(unlikely(in_nmi()))) {
return NULL;
}
@@ -4682,40 +4710,22 @@ int ring_buffer_write(struct trace_buffer *buffer,
}
EXPORT_SYMBOL_GPL(ring_buffer_write);
-static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
+/*
+ * The total entries in the ring buffer is the running counter
+ * of entries entered into the ring buffer, minus the sum of
+ * the entries read from the ring buffer and the number of
+ * entries that were overwritten.
+ */
+static inline unsigned long
+rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
{
- struct buffer_page *reader = cpu_buffer->reader_page;
- struct buffer_page *head = rb_set_head_page(cpu_buffer);
- struct buffer_page *commit = cpu_buffer->commit_page;
-
- /* In case of error, head will be NULL */
- if (unlikely(!head))
- return true;
-
- /* Reader should exhaust content in reader page */
- if (reader->read != rb_page_size(reader))
- return false;
-
- /*
- * If writers are committing on the reader page, knowing all
- * committed content has been read, the ring buffer is empty.
- */
- if (commit == reader)
- return true;
-
- /*
- * If writers are committing on a page other than reader page
- * and head page, there should always be content to read.
- */
- if (commit != head)
- return false;
+ return local_read(&cpu_buffer->entries) -
+ (local_read(&cpu_buffer->overrun) + cpu_buffer->read);
+}
- /*
- * Writers are committing on the head page, we just need
- * to care about there're committed data, and the reader will
- * swap reader page with head page when it is to read data.
- */
- return rb_page_commit(commit) == 0;
+static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return !rb_num_of_entries(cpu_buffer);
}
/**
@@ -4861,19 +4871,6 @@ void ring_buffer_record_enable_cpu(struct trace_buffer *buffer, int cpu)
}
EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
-/*
- * The total entries in the ring buffer is the running counter
- * of entries entered into the ring buffer, minus the sum of
- * the entries read from the ring buffer and the number of
- * entries that were overwritten.
- */
-static inline unsigned long
-rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
-{
- return local_read(&cpu_buffer->entries) -
- (local_read(&cpu_buffer->overrun) + cpu_buffer->read);
-}
-
/**
* ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer
* @buffer: The ring buffer
@@ -7019,7 +7016,11 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
lockdep_assert_held(&cpu_buffer->mapping_lock);
nr_subbufs = cpu_buffer->nr_pages + 1; /* + reader-subbuf */
- nr_pages = ((nr_subbufs + 1) << subbuf_order) - pgoff; /* + meta-page */
+ nr_pages = ((nr_subbufs + 1) << subbuf_order); /* + meta-page */
+ if (nr_pages <= pgoff)
+ return -EINVAL;
+
+ nr_pages -= pgoff;
nr_vma_pages = vma_pages(vma);
if (!nr_vma_pages || nr_vma_pages > nr_pages)
@@ -7055,7 +7056,7 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
}
while (p < nr_pages) {
- struct page *page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]);
+ struct page *page;
int off = 0;
if (WARN_ON_ONCE(s >= nr_subbufs)) {
@@ -7063,6 +7064,8 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer,
goto out;
}
+ page = virt_to_page((void *)cpu_buffer->subbuf_ids[s]);
+
for (; off < (1 << (subbuf_order)); off++, page++) {
if (p >= nr_pages)
break;
@@ -7146,6 +7149,7 @@ int ring_buffer_map(struct trace_buffer *buffer, int cpu,
kfree(cpu_buffer->subbuf_ids);
cpu_buffer->subbuf_ids = NULL;
rb_free_meta_page(cpu_buffer);
+ atomic_dec(&cpu_buffer->resize_disabled);
}
unlock:
diff --git a/kernel/trace/rv/Kconfig b/kernel/trace/rv/Kconfig
index 831779607e849..8226352a00626 100644
--- a/kernel/trace/rv/Kconfig
+++ b/kernel/trace/rv/Kconfig
@@ -25,30 +25,9 @@ menuconfig RV
For further information, see:
Documentation/trace/rv/runtime-verification.rst
-config RV_MON_WIP
- depends on RV
- depends on PREEMPT_TRACER
- select DA_MON_EVENTS_IMPLICIT
- bool "wip monitor"
- help
- Enable wip (wakeup in preemptive) sample monitor that illustrates
- the usage of per-cpu monitors, and one limitation of the
- preempt_disable/enable events.
-
- For further information, see:
- Documentation/trace/rv/monitor_wip.rst
-
-config RV_MON_WWNR
- depends on RV
- select DA_MON_EVENTS_ID
- bool "wwnr monitor"
- help
- Enable wwnr (wakeup while not running) sample monitor, this is a
- sample monitor that illustrates the usage of per-task monitor.
- The model is borken on purpose: it serves to test reactors.
-
- For further information, see:
- Documentation/trace/rv/monitor_wwnr.rst
+source "kernel/trace/rv/monitors/wip/Kconfig"
+source "kernel/trace/rv/monitors/wwnr/Kconfig"
+# Add new monitors here
config RV_REACTORS
bool "Runtime verification reactors"
diff --git a/kernel/trace/rv/Makefile b/kernel/trace/rv/Makefile
index 963d14875b454..188b64668e1fa 100644
--- a/kernel/trace/rv/Makefile
+++ b/kernel/trace/rv/Makefile
@@ -1,8 +1,11 @@
# SPDX-License-Identifier: GPL-2.0
+ccflags-y += -I $(src) # needed for trace events
+
obj-$(CONFIG_RV) += rv.o
obj-$(CONFIG_RV_MON_WIP) += monitors/wip/wip.o
obj-$(CONFIG_RV_MON_WWNR) += monitors/wwnr/wwnr.o
+# Add new monitors here
obj-$(CONFIG_RV_REACTORS) += rv_reactors.o
obj-$(CONFIG_RV_REACT_PRINTK) += reactor_printk.o
obj-$(CONFIG_RV_REACT_PANIC) += reactor_panic.o
diff --git a/kernel/trace/rv/monitors/wip/Kconfig b/kernel/trace/rv/monitors/wip/Kconfig
new file mode 100644
index 0000000000000..3ef664b5cd903
--- /dev/null
+++ b/kernel/trace/rv/monitors/wip/Kconfig
@@ -0,0 +1,12 @@
+config RV_MON_WIP
+ depends on RV
+ depends on PREEMPT_TRACER
+ select DA_MON_EVENTS_IMPLICIT
+ bool "wip monitor"
+ help
+ Enable wip (wakeup in preemptive) sample monitor that illustrates
+ the usage of per-cpu monitors, and one limitation of the
+ preempt_disable/enable events.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_wip.rst
diff --git a/kernel/trace/rv/monitors/wip/wip.c b/kernel/trace/rv/monitors/wip/wip.c
index b2b49a27e8863..db7389157c87e 100644
--- a/kernel/trace/rv/monitors/wip/wip.c
+++ b/kernel/trace/rv/monitors/wip/wip.c
@@ -10,7 +10,7 @@
#define MODULE_NAME "wip"
-#include <trace/events/rv.h>
+#include <rv_trace.h>
#include <trace/events/sched.h>
#include <trace/events/preemptirq.h>
diff --git a/kernel/trace/rv/monitors/wip/wip_trace.h b/kernel/trace/rv/monitors/wip/wip_trace.h
new file mode 100644
index 0000000000000..aa2162f47a4c3
--- /dev/null
+++ b/kernel/trace/rv/monitors/wip/wip_trace.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_WIP
+DEFINE_EVENT(event_da_monitor, event_wip,
+ TP_PROTO(char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor, error_wip,
+ TP_PROTO(char *state, char *event),
+ TP_ARGS(state, event));
+#endif /* CONFIG_RV_MON_WIP */
diff --git a/kernel/trace/rv/monitors/wwnr/Kconfig b/kernel/trace/rv/monitors/wwnr/Kconfig
new file mode 100644
index 0000000000000..ee741aa6d6b89
--- /dev/null
+++ b/kernel/trace/rv/monitors/wwnr/Kconfig
@@ -0,0 +1,11 @@
+config RV_MON_WWNR
+ depends on RV
+ select DA_MON_EVENTS_ID
+ bool "wwnr monitor"
+ help
+ Enable the wwnr (wakeup while not running) sample monitor. This is a
+ sample monitor that illustrates the usage of per-task monitors.
+ The model is broken on purpose: it serves to test reactors.
+
+ For further information, see:
+ Documentation/trace/rv/monitor_wwnr.rst
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr.c b/kernel/trace/rv/monitors/wwnr/wwnr.c
index 0e43dd2db685d..3b16994a99845 100644
--- a/kernel/trace/rv/monitors/wwnr/wwnr.c
+++ b/kernel/trace/rv/monitors/wwnr/wwnr.c
@@ -10,7 +10,7 @@
#define MODULE_NAME "wwnr"
-#include <trace/events/rv.h>
+#include <rv_trace.h>
#include <trace/events/sched.h>
#include "wwnr.h"
diff --git a/kernel/trace/rv/monitors/wwnr/wwnr_trace.h b/kernel/trace/rv/monitors/wwnr/wwnr_trace.h
new file mode 100644
index 0000000000000..fc97ec7476ad1
--- /dev/null
+++ b/kernel/trace/rv/monitors/wwnr/wwnr_trace.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Snippet to be included in rv_trace.h
+ */
+
+#ifdef CONFIG_RV_MON_WWNR
+/* id is the pid of the task */
+DEFINE_EVENT(event_da_monitor_id, event_wwnr,
+ TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state),
+ TP_ARGS(id, state, event, next_state, final_state));
+
+DEFINE_EVENT(error_da_monitor_id, error_wwnr,
+ TP_PROTO(int id, char *state, char *event),
+ TP_ARGS(id, state, event));
+#endif /* CONFIG_RV_MON_WWNR */
diff --git a/kernel/trace/rv/rv.c b/kernel/trace/rv/rv.c
index 279c70e1bd745..8657fc8806e7c 100644
--- a/kernel/trace/rv/rv.c
+++ b/kernel/trace/rv/rv.c
@@ -145,7 +145,7 @@
#ifdef CONFIG_DA_MON_EVENTS
#define CREATE_TRACE_POINTS
-#include <trace/events/rv.h>
+#include <rv_trace.h>
#endif
#include "rv.h"
diff --git a/kernel/trace/rv/rv_trace.h b/kernel/trace/rv/rv_trace.h
new file mode 100644
index 0000000000000..5e65097423ba4
--- /dev/null
+++ b/kernel/trace/rv/rv_trace.h
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM rv
+
+#if !defined(_TRACE_RV_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_RV_H
+
+#include <linux/rv.h>
+#include <linux/tracepoint.h>
+
+#ifdef CONFIG_DA_MON_EVENTS_IMPLICIT
+DECLARE_EVENT_CLASS(event_da_monitor,
+
+ TP_PROTO(char *state, char *event, char *next_state, bool final_state),
+
+ TP_ARGS(state, event, next_state, final_state),
+
+ TP_STRUCT__entry(
+ __array( char, state, MAX_DA_NAME_LEN )
+ __array( char, event, MAX_DA_NAME_LEN )
+ __array( char, next_state, MAX_DA_NAME_LEN )
+ __field( bool, final_state )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->state, state, MAX_DA_NAME_LEN);
+ memcpy(__entry->event, event, MAX_DA_NAME_LEN);
+ memcpy(__entry->next_state, next_state, MAX_DA_NAME_LEN);
+ __entry->final_state = final_state;
+ ),
+
+ TP_printk("%s x %s -> %s %s",
+ __entry->state,
+ __entry->event,
+ __entry->next_state,
+ __entry->final_state ? "(final)" : "")
+);
+
+DECLARE_EVENT_CLASS(error_da_monitor,
+
+ TP_PROTO(char *state, char *event),
+
+ TP_ARGS(state, event),
+
+ TP_STRUCT__entry(
+ __array( char, state, MAX_DA_NAME_LEN )
+ __array( char, event, MAX_DA_NAME_LEN )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->state, state, MAX_DA_NAME_LEN);
+ memcpy(__entry->event, event, MAX_DA_NAME_LEN);
+ ),
+
+ TP_printk("event %s not expected in the state %s",
+ __entry->event,
+ __entry->state)
+);
+
+#include <monitors/wip/wip_trace.h>
+// Add new monitors based on CONFIG_DA_MON_EVENTS_IMPLICIT here
+
+#endif /* CONFIG_DA_MON_EVENTS_IMPLICIT */
+
+#ifdef CONFIG_DA_MON_EVENTS_ID
+DECLARE_EVENT_CLASS(event_da_monitor_id,
+
+ TP_PROTO(int id, char *state, char *event, char *next_state, bool final_state),
+
+ TP_ARGS(id, state, event, next_state, final_state),
+
+ TP_STRUCT__entry(
+ __field( int, id )
+ __array( char, state, MAX_DA_NAME_LEN )
+ __array( char, event, MAX_DA_NAME_LEN )
+ __array( char, next_state, MAX_DA_NAME_LEN )
+ __field( bool, final_state )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->state, state, MAX_DA_NAME_LEN);
+ memcpy(__entry->event, event, MAX_DA_NAME_LEN);
+ memcpy(__entry->next_state, next_state, MAX_DA_NAME_LEN);
+ __entry->id = id;
+ __entry->final_state = final_state;
+ ),
+
+ TP_printk("%d: %s x %s -> %s %s",
+ __entry->id,
+ __entry->state,
+ __entry->event,
+ __entry->next_state,
+ __entry->final_state ? "(final)" : "")
+);
+
+DECLARE_EVENT_CLASS(error_da_monitor_id,
+
+ TP_PROTO(int id, char *state, char *event),
+
+ TP_ARGS(id, state, event),
+
+ TP_STRUCT__entry(
+ __field( int, id )
+ __array( char, state, MAX_DA_NAME_LEN )
+ __array( char, event, MAX_DA_NAME_LEN )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->state, state, MAX_DA_NAME_LEN);
+ memcpy(__entry->event, event, MAX_DA_NAME_LEN);
+ __entry->id = id;
+ ),
+
+ TP_printk("%d: event %s not expected in the state %s",
+ __entry->id,
+ __entry->event,
+ __entry->state)
+);
+
+#include <monitors/wwnr/wwnr_trace.h>
+// Add new monitors based on CONFIG_DA_MON_EVENTS_ID here
+
+#endif /* CONFIG_DA_MON_EVENTS_ID */
+#endif /* _TRACE_RV_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE rv_trace
+#include <trace/define_trace.h>
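As an illustration of the pattern these includes establish, a hypothetical new monitor "foo" based on the implicit (per-cpu) event classes would ship its own snippet alongside wip_trace.h and be included under CONFIG_DA_MON_EVENTS_IMPLICIT. The monitor name and its Kconfig symbol below are invented:

/* SPDX-License-Identifier: GPL-2.0 */
/* Hypothetical monitors/foo/foo_trace.h, included from rv_trace.h */
#ifdef CONFIG_RV_MON_FOO
DEFINE_EVENT(event_da_monitor, event_foo,
	     TP_PROTO(char *state, char *event, char *next_state, bool final_state),
	     TP_ARGS(state, event, next_state, final_state));

DEFINE_EVENT(error_da_monitor, error_foo,
	     TP_PROTO(char *state, char *event),
	     TP_ARGS(state, event));
#endif /* CONFIG_RV_MON_FOO */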
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index be62f0ea1814d..0e6d517e74e0f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -26,6 +26,7 @@
#include <linux/hardirq.h>
#include <linux/linkage.h>
#include <linux/uaccess.h>
+#include <linux/cleanup.h>
#include <linux/vmalloc.h>
#include <linux/ftrace.h>
#include <linux/module.h>
@@ -535,19 +536,16 @@ LIST_HEAD(ftrace_trace_arrays);
int trace_array_get(struct trace_array *this_tr)
{
struct trace_array *tr;
- int ret = -ENODEV;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
if (tr == this_tr) {
tr->ref++;
- ret = 0;
- break;
+ return 0;
}
}
- mutex_unlock(&trace_types_lock);
- return ret;
+ return -ENODEV;
}
static void __trace_array_put(struct trace_array *this_tr)
@@ -1443,22 +1441,20 @@ EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data,
cond_update_fn_t update)
{
- struct cond_snapshot *cond_snapshot;
- int ret = 0;
+ struct cond_snapshot *cond_snapshot __free(kfree) =
+ kzalloc(sizeof(*cond_snapshot), GFP_KERNEL);
+ int ret;
- cond_snapshot = kzalloc(sizeof(*cond_snapshot), GFP_KERNEL);
if (!cond_snapshot)
return -ENOMEM;
cond_snapshot->cond_data = cond_data;
cond_snapshot->update = update;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
- if (tr->current_trace->use_max_tr) {
- ret = -EBUSY;
- goto fail_unlock;
- }
+ if (tr->current_trace->use_max_tr)
+ return -EBUSY;
/*
* The cond_snapshot can only change to NULL without the
@@ -1468,29 +1464,20 @@ int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data,
* do safely with only holding the trace_types_lock and not
* having to take the max_lock.
*/
- if (tr->cond_snapshot) {
- ret = -EBUSY;
- goto fail_unlock;
- }
+ if (tr->cond_snapshot)
+ return -EBUSY;
ret = tracing_arm_snapshot_locked(tr);
if (ret)
- goto fail_unlock;
+ return ret;
local_irq_disable();
arch_spin_lock(&tr->max_lock);
- tr->cond_snapshot = cond_snapshot;
+ tr->cond_snapshot = no_free_ptr(cond_snapshot);
arch_spin_unlock(&tr->max_lock);
local_irq_enable();
- mutex_unlock(&trace_types_lock);
-
- return ret;
-
- fail_unlock:
- mutex_unlock(&trace_types_lock);
- kfree(cond_snapshot);
- return ret;
+ return 0;
}
EXPORT_SYMBOL_GPL(tracing_snapshot_cond_enable);
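The conversion above leans on the auto-cleanup attributes from <linux/cleanup.h>: __free(kfree) releases the allocation on any early return, and no_free_ptr() transfers ownership once the pointer is safely stored. A minimal sketch of the same pattern; the struct and the store target are invented for illustration:

#include <linux/cleanup.h>
#include <linux/errno.h>
#include <linux/slab.h>

struct example_cfg {
	int val;
};

static struct example_cfg *installed_cfg;

static int install_cfg(int val)
{
	struct example_cfg *c __free(kfree) = kzalloc(sizeof(*c), GFP_KERNEL);

	if (!c)
		return -ENOMEM;
	if (val < 0)
		return -EINVAL;		/* c is kfree()d automatically here */

	c->val = val;
	installed_cfg = no_free_ptr(c);	/* take ownership; the auto-free is skipped */
	return 0;
}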
@@ -2203,10 +2190,10 @@ static __init int init_trace_selftests(void)
selftests_can_run = true;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
if (list_empty(&postponed_selftests))
- goto out;
+ return 0;
pr_info("Running postponed tracer tests:\n");
@@ -2235,9 +2222,6 @@ static __init int init_trace_selftests(void)
}
tracing_selftest_running = false;
- out:
- mutex_unlock(&trace_types_lock);
-
return 0;
}
core_initcall(init_trace_selftests);
@@ -2807,7 +2791,7 @@ int tracepoint_printk_sysctl(const struct ctl_table *table, int write,
int save_tracepoint_printk;
int ret;
- mutex_lock(&tracepoint_printk_mutex);
+ guard(mutex)(&tracepoint_printk_mutex);
save_tracepoint_printk = tracepoint_printk;
ret = proc_dointvec(table, write, buffer, lenp, ppos);
@@ -2820,16 +2804,13 @@ int tracepoint_printk_sysctl(const struct ctl_table *table, int write,
tracepoint_printk = 0;
if (save_tracepoint_printk == tracepoint_printk)
- goto out;
+ return ret;
if (tracepoint_printk)
static_key_enable(&tracepoint_printk_key.key);
else
static_key_disable(&tracepoint_printk_key.key);
- out:
- mutex_unlock(&tracepoint_printk_mutex);
-
return ret;
}
@@ -3611,17 +3592,12 @@ char *trace_iter_expand_format(struct trace_iterator *iter)
}
/* Returns true if the string is safe to dereference from an event */
-static bool trace_safe_str(struct trace_iterator *iter, const char *str,
- bool star, int len)
+static bool trace_safe_str(struct trace_iterator *iter, const char *str)
{
unsigned long addr = (unsigned long)str;
struct trace_event *trace_event;
struct trace_event_call *event;
- /* Ignore strings with no length */
- if (star && !len)
- return true;
-
/* OK if part of the event data */
if ((addr >= (unsigned long)iter->ent) &&
(addr < (unsigned long)iter->ent + iter->ent_size))
@@ -3661,181 +3637,69 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str,
return false;
}
-static DEFINE_STATIC_KEY_FALSE(trace_no_verify);
-
-static int test_can_verify_check(const char *fmt, ...)
-{
- char buf[16];
- va_list ap;
- int ret;
-
- /*
- * The verifier is dependent on vsnprintf() modifies the va_list
- * passed to it, where it is sent as a reference. Some architectures
- * (like x86_32) passes it by value, which means that vsnprintf()
- * does not modify the va_list passed to it, and the verifier
- * would then need to be able to understand all the values that
- * vsnprintf can use. If it is passed by value, then the verifier
- * is disabled.
- */
- va_start(ap, fmt);
- vsnprintf(buf, 16, "%d", ap);
- ret = va_arg(ap, int);
- va_end(ap);
-
- return ret;
-}
-
-static void test_can_verify(void)
-{
- if (!test_can_verify_check("%d %d", 0, 1)) {
- pr_info("trace event string verifier disabled\n");
- static_branch_inc(&trace_no_verify);
- }
-}
-
/**
- * trace_check_vprintf - Check dereferenced strings while writing to the seq buffer
+ * ignore_event - Check dereferenced fields while writing to the seq buffer
* @iter: The iterator that holds the seq buffer and the event being printed
- * @fmt: The format used to print the event
- * @ap: The va_list holding the data to print from @fmt.
*
- * This writes the data into the @iter->seq buffer using the data from
- * @fmt and @ap. If the format has a %s, then the source of the string
- * is examined to make sure it is safe to print, otherwise it will
- * warn and print "[UNSAFE MEMORY]" in place of the dereferenced string
- * pointer.
+ * At boot up, test_event_printk() will flag any event that dereferences
+ * a string with "%s" that does not exist in the ring buffer. It may still
+ * be valid, as the string may point to a static string in the kernel
+ * rodata that never gets freed. But if the string pointer is pointing
+ * to something that was allocated, there's a chance that it can be freed
+ * by the time the user reads the trace. This would cause a bad memory
+ * access by the kernel and possibly crash the system.
+ *
+ * This function will check if the event has any fields flagged as needing
+ * to be checked at runtime and perform those checks.
+ *
+ * If it is found that a field is unsafe, it will write into the @iter->seq
+ * a message stating what was found to be unsafe.
+ *
+ * @return: true if the event is unsafe and should be ignored,
+ * false otherwise.
*/
-void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
- va_list ap)
+bool ignore_event(struct trace_iterator *iter)
{
- long text_delta = 0;
- long data_delta = 0;
- const char *p = fmt;
- const char *str;
- bool good;
- int i, j;
+ struct ftrace_event_field *field;
+ struct trace_event *trace_event;
+ struct trace_event_call *event;
+ struct list_head *head;
+ struct trace_seq *seq;
+ const void *ptr;
- if (WARN_ON_ONCE(!fmt))
- return;
+ trace_event = ftrace_find_event(iter->ent->type);
- if (static_branch_unlikely(&trace_no_verify))
- goto print;
+ seq = &iter->seq;
- /*
- * When the kernel is booted with the tp_printk command line
- * parameter, trace events go directly through to printk().
- * It also is checked by this function, but it does not
- * have an associated trace_array (tr) for it.
- */
- if (iter->tr) {
- text_delta = iter->tr->text_delta;
- data_delta = iter->tr->data_delta;
+ if (!trace_event) {
+ trace_seq_printf(seq, "EVENT ID %d NOT FOUND?\n", iter->ent->type);
+ return true;
}
- /* Don't bother checking when doing a ftrace_dump() */
- if (iter->fmt == static_fmt_buf)
- goto print;
-
- while (*p) {
- bool star = false;
- int len = 0;
-
- j = 0;
-
- /*
- * We only care about %s and variants
- * as well as %p[sS] if delta is non-zero
- */
- for (i = 0; p[i]; i++) {
- if (i + 1 >= iter->fmt_size) {
- /*
- * If we can't expand the copy buffer,
- * just print it.
- */
- if (!trace_iter_expand_format(iter))
- goto print;
- }
-
- if (p[i] == '\\' && p[i+1]) {
- i++;
- continue;
- }
- if (p[i] == '%') {
- /* Need to test cases like %08.*s */
- for (j = 1; p[i+j]; j++) {
- if (isdigit(p[i+j]) ||
- p[i+j] == '.')
- continue;
- if (p[i+j] == '*') {
- star = true;
- continue;
- }
- break;
- }
- if (p[i+j] == 's')
- break;
-
- if (text_delta && p[i+1] == 'p' &&
- ((p[i+2] == 's' || p[i+2] == 'S')))
- break;
-
- star = false;
- }
- j = 0;
- }
- /* If no %s found then just print normally */
- if (!p[i])
- break;
-
- /* Copy up to the %s, and print that */
- strncpy(iter->fmt, p, i);
- iter->fmt[i] = '\0';
- trace_seq_vprintf(&iter->seq, iter->fmt, ap);
+ event = container_of(trace_event, struct trace_event_call, event);
+ if (!(event->flags & TRACE_EVENT_FL_TEST_STR))
+ return false;
- /* Add delta to %pS pointers */
- if (p[i+1] == 'p') {
- unsigned long addr;
- char fmt[4];
+ head = trace_get_fields(event);
+ if (!head) {
+ trace_seq_printf(seq, "FIELDS FOR EVENT '%s' NOT FOUND?\n",
+ trace_event_name(event));
+ return true;
+ }
- fmt[0] = '%';
- fmt[1] = 'p';
- fmt[2] = p[i+2]; /* Either %ps or %pS */
- fmt[3] = '\0';
+ /* Offsets are from the iter->ent that points to the raw event */
+ ptr = iter->ent;
- addr = va_arg(ap, unsigned long);
- addr += text_delta;
- trace_seq_printf(&iter->seq, fmt, (void *)addr);
+ list_for_each_entry(field, head, link) {
+ const char *str;
+ bool good;
- p += i + 3;
+ if (!field->needs_test)
continue;
- }
- /*
- * If iter->seq is full, the above call no longer guarantees
- * that ap is in sync with fmt processing, and further calls
- * to va_arg() can return wrong positional arguments.
- *
- * Ensure that ap is no longer used in this case.
- */
- if (iter->seq.full) {
- p = "";
- break;
- }
-
- if (star)
- len = va_arg(ap, int);
-
- /* The ap now points to the string data of the %s */
- str = va_arg(ap, const char *);
+ str = *(const char **)(ptr + field->offset);
- good = trace_safe_str(iter, str, star, len);
-
- /* Could be from the last boot */
- if (data_delta && !good) {
- str += data_delta;
- good = trace_safe_str(iter, str, star, len);
- }
+ good = trace_safe_str(iter, str);
/*
* If you hit this warning, it is likely that the
@@ -3846,44 +3710,14 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
* instead. See samples/trace_events/trace-events-sample.h
* for reference.
*/
- if (WARN_ONCE(!good, "fmt: '%s' current_buffer: '%s'",
- fmt, seq_buf_str(&iter->seq.seq))) {
- int ret;
-
- /* Try to safely read the string */
- if (star) {
- if (len + 1 > iter->fmt_size)
- len = iter->fmt_size - 1;
- if (len < 0)
- len = 0;
- ret = copy_from_kernel_nofault(iter->fmt, str, len);
- iter->fmt[len] = 0;
- star = false;
- } else {
- ret = strncpy_from_kernel_nofault(iter->fmt, str,
- iter->fmt_size);
- }
- if (ret < 0)
- trace_seq_printf(&iter->seq, "(0x%px)", str);
- else
- trace_seq_printf(&iter->seq, "(0x%px:%s)",
- str, iter->fmt);
- str = "[UNSAFE-MEMORY]";
- strcpy(iter->fmt, "%s");
- } else {
- strncpy(iter->fmt, p + i, j + 1);
- iter->fmt[j+1] = '\0';
+ if (WARN_ONCE(!good, "event '%s' has unsafe pointer field '%s'",
+ trace_event_name(event), field->name)) {
+ trace_seq_printf(seq, "EVENT %s: HAS UNSAFE POINTER FIELD '%s'\n",
+ trace_event_name(event), field->name);
+ return true;
}
- if (star)
- trace_seq_printf(&iter->seq, iter->fmt, len, str);
- else
- trace_seq_printf(&iter->seq, iter->fmt, str);
-
- p += i + j + 1;
}
- print:
- if (*p)
- trace_seq_vprintf(&iter->seq, p, ap);
+ return false;
}
const char *trace_event_format(struct trace_iterator *iter, const char *fmt)
@@ -4269,6 +4103,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
preempt_model_none() ? "server" :
preempt_model_voluntary() ? "desktop" :
preempt_model_full() ? "preempt" :
+ preempt_model_lazy() ? "lazy" :
preempt_model_rt() ? "preempt_rt" :
"unknown",
/* These are reserved for later use */
@@ -4353,6 +4188,15 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
if (event) {
if (tr->trace_flags & TRACE_ITER_FIELDS)
return print_event_fields(iter, event);
+ /*
+ * For TRACE_EVENT() events, the print_fmt is not
+ * safe to use if the array has delta offsets.
+ * Force printing via the fields.
+ */
+ if ((tr->text_delta || tr->data_delta) &&
+ event->type > __TRACE_LAST_TYPE)
+ return print_event_fields(iter, event);
+
return event->funcs->trace(iter, sym_flags, event);
}
@@ -5225,6 +5069,9 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
cpumask_var_t tracing_cpumask_new;
int err;
+ if (count == 0 || count > KMALLOC_MAX_SIZE)
+ return -EINVAL;
+
if (!zalloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
return -ENOMEM;
@@ -5261,7 +5108,8 @@ static int tracing_trace_options_show(struct seq_file *m, void *v)
u32 tracer_flags;
int i;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
+
tracer_flags = tr->current_trace->flags->val;
trace_opts = tr->current_trace->flags->opts;
@@ -5278,7 +5126,6 @@ static int tracing_trace_options_show(struct seq_file *m, void *v)
else
seq_printf(m, "no%s\n", trace_opts[i].name);
}
- mutex_unlock(&trace_types_lock);
return 0;
}
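
The conversions in this file use the scope-based locking guards from <linux/cleanup.h>: the mutex is released automatically on every return path, which is what lets the goto/unlock labels disappear. A minimal sketch of the two forms used here, with a made-up lock and function names for illustration:

    #include <linux/cleanup.h>
    #include <linux/mutex.h>

    static DEFINE_MUTEX(example_lock);      /* hypothetical lock, for illustration only */

    static int example_set_option(int val)
    {
            guard(mutex)(&example_lock);    /* released on every return below */

            if (val < 0)
                    return -EINVAL;         /* no goto out / mutex_unlock() needed */

            /* ... update state under the lock ... */
            return 0;
    }

    static void example_scoped(void)
    {
            scoped_guard(mutex, &example_lock) {
                    /* critical section limited to this block */
            }
            /* example_lock is already released here */
    }
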
@@ -5671,6 +5518,8 @@ static const char readme_msg[] =
"\t efield: For event probes ('e' types), the field is on of the fields\n"
"\t of the <attached-group>/<attached-event>.\n"
#endif
+ " set_event\t\t- Enables events by name written into it\n"
+ "\t\t\t Can enable module events via: :mod:<module>\n"
" events/\t\t- Directory containing all trace event subsystems:\n"
" enable\t\t- Write 0/1 to enable/disable tracing of all events\n"
" events/<system>/\t- Directory containing all trace events for <system>:\n"
@@ -5943,7 +5792,7 @@ trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start,
return;
}
- mutex_lock(&trace_eval_mutex);
+ guard(mutex)(&trace_eval_mutex);
if (!trace_eval_maps)
trace_eval_maps = map_array;
@@ -5967,8 +5816,6 @@ trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start,
map_array++;
}
memset(map_array, 0, sizeof(*map_array));
-
- mutex_unlock(&trace_eval_mutex);
}
static void trace_create_eval_file(struct dentry *d_tracer)
@@ -6130,26 +5977,15 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr,
ssize_t tracing_resize_ring_buffer(struct trace_array *tr,
unsigned long size, int cpu_id)
{
- int ret;
-
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
if (cpu_id != RING_BUFFER_ALL_CPUS) {
/* make sure, this cpu is enabled in the mask */
- if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask)) {
- ret = -EINVAL;
- goto out;
- }
+ if (!cpumask_test_cpu(cpu_id, tracing_buffer_mask))
+ return -EINVAL;
}
- ret = __tracing_resize_ring_buffer(tr, size, cpu_id);
- if (ret < 0)
- ret = -ENOMEM;
-
-out:
- mutex_unlock(&trace_types_lock);
-
- return ret;
+ return __tracing_resize_ring_buffer(tr, size, cpu_id);
}
static void update_last_data(struct trace_array *tr)
@@ -6240,9 +6076,9 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
#ifdef CONFIG_TRACER_MAX_TRACE
bool had_max_tr;
#endif
- int ret = 0;
+ int ret;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
update_last_data(tr);
@@ -6250,7 +6086,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
RING_BUFFER_ALL_CPUS);
if (ret < 0)
- goto out;
+ return ret;
ret = 0;
}
@@ -6258,43 +6094,37 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
if (strcmp(t->name, buf) == 0)
break;
}
- if (!t) {
- ret = -EINVAL;
- goto out;
- }
+ if (!t)
+ return -EINVAL;
+
if (t == tr->current_trace)
- goto out;
+ return 0;
#ifdef CONFIG_TRACER_SNAPSHOT
if (t->use_max_tr) {
local_irq_disable();
arch_spin_lock(&tr->max_lock);
- if (tr->cond_snapshot)
- ret = -EBUSY;
+ ret = tr->cond_snapshot ? -EBUSY : 0;
arch_spin_unlock(&tr->max_lock);
local_irq_enable();
if (ret)
- goto out;
+ return ret;
}
#endif
/* Some tracers won't work on kernel command line */
if (system_state < SYSTEM_RUNNING && t->noboot) {
pr_warn("Tracer '%s' is not allowed on command line, ignored\n",
t->name);
- goto out;
+ return -EINVAL;
}
/* Some tracers are only allowed for the top level buffer */
- if (!trace_ok_for_array(t, tr)) {
- ret = -EINVAL;
- goto out;
- }
+ if (!trace_ok_for_array(t, tr))
+ return -EINVAL;
/* If trace pipe files are being read, we can't change the tracer */
- if (tr->trace_ref) {
- ret = -EBUSY;
- goto out;
- }
+ if (tr->trace_ref)
+ return -EBUSY;
trace_branch_disable();
@@ -6325,7 +6155,7 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
if (!had_max_tr && t->use_max_tr) {
ret = tracing_arm_snapshot_locked(tr);
if (ret)
- goto out;
+ return ret;
}
#else
tr->current_trace = &nop_trace;
@@ -6338,17 +6168,15 @@ int tracing_set_tracer(struct trace_array *tr, const char *buf)
if (t->use_max_tr)
tracing_disarm_snapshot(tr);
#endif
- goto out;
+ return ret;
}
}
tr->current_trace = t;
tr->current_trace->enabled++;
trace_branch_enable(tr);
- out:
- mutex_unlock(&trace_types_lock);
- return ret;
+ return 0;
}
static ssize_t
@@ -6426,22 +6254,18 @@ tracing_thresh_write(struct file *filp, const char __user *ubuf,
struct trace_array *tr = filp->private_data;
int ret;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
ret = tracing_nsecs_write(&tracing_thresh, ubuf, cnt, ppos);
if (ret < 0)
- goto out;
+ return ret;
if (tr->current_trace->update_thresh) {
ret = tr->current_trace->update_thresh(tr);
if (ret < 0)
- goto out;
+ return ret;
}
- ret = cnt;
-out:
- mutex_unlock(&trace_types_lock);
-
- return ret;
+ return cnt;
}
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -6660,31 +6484,29 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
* This is just a matter of traces coherency, the ring buffer itself
* is protected.
*/
- mutex_lock(&iter->mutex);
+ guard(mutex)(&iter->mutex);
/* return any leftover data */
sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
if (sret != -EBUSY)
- goto out;
+ return sret;
trace_seq_init(&iter->seq);
if (iter->trace->read) {
sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
if (sret)
- goto out;
+ return sret;
}
waitagain:
sret = tracing_wait_pipe(filp);
if (sret <= 0)
- goto out;
+ return sret;
/* stop when tracing is finished */
- if (trace_empty(iter)) {
- sret = 0;
- goto out;
- }
+ if (trace_empty(iter))
+ return 0;
if (cnt >= TRACE_SEQ_BUFFER_SIZE)
cnt = TRACE_SEQ_BUFFER_SIZE - 1;
@@ -6748,9 +6570,6 @@ waitagain:
if (sret == -EBUSY)
goto waitagain;
-out:
- mutex_unlock(&iter->mutex);
-
return sret;
}
@@ -7342,25 +7161,19 @@ u64 tracing_event_time_stamp(struct trace_buffer *buffer, struct ring_buffer_eve
*/
int tracing_set_filter_buffering(struct trace_array *tr, bool set)
{
- int ret = 0;
-
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
if (set && tr->no_filter_buffering_ref++)
- goto out;
+ return 0;
if (!set) {
- if (WARN_ON_ONCE(!tr->no_filter_buffering_ref)) {
- ret = -EINVAL;
- goto out;
- }
+ if (WARN_ON_ONCE(!tr->no_filter_buffering_ref))
+ return -EINVAL;
--tr->no_filter_buffering_ref;
}
- out:
- mutex_unlock(&trace_types_lock);
- return ret;
+ return 0;
}
struct ftrace_buffer_info {
@@ -7436,12 +7249,10 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (ret)
return ret;
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
- if (tr->current_trace->use_max_tr) {
- ret = -EBUSY;
- goto out;
- }
+ if (tr->current_trace->use_max_tr)
+ return -EBUSY;
local_irq_disable();
arch_spin_lock(&tr->max_lock);
@@ -7450,24 +7261,20 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
arch_spin_unlock(&tr->max_lock);
local_irq_enable();
if (ret)
- goto out;
+ return ret;
switch (val) {
case 0:
- if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
- ret = -EINVAL;
- break;
- }
+ if (iter->cpu_file != RING_BUFFER_ALL_CPUS)
+ return -EINVAL;
if (tr->allocated_snapshot)
free_snapshot(tr);
break;
case 1:
/* Only allow per-cpu swap if the ring buffer supports it */
#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP
- if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
- ret = -EINVAL;
- break;
- }
+ if (iter->cpu_file != RING_BUFFER_ALL_CPUS)
+ return -EINVAL;
#endif
if (tr->allocated_snapshot)
ret = resize_buffer_duplicate_size(&tr->max_buffer,
@@ -7475,7 +7282,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
ret = tracing_arm_snapshot_locked(tr);
if (ret)
- break;
+ return ret;
/* Now, we're going to swap */
if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
@@ -7502,8 +7309,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
*ppos += cnt;
ret = cnt;
}
-out:
- mutex_unlock(&trace_types_lock);
+
return ret;
}
@@ -7889,12 +7695,11 @@ void tracing_log_err(struct trace_array *tr,
len += sizeof(CMD_PREFIX) + 2 * sizeof("\n") + strlen(cmd) + 1;
- mutex_lock(&tracing_err_log_lock);
+ guard(mutex)(&tracing_err_log_lock);
+
err = get_tracing_log_err(tr, len);
- if (PTR_ERR(err) == -ENOMEM) {
- mutex_unlock(&tracing_err_log_lock);
+ if (PTR_ERR(err) == -ENOMEM)
return;
- }
snprintf(err->loc, TRACING_LOG_LOC_MAX, "%s: error: ", loc);
snprintf(err->cmd, len, "\n" CMD_PREFIX "%s\n", cmd);
@@ -7905,7 +7710,6 @@ void tracing_log_err(struct trace_array *tr,
err->info.ts = local_clock();
list_add_tail(&err->list, &tr->err_log);
- mutex_unlock(&tracing_err_log_lock);
}
static void clear_tracing_err_log(struct trace_array *tr)
@@ -8475,6 +8279,10 @@ static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma)
struct trace_iterator *iter = &info->iter;
int ret = 0;
+ /* Currently the boot mapped buffer is not supported for mmap */
+ if (iter->tr->flags & TRACE_ARRAY_FL_BOOT)
+ return -ENODEV;
+
ret = get_snapshot_map(iter->tr);
if (ret)
return ret;
@@ -9601,6 +9409,10 @@ trace_array_create_systems(const char *name, const char *systems,
INIT_LIST_HEAD(&tr->hist_vars);
INIT_LIST_HEAD(&tr->err_log);
+#ifdef CONFIG_MODULES
+ INIT_LIST_HEAD(&tr->mod_events);
+#endif
+
if (allocate_trace_buffers(tr, trace_buf_size) < 0)
goto out_free_tr;
@@ -9649,20 +9461,17 @@ static int instance_mkdir(const char *name)
struct trace_array *tr;
int ret;
- mutex_lock(&event_mutex);
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&event_mutex);
+ guard(mutex)(&trace_types_lock);
ret = -EEXIST;
if (trace_array_find(name))
- goto out_unlock;
+ return -EEXIST;
tr = trace_array_create(name);
ret = PTR_ERR_OR_ZERO(tr);
-out_unlock:
- mutex_unlock(&trace_types_lock);
- mutex_unlock(&event_mutex);
return ret;
}
@@ -9712,24 +9521,23 @@ struct trace_array *trace_array_get_by_name(const char *name, const char *system
{
struct trace_array *tr;
- mutex_lock(&event_mutex);
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&event_mutex);
+ guard(mutex)(&trace_types_lock);
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
- if (tr->name && strcmp(tr->name, name) == 0)
- goto out_unlock;
+ if (tr->name && strcmp(tr->name, name) == 0) {
+ tr->ref++;
+ return tr;
+ }
}
tr = trace_array_create_systems(name, systems, 0, 0);
if (IS_ERR(tr))
tr = NULL;
-out_unlock:
- if (tr)
+ else
tr->ref++;
- mutex_unlock(&trace_types_lock);
- mutex_unlock(&event_mutex);
return tr;
}
EXPORT_SYMBOL_GPL(trace_array_get_by_name);
@@ -9780,48 +9588,36 @@ static int __remove_instance(struct trace_array *tr)
int trace_array_destroy(struct trace_array *this_tr)
{
struct trace_array *tr;
- int ret;
if (!this_tr)
return -EINVAL;
- mutex_lock(&event_mutex);
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&event_mutex);
+ guard(mutex)(&trace_types_lock);
- ret = -ENODEV;
/* Making sure trace array exists before destroying it. */
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
- if (tr == this_tr) {
- ret = __remove_instance(tr);
- break;
- }
+ if (tr == this_tr)
+ return __remove_instance(tr);
}
- mutex_unlock(&trace_types_lock);
- mutex_unlock(&event_mutex);
-
- return ret;
+ return -ENODEV;
}
EXPORT_SYMBOL_GPL(trace_array_destroy);
static int instance_rmdir(const char *name)
{
struct trace_array *tr;
- int ret;
- mutex_lock(&event_mutex);
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&event_mutex);
+ guard(mutex)(&trace_types_lock);
- ret = -ENODEV;
tr = trace_array_find(name);
- if (tr)
- ret = __remove_instance(tr);
-
- mutex_unlock(&trace_types_lock);
- mutex_unlock(&event_mutex);
+ if (!tr)
+ return -ENODEV;
- return ret;
+ return __remove_instance(tr);
}
static __init void create_trace_instances(struct dentry *d_tracer)
@@ -9834,19 +9630,16 @@ static __init void create_trace_instances(struct dentry *d_tracer)
if (MEM_FAIL(!trace_instance_dir, "Failed to create instances directory\n"))
return;
- mutex_lock(&event_mutex);
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&event_mutex);
+ guard(mutex)(&trace_types_lock);
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
if (!tr->name)
continue;
if (MEM_FAIL(trace_array_create_dir(tr) < 0,
"Failed to create instance directory\n"))
- break;
+ return;
}
-
- mutex_unlock(&trace_types_lock);
- mutex_unlock(&event_mutex);
}
static void
@@ -10036,6 +9829,24 @@ late_initcall_sync(trace_eval_sync);
#ifdef CONFIG_MODULES
+
+bool module_exists(const char *module)
+{
+ /* All modules have the symbol __this_module */
+ static const char this_mod[] = "__this_module";
+ char modname[MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 2];
+ unsigned long val;
+ int n;
+
+ n = snprintf(modname, sizeof(modname), "%s:%s", module, this_mod);
+
+ if (n > sizeof(modname) - 1)
+ return false;
+
+ val = module_kallsyms_lookup_name(modname);
+ return val != 0;
+}
+
static void trace_module_add_evals(struct module *mod)
{
if (!mod->num_trace_evals)
@@ -10060,7 +9871,7 @@ static void trace_module_remove_evals(struct module *mod)
if (!mod->num_trace_evals)
return;
- mutex_lock(&trace_eval_mutex);
+ guard(mutex)(&trace_eval_mutex);
map = trace_eval_maps;
@@ -10072,12 +9883,10 @@ static void trace_module_remove_evals(struct module *mod)
map = map->tail.next;
}
if (!map)
- goto out;
+ return;
*last = trace_eval_jmp_to_tail(map)->tail.next;
kfree(map);
- out:
- mutex_unlock(&trace_eval_mutex);
}
#else
static inline void trace_module_remove_evals(struct module *mod) { }
@@ -10750,6 +10559,10 @@ __init static int tracer_alloc_buffers(void)
#endif
ftrace_init_global_array_ops(&global_trace);
+#ifdef CONFIG_MODULES
+ INIT_LIST_HEAD(&global_trace.mod_events);
+#endif
+
init_trace_flags_index(&global_trace);
register_tracer(&nop_trace);
@@ -10777,8 +10590,6 @@ __init static int tracer_alloc_buffers(void)
register_snapshot_cmd();
- test_can_verify();
-
return 0;
out_free_pipe_cpumask:
@@ -10797,6 +10608,14 @@ out:
return ret;
}
+#ifdef CONFIG_FUNCTION_TRACER
+/* Used to set module cached ftrace filtering at boot up */
+__init struct trace_array *trace_get_global_array(void)
+{
+ return &global_trace;
+}
+#endif
+
void __init ftrace_boot_snapshot(void)
{
#ifdef CONFIG_TRACER_MAX_TRACE
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 266740b4e1212..9c21ba45b7af6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -400,6 +400,9 @@ struct trace_array {
cpumask_var_t pipe_cpumask;
int ref;
int trace_ref;
+#ifdef CONFIG_MODULES
+ struct list_head mod_events;
+#endif
#ifdef CONFIG_FUNCTION_TRACER
struct ftrace_ops *ops;
struct trace_pid_list __rcu *function_pids;
@@ -432,8 +435,18 @@ struct trace_array {
enum {
TRACE_ARRAY_FL_GLOBAL = BIT(0),
TRACE_ARRAY_FL_BOOT = BIT(1),
+ TRACE_ARRAY_FL_MOD_INIT = BIT(2),
};
+#ifdef CONFIG_MODULES
+bool module_exists(const char *module);
+#else
+static inline bool module_exists(const char *module)
+{
+ return false;
+}
+#endif
+
extern struct list_head ftrace_trace_arrays;
extern struct mutex trace_types_lock;
@@ -667,9 +680,8 @@ void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer,
bool trace_is_tracepoint_string(const char *str);
const char *trace_event_format(struct trace_iterator *iter, const char *fmt);
-void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
- va_list ap) __printf(2, 0);
char *trace_iter_expand_format(struct trace_iterator *iter);
+bool ignore_event(struct trace_iterator *iter);
int trace_empty(struct trace_iterator *iter);
@@ -694,8 +706,10 @@ void trace_latency_header(struct seq_file *m);
void trace_default_header(struct seq_file *m);
void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
-void trace_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops);
-int trace_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops);
+void trace_graph_return(struct ftrace_graph_ret *trace, struct fgraph_ops *gops,
+ struct ftrace_regs *fregs);
+int trace_graph_entry(struct ftrace_graph_ent *trace, struct fgraph_ops *gops,
+ struct ftrace_regs *fregs);
void tracing_start_cmdline_record(void);
void tracing_stop_cmdline_record(void);
@@ -718,8 +732,6 @@ extern unsigned long tracing_thresh;
/* PID filtering */
-extern int pid_max;
-
bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids,
pid_t search_pid);
bool trace_ignore_this_task(struct trace_pid_list *filtered_pids,
@@ -912,7 +924,9 @@ extern int __trace_graph_retaddr_entry(struct trace_array *tr,
unsigned long retaddr);
extern void __trace_graph_return(struct trace_array *tr,
struct ftrace_graph_ret *trace,
- unsigned int trace_ctx);
+ unsigned int trace_ctx,
+ u64 calltime, u64 rettime);
+
extern void init_array_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops);
extern int allocate_fgraph_ops(struct trace_array *tr, struct ftrace_ops *ops);
extern void free_fgraph_ops(struct trace_array *tr);
@@ -1115,6 +1129,7 @@ void ftrace_destroy_function_files(struct trace_array *tr);
int ftrace_allocate_ftrace_ops(struct trace_array *tr);
void ftrace_free_ftrace_ops(struct trace_array *tr);
void ftrace_init_global_array_ops(struct trace_array *tr);
+struct trace_array *trace_get_global_array(void);
void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func);
void ftrace_reset_array_ops(struct trace_array *tr);
void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer);
@@ -1413,7 +1428,8 @@ struct ftrace_event_field {
int filter_type;
int offset;
int size;
- int is_signed;
+ unsigned int is_signed:1;
+ unsigned int needs_test:1;
int len;
};
diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c
index 4376887e0d8aa..a322e4f249a50 100644
--- a/kernel/trace/trace_dynevent.c
+++ b/kernel/trace/trace_dynevent.c
@@ -74,24 +74,19 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type
struct dyn_event *pos, *n;
char *system = NULL, *event, *p;
int argc, ret = -ENOENT;
- char **argv;
+ char **argv __free(argv_free) = argv_split(GFP_KERNEL, raw_command, &argc);
- argv = argv_split(GFP_KERNEL, raw_command, &argc);
if (!argv)
return -ENOMEM;
if (argv[0][0] == '-') {
- if (argv[0][1] != ':') {
- ret = -EINVAL;
- goto out;
- }
+ if (argv[0][1] != ':')
+ return -EINVAL;
event = &argv[0][2];
} else {
event = strchr(argv[0], ':');
- if (!event) {
- ret = -EINVAL;
- goto out;
- }
+ if (!event)
+ return -EINVAL;
event++;
}
@@ -101,10 +96,8 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type
event = p + 1;
*p = '\0';
}
- if (!system && event[0] == '\0') {
- ret = -EINVAL;
- goto out;
- }
+ if (!system && event[0] == '\0')
+ return -EINVAL;
mutex_lock(&event_mutex);
for_each_dyn_event_safe(pos, n) {
@@ -120,8 +113,6 @@ int dyn_event_release(const char *raw_command, struct dyn_event_operations *type
}
tracing_reset_all_online_cpus();
mutex_unlock(&event_mutex);
-out:
- argv_free(argv);
return ret;
}
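
The early returns above are made safe by the __free() auto-cleanup annotation from <linux/cleanup.h>: argv is released through argv_free() whenever it goes out of scope, so no cleanup label is needed. A hedged sketch of the idiom with a hypothetical parser:

    #include <linux/cleanup.h>
    #include <linux/slab.h>
    #include <linux/string.h>

    static int example_parse(const char *raw_command)
    {
            int argc;
            /* argv_free() runs automatically at every return below */
            char **argv __free(argv_free) = argv_split(GFP_KERNEL, raw_command, &argc);

            if (!argv)
                    return -ENOMEM;

            if (argc < 2)
                    return -EINVAL;         /* no explicit argv_free() needed */

            /* ... use argv[0] .. argv[argc - 1] ... */
            return 0;
    }
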
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 82fd174ebbe01..fbfb396905a6b 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -124,8 +124,8 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
__field_packed( unsigned long, ret, retval )
__field_packed( int, ret, depth )
__field_packed( unsigned int, ret, overrun )
- __field_packed( unsigned long long, ret, calltime)
- __field_packed( unsigned long long, ret, rettime )
+ __field(unsigned long long, calltime )
+ __field(unsigned long long, rettime )
),
F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d retval: %lx",
@@ -146,8 +146,8 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry,
__field_packed( unsigned long, ret, func )
__field_packed( int, ret, depth )
__field_packed( unsigned int, ret, overrun )
- __field_packed( unsigned long long, ret, calltime)
- __field_packed( unsigned long long, ret, rettime )
+ __field(unsigned long long, calltime )
+ __field(unsigned long long, rettime )
),
F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d",
diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c
index be8be0c1aaf0f..82fd637cfc19e 100644
--- a/kernel/trace/trace_eprobe.c
+++ b/kernel/trace/trace_eprobe.c
@@ -917,10 +917,10 @@ static int __trace_eprobe_create(int argc, const char *argv[])
goto error;
}
- mutex_lock(&event_mutex);
- event_call = find_and_get_event(sys_name, sys_event);
- ep = alloc_event_probe(group, event, event_call, argc - 2);
- mutex_unlock(&event_mutex);
+ scoped_guard(mutex, &event_mutex) {
+ event_call = find_and_get_event(sys_name, sys_event);
+ ep = alloc_event_probe(group, event, event_call, argc - 2);
+ }
if (IS_ERR(ep)) {
ret = PTR_ERR(ep);
@@ -952,23 +952,21 @@ static int __trace_eprobe_create(int argc, const char *argv[])
if (ret < 0)
goto error;
init_trace_eprobe_call(ep);
- mutex_lock(&event_mutex);
- ret = trace_probe_register_event_call(&ep->tp);
- if (ret) {
- if (ret == -EEXIST) {
- trace_probe_log_set_index(0);
- trace_probe_log_err(0, EVENT_EXIST);
+ scoped_guard(mutex, &event_mutex) {
+ ret = trace_probe_register_event_call(&ep->tp);
+ if (ret) {
+ if (ret == -EEXIST) {
+ trace_probe_log_set_index(0);
+ trace_probe_log_err(0, EVENT_EXIST);
+ }
+ goto error;
+ }
+ ret = dyn_event_add(&ep->devent, &ep->tp.event->call);
+ if (ret < 0) {
+ trace_probe_unregister_event_call(&ep->tp);
+ goto error;
}
- mutex_unlock(&event_mutex);
- goto error;
- }
- ret = dyn_event_add(&ep->devent, &ep->tp.event->call);
- if (ret < 0) {
- trace_probe_unregister_event_call(&ep->tp);
- mutex_unlock(&event_mutex);
- goto error;
}
- mutex_unlock(&event_mutex);
return ret;
parse_error:
ret = -EINVAL;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 77e68efbd43e2..513de9ceb80ef 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -82,7 +82,7 @@ static int system_refcount_dec(struct event_subsystem *system)
}
static struct ftrace_event_field *
-__find_event_field(struct list_head *head, char *name)
+__find_event_field(struct list_head *head, const char *name)
{
struct ftrace_event_field *field;
@@ -114,7 +114,8 @@ trace_find_event_field(struct trace_event_call *call, char *name)
static int __trace_define_field(struct list_head *head, const char *type,
const char *name, int offset, int size,
- int is_signed, int filter_type, int len)
+ int is_signed, int filter_type, int len,
+ int need_test)
{
struct ftrace_event_field *field;
@@ -133,6 +134,7 @@ static int __trace_define_field(struct list_head *head, const char *type,
field->offset = offset;
field->size = size;
field->is_signed = is_signed;
+ field->needs_test = need_test;
field->len = len;
list_add(&field->link, head);
@@ -151,13 +153,13 @@ int trace_define_field(struct trace_event_call *call, const char *type,
head = trace_get_fields(call);
return __trace_define_field(head, type, name, offset, size,
- is_signed, filter_type, 0);
+ is_signed, filter_type, 0, 0);
}
EXPORT_SYMBOL_GPL(trace_define_field);
static int trace_define_field_ext(struct trace_event_call *call, const char *type,
const char *name, int offset, int size, int is_signed,
- int filter_type, int len)
+ int filter_type, int len, int need_test)
{
struct list_head *head;
@@ -166,13 +168,13 @@ static int trace_define_field_ext(struct trace_event_call *call, const char *typ
head = trace_get_fields(call);
return __trace_define_field(head, type, name, offset, size,
- is_signed, filter_type, len);
+ is_signed, filter_type, len, need_test);
}
#define __generic_field(type, item, filter_type) \
ret = __trace_define_field(&ftrace_generic_fields, #type, \
#item, 0, 0, is_signed_type(type), \
- filter_type, 0); \
+ filter_type, 0, 0); \
if (ret) \
return ret;
@@ -181,7 +183,8 @@ static int trace_define_field_ext(struct trace_event_call *call, const char *typ
"common_" #item, \
offsetof(typeof(ent), item), \
sizeof(ent.item), \
- is_signed_type(type), FILTER_OTHER, 0); \
+ is_signed_type(type), FILTER_OTHER, \
+ 0, 0); \
if (ret) \
return ret;
@@ -244,19 +247,16 @@ int trace_event_get_offsets(struct trace_event_call *call)
return tail->offset + tail->size;
}
-/*
- * Check if the referenced field is an array and return true,
- * as arrays are OK to dereference.
- */
-static bool test_field(const char *fmt, struct trace_event_call *call)
+
+static struct trace_event_fields *find_event_field(const char *fmt,
+ struct trace_event_call *call)
{
struct trace_event_fields *field = call->class->fields_array;
- const char *array_descriptor;
const char *p = fmt;
int len;
if (!(len = str_has_prefix(fmt, "REC->")))
- return false;
+ return NULL;
fmt += len;
for (p = fmt; *p; p++) {
if (!isalnum(*p) && *p != '_')
@@ -265,16 +265,141 @@ static bool test_field(const char *fmt, struct trace_event_call *call)
len = p - fmt;
for (; field->type; field++) {
- if (strncmp(field->name, fmt, len) ||
- field->name[len])
+ if (strncmp(field->name, fmt, len) || field->name[len])
continue;
- array_descriptor = strchr(field->type, '[');
- /* This is an array and is OK to dereference. */
- return array_descriptor != NULL;
+
+ return field;
+ }
+ return NULL;
+}
+
+/*
+ * Check if the referenced field is an array and return true,
+ * as arrays are OK to dereference.
+ */
+static bool test_field(const char *fmt, struct trace_event_call *call)
+{
+ struct trace_event_fields *field;
+
+ field = find_event_field(fmt, call);
+ if (!field)
+ return false;
+
+ /* This is an array and is OK to dereference. */
+ return strchr(field->type, '[') != NULL;
+}
+
+/* Look for a string within an argument */
+static bool find_print_string(const char *arg, const char *str, const char *end)
+{
+ const char *r;
+
+ r = strstr(arg, str);
+ return r && r < end;
+}
+
+/* Return true if the argument pointer is safe */
+static bool process_pointer(const char *fmt, int len, struct trace_event_call *call)
+{
+ const char *r, *e, *a;
+
+ e = fmt + len;
+
+ /* Find the REC-> in the argument */
+ r = strstr(fmt, "REC->");
+ if (r && r < e) {
+ /*
+ * Addresses of events on the buffer, or an array on the buffer is
+ * OK to dereference. There's ways to fool this, but
+ * this is to catch common mistakes, not malicious code.
+ */
+ a = strchr(fmt, '&');
+ if ((a && (a < r)) || test_field(r, call))
+ return true;
+ } else if (find_print_string(fmt, "__get_dynamic_array(", e)) {
+ return true;
+ } else if (find_print_string(fmt, "__get_rel_dynamic_array(", e)) {
+ return true;
+ } else if (find_print_string(fmt, "__get_dynamic_array_len(", e)) {
+ return true;
+ } else if (find_print_string(fmt, "__get_rel_dynamic_array_len(", e)) {
+ return true;
+ } else if (find_print_string(fmt, "__get_sockaddr(", e)) {
+ return true;
+ } else if (find_print_string(fmt, "__get_rel_sockaddr(", e)) {
+ return true;
}
return false;
}
+/* Return true if the string is safe */
+static bool process_string(const char *fmt, int len, struct trace_event_call *call)
+{
+ struct trace_event_fields *field;
+ const char *r, *e, *s;
+
+ e = fmt + len;
+
+ /*
+ * There are several helper functions that return strings.
+ * If the argument contains a function, then assume its field is valid.
+ * The argument is treated as containing a function call if an
+ * alphanumeric character or '_' appears before a parenthesis.
+ */
+ s = fmt;
+ do {
+ r = strstr(s, "(");
+ if (!r || r >= e)
+ break;
+ for (int i = 1; r - i >= s; i++) {
+ char ch = *(r - i);
+ if (isspace(ch))
+ continue;
+ if (isalnum(ch) || ch == '_')
+ return true;
+ /* Anything else, this isn't a function */
+ break;
+ }
+ /* A function could be wrapped in parentheses, try the next one */
+ s = r + 1;
+ } while (s < e);
+
+ /*
+ * Check for arrays. If the argument has: foo[REC->val]
+ * then it is very likely that foo is an array of strings
+ * that are safe to use.
+ */
+ r = strstr(s, "[");
+ if (r && r < e) {
+ r = strstr(r, "REC->");
+ if (r && r < e)
+ return true;
+ }
+
+ /*
+ * If there are any strings in the argument, consider this arg OK as it
+ * could be: REC->field ? "foo" : "bar" and we don't want to get into
+ * verifying that logic here.
+ */
+ if (find_print_string(fmt, "\"", e))
+ return true;
+
+ /* Dereferenced strings are also valid like any other pointer */
+ if (process_pointer(fmt, len, call))
+ return true;
+
+ /* Make sure the field is found */
+ field = find_event_field(fmt, call);
+ if (!field)
+ return false;
+
+ /* Test this field's string before printing the event */
+ call->flags |= TRACE_EVENT_FL_TEST_STR;
+ field->needs_test = 1;
+
+ return true;
+}
+
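
For illustration, a hypothetical event that would take the new needs_test path: it records only a string pointer and prints it with %s, so the string it points to may already be gone when the buffer is read. process_string() marks such a field and ignore_event() verifies it when the event is printed; copying the string with __string()/__get_str() avoids the check entirely.

    /* Hypothetical event: stores the pointer, not the string data */
    TRACE_EVENT(sample_open,
            TP_PROTO(const char *filename),
            TP_ARGS(filename),
            TP_STRUCT__entry(
                    __field(const char *, filename)   /* pointer only: flagged */
            ),
            TP_fast_assign(
                    __entry->filename = filename;
            ),
            TP_printk("file=%s", __entry->filename)   /* needs_test gets set */
    );

    /*
     * Safe alternative: copy the string into the event itself.
     *   __string(filename, filename) + __assign_str(filename)
     *   TP_printk("file=%s", __get_str(filename))
     */
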
/*
* Examine the print fmt of the event looking for unsafe dereference
* pointers using %p* that could be recorded in the trace event and
@@ -284,13 +409,14 @@ static bool test_field(const char *fmt, struct trace_event_call *call)
static void test_event_printk(struct trace_event_call *call)
{
u64 dereference_flags = 0;
+ u64 string_flags = 0;
bool first = true;
- const char *fmt, *c, *r, *a;
+ const char *fmt;
int parens = 0;
char in_quote = 0;
int start_arg = 0;
int arg = 0;
- int i;
+ int i, e;
fmt = call->print_fmt;
@@ -374,8 +500,16 @@ static void test_event_printk(struct trace_event_call *call)
star = true;
continue;
}
- if ((fmt[i + j] == 's') && star)
- arg++;
+ if ((fmt[i + j] == 's')) {
+ if (star)
+ arg++;
+ if (WARN_ONCE(arg == 63,
+ "Too many args for event: %s",
+ trace_event_name(call)))
+ return;
+ dereference_flags |= 1ULL << arg;
+ string_flags |= 1ULL << arg;
+ }
break;
}
break;
@@ -403,42 +537,47 @@ static void test_event_printk(struct trace_event_call *call)
case ',':
if (in_quote || parens)
continue;
+ e = i;
i++;
while (isspace(fmt[i]))
i++;
- start_arg = i;
- if (!(dereference_flags & (1ULL << arg)))
- goto next_arg;
- /* Find the REC-> in the argument */
- c = strchr(fmt + i, ',');
- r = strstr(fmt + i, "REC->");
- if (r && (!c || r < c)) {
- /*
- * Addresses of events on the buffer,
- * or an array on the buffer is
- * OK to dereference.
- * There's ways to fool this, but
- * this is to catch common mistakes,
- * not malicious code.
- */
- a = strchr(fmt + i, '&');
- if ((a && (a < r)) || test_field(r, call))
+ /*
+ * If start_arg is zero, then this is the start of the
+ * first argument. The processing of the argument happens
+ * when the end of the argument is found, as it needs to
+ * handle parentheses and such.
+ */
+ if (!start_arg) {
+ start_arg = i;
+ /* Balance out the i++ in the for loop */
+ i--;
+ continue;
+ }
+
+ if (dereference_flags & (1ULL << arg)) {
+ if (string_flags & (1ULL << arg)) {
+ if (process_string(fmt + start_arg, e - start_arg, call))
+ dereference_flags &= ~(1ULL << arg);
+ } else if (process_pointer(fmt + start_arg, e - start_arg, call))
dereference_flags &= ~(1ULL << arg);
- } else if ((r = strstr(fmt + i, "__get_dynamic_array(")) &&
- (!c || r < c)) {
- dereference_flags &= ~(1ULL << arg);
- } else if ((r = strstr(fmt + i, "__get_sockaddr(")) &&
- (!c || r < c)) {
- dereference_flags &= ~(1ULL << arg);
}
- next_arg:
- i--;
+ start_arg = i;
arg++;
+ /* Balance out the i++ in the for loop */
+ i--;
}
}
+ if (dereference_flags & (1ULL << arg)) {
+ if (string_flags & (1ULL << arg)) {
+ if (process_string(fmt + start_arg, i - start_arg, call))
+ dereference_flags &= ~(1ULL << arg);
+ } else if (process_pointer(fmt + start_arg, i - start_arg, call))
+ dereference_flags &= ~(1ULL << arg);
+ }
+
/*
* If you triggered the below warning, the trace event reported
* uses an unsafe dereference pointer %p*. As the data stored
@@ -730,6 +869,120 @@ static int ftrace_event_enable_disable(struct trace_event_file *file,
return __ftrace_event_enable_disable(file, enable, 0);
}
+#ifdef CONFIG_MODULES
+struct event_mod_load {
+ struct list_head list;
+ char *module;
+ char *match;
+ char *system;
+ char *event;
+};
+
+static void free_event_mod(struct event_mod_load *event_mod)
+{
+ list_del(&event_mod->list);
+ kfree(event_mod->module);
+ kfree(event_mod->match);
+ kfree(event_mod->system);
+ kfree(event_mod->event);
+ kfree(event_mod);
+}
+
+static void clear_mod_events(struct trace_array *tr)
+{
+ struct event_mod_load *event_mod, *n;
+
+ list_for_each_entry_safe(event_mod, n, &tr->mod_events, list) {
+ free_event_mod(event_mod);
+ }
+}
+
+static int remove_cache_mod(struct trace_array *tr, const char *mod,
+ const char *match, const char *system, const char *event)
+{
+ struct event_mod_load *event_mod, *n;
+ int ret = -EINVAL;
+
+ list_for_each_entry_safe(event_mod, n, &tr->mod_events, list) {
+ if (strcmp(event_mod->module, mod) != 0)
+ continue;
+
+ if (match && strcmp(event_mod->match, match) != 0)
+ continue;
+
+ if (system &&
+ (!event_mod->system || strcmp(event_mod->system, system) != 0))
+ continue;
+
+ if (event &&
+ (!event_mod->event || strcmp(event_mod->event, event) != 0))
+ continue;
+
+ free_event_mod(event_mod);
+ ret = 0;
+ }
+
+ return ret;
+}
+
+static int cache_mod(struct trace_array *tr, const char *mod, int set,
+ const char *match, const char *system, const char *event)
+{
+ struct event_mod_load *event_mod;
+
+ /* If the module exists, then this just failed to find an event */
+ if (module_exists(mod))
+ return -EINVAL;
+
+ /* See if this is to remove a cached filter */
+ if (!set)
+ return remove_cache_mod(tr, mod, match, system, event);
+
+ event_mod = kzalloc(sizeof(*event_mod), GFP_KERNEL);
+ if (!event_mod)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&event_mod->list);
+ event_mod->module = kstrdup(mod, GFP_KERNEL);
+ if (!event_mod->module)
+ goto out_free;
+
+ if (match) {
+ event_mod->match = kstrdup(match, GFP_KERNEL);
+ if (!event_mod->match)
+ goto out_free;
+ }
+
+ if (system) {
+ event_mod->system = kstrdup(system, GFP_KERNEL);
+ if (!event_mod->system)
+ goto out_free;
+ }
+
+ if (event) {
+ event_mod->event = kstrdup(event, GFP_KERNEL);
+ if (!event_mod->event)
+ goto out_free;
+ }
+
+ list_add(&event_mod->list, &tr->mod_events);
+
+ return 0;
+
+ out_free:
+ free_event_mod(event_mod);
+
+ return -ENOMEM;
+}
+#else /* CONFIG_MODULES */
+static inline void clear_mod_events(struct trace_array *tr) { }
+static int cache_mod(struct trace_array *tr, const char *mod, int set,
+ const char *match, const char *system, const char *event)
+{
+ return -EINVAL;
+}
+#endif
+
static void ftrace_clear_events(struct trace_array *tr)
{
struct trace_event_file *file;
@@ -738,6 +991,7 @@ static void ftrace_clear_events(struct trace_array *tr)
list_for_each_entry(file, &tr->events, list) {
ftrace_event_enable_disable(file, 0);
}
+ clear_mod_events(tr);
mutex_unlock(&event_mutex);
}
@@ -1026,17 +1280,36 @@ static void remove_event_file_dir(struct trace_event_file *file)
*/
static int
__ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match,
- const char *sub, const char *event, int set)
+ const char *sub, const char *event, int set,
+ const char *mod)
{
struct trace_event_file *file;
struct trace_event_call *call;
+ char *module __free(kfree) = NULL;
const char *name;
int ret = -EINVAL;
int eret = 0;
+ if (mod) {
+ char *p;
+
+ module = kstrdup(mod, GFP_KERNEL);
+ if (!module)
+ return -ENOMEM;
+
+ /* Replace all '-' with '_' as that's what modules do */
+ for (p = strchr(module, '-'); p; p = strchr(p + 1, '-'))
+ *p = '_';
+ }
+
list_for_each_entry(file, &tr->events, list) {
call = file->event_call;
+
+ /* If a module is specified, skip events that do not belong to that module */
+ if (module && (!call->module || strcmp(module_name(call->module), module)))
+ continue;
+
name = trace_event_name(call);
if (!name || !call->class || !call->class->reg)
@@ -1069,16 +1342,24 @@ __ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match,
ret = eret;
}
+ /*
+ * If this is a module setting and nothing was found,
+ * check if the module was loaded. If it wasn't, cache it.
+ */
+ if (module && ret == -EINVAL && !eret)
+ ret = cache_mod(tr, module, set, match, sub, event);
+
return ret;
}
static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
- const char *sub, const char *event, int set)
+ const char *sub, const char *event, int set,
+ const char *mod)
{
int ret;
mutex_lock(&event_mutex);
- ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set);
+ ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set, mod);
mutex_unlock(&event_mutex);
return ret;
@@ -1086,11 +1367,20 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
{
- char *event = NULL, *sub = NULL, *match;
+ char *event = NULL, *sub = NULL, *match, *mod;
int ret;
if (!tr)
return -ENOENT;
+
+ /* Module events can be appended with :mod:<module> */
+ mod = strstr(buf, ":mod:");
+ if (mod) {
+ *mod = '\0';
+ /* move to the module name */
+ mod += 5;
+ }
+
/*
* The buf format can be <subsystem>:<event-name>
* *:<event-name> means any event by that name.
@@ -1113,9 +1403,13 @@ int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
sub = NULL;
if (!strlen(event) || strcmp(event, "*") == 0)
event = NULL;
+ } else if (mod) {
+ /* Treat an empty match or '*' as a wildcard */
+ if (!strlen(match) || strcmp(match, "*") == 0)
+ match = NULL;
}
- ret = __ftrace_set_clr_event(tr, match, sub, event, set);
+ ret = __ftrace_set_clr_event(tr, match, sub, event, set, mod);
/* Put back the colon to allow this to be called again */
if (buf)
@@ -1143,7 +1437,7 @@ int trace_set_clr_event(const char *system, const char *event, int set)
if (!tr)
return -ENODEV;
- return __ftrace_set_clr_event(tr, NULL, system, event, set);
+ return __ftrace_set_clr_event(tr, NULL, system, event, set, NULL);
}
EXPORT_SYMBOL_GPL(trace_set_clr_event);
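
A hedged user-space sketch of the new :mod: syntax accepted by set_event; the tracefs path and the module name are illustrative only. If the module is not loaded yet, the request is cached by cache_mod() and applied once the module's events are registered.

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
            const char *path = "/sys/kernel/tracing/set_event";
            /* enable every event provided by the (hypothetical) module my_driver */
            const char *cmd = ":mod:my_driver\n";
            int fd = open(path, O_WRONLY);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            if (write(fd, cmd, strlen(cmd)) < 0)
                    perror("write");
            close(fd);
            return 0;
    }
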
@@ -1169,7 +1463,7 @@ int trace_array_set_clr_event(struct trace_array *tr, const char *system,
return -ENOENT;
set = (enable == true) ? 1 : 0;
- return __ftrace_set_clr_event(tr, NULL, system, event, set);
+ return __ftrace_set_clr_event(tr, NULL, system, event, set, NULL);
}
EXPORT_SYMBOL_GPL(trace_array_set_clr_event);
@@ -1256,37 +1550,78 @@ static void *t_start(struct seq_file *m, loff_t *pos)
return file;
}
+enum set_event_iter_type {
+ SET_EVENT_FILE,
+ SET_EVENT_MOD,
+};
+
+struct set_event_iter {
+ enum set_event_iter_type type;
+ union {
+ struct trace_event_file *file;
+ struct event_mod_load *event_mod;
+ };
+};
+
static void *
s_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct trace_event_file *file = v;
+ struct set_event_iter *iter = v;
+ struct trace_event_file *file;
struct trace_array *tr = m->private;
(*pos)++;
- list_for_each_entry_continue(file, &tr->events, list) {
- if (file->flags & EVENT_FILE_FL_ENABLED)
- return file;
+ if (iter->type == SET_EVENT_FILE) {
+ file = iter->file;
+ list_for_each_entry_continue(file, &tr->events, list) {
+ if (file->flags & EVENT_FILE_FL_ENABLED) {
+ iter->file = file;
+ return iter;
+ }
+ }
+#ifdef CONFIG_MODULES
+ iter->type = SET_EVENT_MOD;
+ iter->event_mod = list_entry(&tr->mod_events, struct event_mod_load, list);
+#endif
}
+#ifdef CONFIG_MODULES
+ list_for_each_entry_continue(iter->event_mod, &tr->mod_events, list)
+ return iter;
+#endif
+
+ /*
+ * The iter is allocated in s_start() and passed via the 'v'
+ * parameter. To stop the iterator, NULL must be returned. But
+ * the return value is what the 'v' parameter in s_stop() receives
+ * and frees. Free iter here as it will no longer be used.
+ */
+ kfree(iter);
return NULL;
}
static void *s_start(struct seq_file *m, loff_t *pos)
{
- struct trace_event_file *file;
struct trace_array *tr = m->private;
+ struct set_event_iter *iter;
loff_t l;
+ iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return NULL;
+
mutex_lock(&event_mutex);
- file = list_entry(&tr->events, struct trace_event_file, list);
+ iter->type = SET_EVENT_FILE;
+ iter->file = list_entry(&tr->events, struct trace_event_file, list);
+
for (l = 0; l <= *pos; ) {
- file = s_next(m, file, &l);
- if (!file)
+ iter = s_next(m, iter, &l);
+ if (!iter)
break;
}
- return file;
+ return iter;
}
static int t_show(struct seq_file *m, void *v)
@@ -1306,6 +1641,45 @@ static void t_stop(struct seq_file *m, void *p)
mutex_unlock(&event_mutex);
}
+#ifdef CONFIG_MODULES
+static int s_show(struct seq_file *m, void *v)
+{
+ struct set_event_iter *iter = v;
+ const char *system;
+ const char *event;
+
+ if (iter->type == SET_EVENT_FILE)
+ return t_show(m, iter->file);
+
+ /* When match is set, system and event are not */
+ if (iter->event_mod->match) {
+ seq_printf(m, "%s:mod:%s\n", iter->event_mod->match,
+ iter->event_mod->module);
+ return 0;
+ }
+
+ system = iter->event_mod->system ? : "*";
+ event = iter->event_mod->event ? : "*";
+
+ seq_printf(m, "%s:%s:mod:%s\n", system, event, iter->event_mod->module);
+
+ return 0;
+}
+#else /* CONFIG_MODULES */
+static int s_show(struct seq_file *m, void *v)
+{
+ struct set_event_iter *iter = v;
+
+ return t_show(m, iter->file);
+}
+#endif
+
+static void s_stop(struct seq_file *m, void *v)
+{
+ kfree(v);
+ t_stop(m, NULL);
+}
+
static void *
__next(struct seq_file *m, void *v, loff_t *pos, int type)
{
@@ -1419,21 +1793,20 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (ret)
return ret;
+ guard(mutex)(&event_mutex);
+
switch (val) {
case 0:
case 1:
- ret = -ENODEV;
- mutex_lock(&event_mutex);
file = event_file_file(filp);
- if (likely(file)) {
- ret = tracing_update_buffers(file->tr);
- if (ret < 0) {
- mutex_unlock(&event_mutex);
- return ret;
- }
- ret = ftrace_event_enable_disable(file, val);
- }
- mutex_unlock(&event_mutex);
+ if (!file)
+ return -ENODEV;
+ ret = tracing_update_buffers(file->tr);
+ if (ret < 0)
+ return ret;
+ ret = ftrace_event_enable_disable(file, val);
+ if (ret < 0)
+ return ret;
break;
default:
@@ -1442,7 +1815,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
*ppos += cnt;
- return ret ? ret : cnt;
+ return cnt;
}
static ssize_t
@@ -1520,7 +1893,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (system)
name = system->name;
- ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val);
+ ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val, NULL);
if (ret)
goto out;
@@ -2018,7 +2391,7 @@ event_pid_write(struct file *filp, const char __user *ubuf,
if (ret < 0)
return ret;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
if (type == TRACE_PIDS) {
filtered_pids = rcu_dereference_protected(tr->filtered_pids,
@@ -2034,7 +2407,7 @@ event_pid_write(struct file *filp, const char __user *ubuf,
ret = trace_pid_write(filtered_pids, &pid_list, ubuf, cnt);
if (ret < 0)
- goto out;
+ return ret;
if (type == TRACE_PIDS)
rcu_assign_pointer(tr->filtered_pids, pid_list);
@@ -2059,11 +2432,7 @@ event_pid_write(struct file *filp, const char __user *ubuf,
*/
on_each_cpu(ignore_task_cpu, tr, 1);
- out:
- mutex_unlock(&event_mutex);
-
- if (ret > 0)
- *ppos += ret;
+ *ppos += ret;
return ret;
}
@@ -2098,8 +2467,8 @@ static const struct seq_operations show_event_seq_ops = {
static const struct seq_operations show_set_event_seq_ops = {
.start = s_start,
.next = s_next,
- .show = t_show,
- .stop = t_stop,
+ .show = s_show,
+ .stop = s_stop,
};
static const struct seq_operations show_set_pid_seq_ops = {
@@ -2471,7 +2840,7 @@ event_define_fields(struct trace_event_call *call)
ret = trace_define_field_ext(call, field->type, field->name,
offset, field->size,
field->is_signed, field->filter_type,
- field->len);
+ field->len, field->needs_test);
if (WARN_ON_ONCE(ret)) {
pr_err("error code is %d\n", ret);
break;
@@ -2972,6 +3341,20 @@ static bool event_in_systems(struct trace_event_call *call,
return !*p || isspace(*p) || *p == ',';
}
+#ifdef CONFIG_HIST_TRIGGERS
+/*
+ * Wake up waiters on the hist_poll_wq from irq_work because the hist trigger
+ * may happen in any context.
+ */
+static void hist_poll_event_irq_work(struct irq_work *work)
+{
+ wake_up_all(&hist_poll_wq);
+}
+
+DEFINE_IRQ_WORK(hist_poll_work, hist_poll_event_irq_work);
+DECLARE_WAIT_QUEUE_HEAD(hist_poll_wq);
+#endif
+
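
hist_poll_wakeup() itself is not part of this hunk; assuming it is the usual small helper pairing with the irq_work and wait queue declared here, it would look roughly like the sketch below. Deferring the wake-up to irq_work matters because the hist trigger can fire in contexts where wake_up_all() is not safe to call directly.

    #include <linux/irq_work.h>
    #include <linux/wait.h>

    /* Declared by the tracing code (see above); repeated here only for the sketch. */
    extern struct irq_work hist_poll_work;
    extern wait_queue_head_t hist_poll_wq;

    /* Assumed implementation of the helper called from event_hist_trigger(). */
    static inline void hist_poll_wakeup(void)
    {
            if (wq_has_sleeper(&hist_poll_wq))
                    irq_work_queue(&hist_poll_work);
    }
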
static struct trace_event_file *
trace_create_new_event(struct trace_event_call *call,
struct trace_array *tr)
@@ -3130,13 +3513,13 @@ int trace_add_event_call(struct trace_event_call *call)
int ret;
lockdep_assert_held(&event_mutex);
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
ret = __register_event(call, NULL);
- if (ret >= 0)
- __add_event_to_tracers(call);
+ if (ret < 0)
+ return ret;
- mutex_unlock(&trace_types_lock);
+ __add_event_to_tracers(call);
return ret;
}
EXPORT_SYMBOL_GPL(trace_add_event_call);
@@ -3216,6 +3599,28 @@ EXPORT_SYMBOL_GPL(trace_remove_event_call);
event++)
#ifdef CONFIG_MODULES
+static void update_mod_cache(struct trace_array *tr, struct module *mod)
+{
+ struct event_mod_load *event_mod, *n;
+
+ list_for_each_entry_safe(event_mod, n, &tr->mod_events, list) {
+ if (strcmp(event_mod->module, mod->name) != 0)
+ continue;
+
+ __ftrace_set_clr_event_nolock(tr, event_mod->match,
+ event_mod->system,
+ event_mod->event, 1, mod->name);
+ free_event_mod(event_mod);
+ }
+}
+
+static void update_cache_events(struct module *mod)
+{
+ struct trace_array *tr;
+
+ list_for_each_entry(tr, &ftrace_trace_arrays, list)
+ update_mod_cache(tr, mod);
+}
static void trace_module_add_events(struct module *mod)
{
@@ -3238,6 +3643,8 @@ static void trace_module_add_events(struct module *mod)
__register_event(*call, mod);
__add_event_to_tracers(*call);
}
+
+ update_cache_events(mod);
}
static void trace_module_remove_events(struct module *mod)
@@ -3390,30 +3797,21 @@ struct trace_event_file *trace_get_event_file(const char *instance,
return ERR_PTR(ret);
}
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
file = find_event_file(tr, system, event);
if (!file) {
trace_array_put(tr);
- ret = -EINVAL;
- goto out;
+ return ERR_PTR(-EINVAL);
}
/* Don't let event modules unload while in use */
ret = trace_event_try_get_ref(file->event_call);
if (!ret) {
trace_array_put(tr);
- ret = -EBUSY;
- goto out;
+ return ERR_PTR(-EBUSY);
}
- ret = 0;
- out:
- mutex_unlock(&event_mutex);
-
- if (ret)
- file = ERR_PTR(ret);
-
return file;
}
EXPORT_SYMBOL_GPL(trace_get_event_file);
@@ -3631,6 +4029,7 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash,
struct trace_event_file *file;
struct ftrace_probe_ops *ops;
struct event_probe_data *data;
+ unsigned long count = -1;
const char *system;
const char *event;
char *number;
@@ -3650,12 +4049,11 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash,
event = strsep(&param, ":");
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
- ret = -EINVAL;
file = find_event_file(tr, system, event);
if (!file)
- goto out;
+ return -EINVAL;
enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
@@ -3664,74 +4062,62 @@ event_enable_func(struct trace_array *tr, struct ftrace_hash *hash,
else
ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops;
- if (glob[0] == '!') {
- ret = unregister_ftrace_function_probe_func(glob+1, tr, ops);
- goto out;
- }
+ if (glob[0] == '!')
+ return unregister_ftrace_function_probe_func(glob+1, tr, ops);
- ret = -ENOMEM;
+ if (param) {
+ number = strsep(&param, ":");
- data = kzalloc(sizeof(*data), GFP_KERNEL);
- if (!data)
- goto out;
+ if (!strlen(number))
+ return -EINVAL;
- data->enable = enable;
- data->count = -1;
- data->file = file;
-
- if (!param)
- goto out_reg;
-
- number = strsep(&param, ":");
-
- ret = -EINVAL;
- if (!strlen(number))
- goto out_free;
-
- /*
- * We use the callback data field (which is a pointer)
- * as our counter.
- */
- ret = kstrtoul(number, 0, &data->count);
- if (ret)
- goto out_free;
+ /*
+ * We use the callback data field (which is a pointer)
+ * as our counter.
+ */
+ ret = kstrtoul(number, 0, &count);
+ if (ret)
+ return ret;
+ }
- out_reg:
/* Don't let event modules unload while probe registered */
ret = trace_event_try_get_ref(file->event_call);
- if (!ret) {
- ret = -EBUSY;
- goto out_free;
- }
+ if (!ret)
+ return -EBUSY;
ret = __ftrace_event_enable_disable(file, 1, 1);
if (ret < 0)
goto out_put;
+ ret = -ENOMEM;
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ goto out_put;
+
+ data->enable = enable;
+ data->count = count;
+ data->file = file;
+
ret = register_ftrace_function_probe(glob, tr, ops, data);
/*
* The above returns on success the # of functions enabled,
* but if it didn't find any functions it returns zero.
* Consider no functions a failure too.
*/
- if (!ret) {
- ret = -ENOENT;
- goto out_disable;
- } else if (ret < 0)
- goto out_disable;
+
/* Just return zero, not the number of enabled functions */
- ret = 0;
- out:
- mutex_unlock(&event_mutex);
- return ret;
+ if (ret > 0)
+ return 0;
+
+ kfree(data);
+
+ if (!ret)
+ ret = -ENOENT;
- out_disable:
__ftrace_event_enable_disable(file, 0, 1);
out_put:
trace_event_put_ref(file->event_call);
- out_free:
- kfree(data);
- goto out;
+ return ret;
}
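
The refactored event_enable_func() backs the enable_event/disable_event function-probe commands written to set_ftrace_filter. For reference, a hedged example of such a command (the function, system, and event names are illustrative only):

    /*
     * Example command, written to /sys/kernel/tracing/set_ftrace_filter:
     *
     *   kfree:enable_event:sched:sched_switch:5
     *
     * When kfree() is hit, the probe registered by event_enable_func() enables
     * sched:sched_switch; the trailing count limits the probe to 5 hits.
     */
    static const char example_probe_cmd[] =
            "kfree:enable_event:sched:sched_switch:5";
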
static struct ftrace_func_command event_enable_cmd = {
@@ -3954,20 +4340,17 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr)
{
int ret;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
ret = create_event_toplevel_files(parent, tr);
if (ret)
- goto out_unlock;
+ return ret;
down_write(&trace_event_sem);
__trace_early_add_event_dirs(tr);
up_write(&trace_event_sem);
- out_unlock:
- mutex_unlock(&event_mutex);
-
- return ret;
+ return 0;
}
/* Must be called with event_mutex held */
@@ -3982,7 +4365,7 @@ int event_trace_del_tracer(struct trace_array *tr)
__ftrace_clear_event_pids(tr, TRACE_PIDS | TRACE_NO_PIDS);
/* Disable any running events */
- __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0);
+ __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0, NULL);
/* Make sure no more events are being executed */
tracepoint_synchronize_unregister();
@@ -4266,7 +4649,7 @@ static __init void event_trace_self_tests(void)
pr_info("Testing event system %s: ", system->name);
- ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1);
+ ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1, NULL);
if (WARN_ON_ONCE(ret)) {
pr_warn("error enabling system %s\n",
system->name);
@@ -4275,7 +4658,7 @@ static __init void event_trace_self_tests(void)
event_test_stuff();
- ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0);
+ ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0, NULL);
if (WARN_ON_ONCE(ret)) {
pr_warn("error disabling system %s\n",
system->name);
@@ -4290,7 +4673,7 @@ static __init void event_trace_self_tests(void)
pr_info("Running tests on all trace events:\n");
pr_info("Testing all events: ");
- ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1);
+ ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1, NULL);
if (WARN_ON_ONCE(ret)) {
pr_warn("error enabling all events\n");
return;
@@ -4299,7 +4682,7 @@ static __init void event_trace_self_tests(void)
event_test_stuff();
/* reset sysname */
- ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
+ ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0, NULL);
if (WARN_ON_ONCE(ret)) {
pr_warn("error disabling all events\n");
return;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 78051de581e78..0993dfc1c5c16 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -2405,13 +2405,11 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
struct event_filter *filter = NULL;
int err = 0;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
/* Make sure the system still has events */
- if (!dir->nr_events) {
- err = -ENODEV;
- goto out_unlock;
- }
+ if (!dir->nr_events)
+ return -ENODEV;
if (!strcmp(strstrip(filter_string), "0")) {
filter_free_subsystem_preds(dir, tr);
@@ -2422,7 +2420,7 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
tracepoint_synchronize_unregister();
filter_free_subsystem_filters(dir, tr);
__free_filter(filter);
- goto out_unlock;
+ return 0;
}
err = create_system_filter(dir, filter_string, &filter);
@@ -2434,8 +2432,6 @@ int apply_subsystem_event_filter(struct trace_subsystem_dir *dir,
__free_filter(system->filter);
system->filter = filter;
}
-out_unlock:
- mutex_unlock(&event_mutex);
return err;
}
@@ -2612,17 +2608,15 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
struct event_filter *filter = NULL;
struct trace_event_call *call;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
call = event->tp_event;
- err = -EINVAL;
if (!call)
- goto out_unlock;
+ return -EINVAL;
- err = -EEXIST;
if (event->filter)
- goto out_unlock;
+ return -EEXIST;
err = create_filter(NULL, call, filter_str, false, &filter);
if (err)
@@ -2637,9 +2631,6 @@ free_filter:
if (err || ftrace_event_is_function(call))
__free_filter(filter);
-out_unlock:
- mutex_unlock(&event_mutex);
-
return err;
}
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 9c058aa8baf33..261163b00137a 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -5311,6 +5311,8 @@ static void event_hist_trigger(struct event_trigger_data *data,
if (resolve_var_refs(hist_data, key, var_ref_vals, true))
hist_trigger_actions(hist_data, elt, buffer, rec, rbe, key, var_ref_vals);
+
+ hist_poll_wakeup();
}
static void hist_trigger_stacktrace_print(struct seq_file *m,
@@ -5590,49 +5592,128 @@ static void hist_trigger_show(struct seq_file *m,
n_entries, (u64)atomic64_read(&hist_data->map->drops));
}
+struct hist_file_data {
+ struct file *file;
+ u64 last_read;
+ u64 last_act;
+};
+
+static u64 get_hist_hit_count(struct trace_event_file *event_file)
+{
+ struct hist_trigger_data *hist_data;
+ struct event_trigger_data *data;
+ u64 ret = 0;
+
+ list_for_each_entry(data, &event_file->triggers, list) {
+ if (data->cmd_ops->trigger_type == ETT_EVENT_HIST) {
+ hist_data = data->private_data;
+ ret += atomic64_read(&hist_data->map->hits);
+ }
+ }
+ return ret;
+}
+
static int hist_show(struct seq_file *m, void *v)
{
+ struct hist_file_data *hist_file = m->private;
struct event_trigger_data *data;
struct trace_event_file *event_file;
- int n = 0, ret = 0;
+ int n = 0;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
- event_file = event_file_file(m->private);
- if (unlikely(!event_file)) {
- ret = -ENODEV;
- goto out_unlock;
- }
+ event_file = event_file_file(hist_file->file);
+ if (unlikely(!event_file))
+ return -ENODEV;
list_for_each_entry(data, &event_file->triggers, list) {
if (data->cmd_ops->trigger_type == ETT_EVENT_HIST)
hist_trigger_show(m, data, n++);
}
+ hist_file->last_read = get_hist_hit_count(event_file);
+ /*
+ * Update last_act too so that poll()/POLLPRI can wait for the next
+ * event after any syscall on hist file.
+ */
+ hist_file->last_act = hist_file->last_read;
+
+ return 0;
+}
+
+static __poll_t event_hist_poll(struct file *file, struct poll_table_struct *wait)
+{
+ struct trace_event_file *event_file;
+ struct seq_file *m = file->private_data;
+ struct hist_file_data *hist_file = m->private;
+ __poll_t ret = 0;
+ u64 cnt;
+
+ guard(mutex)(&event_mutex);
- out_unlock:
- mutex_unlock(&event_mutex);
+ event_file = event_file_data(file);
+ if (!event_file)
+ return EPOLLERR;
+
+ hist_poll_wait(file, wait);
+
+ cnt = get_hist_hit_count(event_file);
+ if (hist_file->last_read != cnt)
+ ret |= EPOLLIN | EPOLLRDNORM;
+ if (hist_file->last_act != cnt) {
+ hist_file->last_act = cnt;
+ ret |= EPOLLPRI;
+ }
return ret;
}
+static int event_hist_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *m = file->private_data;
+ struct hist_file_data *hist_file = m->private;
+
+ kfree(hist_file);
+ return tracing_single_release_file_tr(inode, file);
+}
+
static int event_hist_open(struct inode *inode, struct file *file)
{
+ struct trace_event_file *event_file;
+ struct hist_file_data *hist_file;
int ret;
ret = tracing_open_file_tr(inode, file);
if (ret)
return ret;
+ guard(mutex)(&event_mutex);
+
+ event_file = event_file_data(file);
+ if (!event_file)
+ return -ENODEV;
+
+ hist_file = kzalloc(sizeof(*hist_file), GFP_KERNEL);
+ if (!hist_file)
+ return -ENOMEM;
+
+ hist_file->file = file;
+ hist_file->last_act = get_hist_hit_count(event_file);
+
/* Clear private_data to avoid warning in single_open() */
file->private_data = NULL;
- return single_open(file, hist_show, file);
+ ret = single_open(file, hist_show, hist_file);
+ if (ret)
+ kfree(hist_file);
+
+ return ret;
}
const struct file_operations event_hist_fops = {
.open = event_hist_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = tracing_single_release_file_tr,
+ .release = event_hist_release,
+ .poll = event_hist_poll,
};
#ifdef CONFIG_HIST_TRIGGERS_DEBUG
@@ -5873,25 +5954,19 @@ static int hist_debug_show(struct seq_file *m, void *v)
{
struct event_trigger_data *data;
struct trace_event_file *event_file;
- int n = 0, ret = 0;
+ int n = 0;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
event_file = event_file_file(m->private);
- if (unlikely(!event_file)) {
- ret = -ENODEV;
- goto out_unlock;
- }
+ if (unlikely(!event_file))
+ return -ENODEV;
list_for_each_entry(data, &event_file->triggers, list) {
if (data->cmd_ops->trigger_type == ETT_EVENT_HIST)
hist_trigger_debug_show(m, data, n++);
}
-
- out_unlock:
- mutex_unlock(&event_mutex);
-
- return ret;
+ return 0;
}
static int event_hist_debug_open(struct inode *inode, struct file *file)
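
With the .poll handler added above, user space can block on a hist file instead of re-reading it: EPOLLIN/EPOLLRDNORM is reported when the histogram has new hits since the last read, and EPOLLPRI when any hit occurred since the last poll. A hedged user-space sketch; the sched_switch path is only an example of an event carrying a hist trigger:

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path =
		"/sys/kernel/tracing/events/sched/sched_switch/hist";
	struct pollfd pfd = { .events = POLLIN | POLLPRI };
	char buf[4096];
	ssize_t n;

	pfd.fd = open(path, O_RDONLY);
	if (pfd.fd < 0) {
		perror("open");
		return 1;
	}

	/* Blocks until the histogram records a new hit. */
	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & (POLLIN | POLLPRI))) {
		while ((n = read(pfd.fd, buf, sizeof(buf))) > 0)
			fwrite(buf, 1, n, stdout);
	}
	close(pfd.fd);
	return 0;
}
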
diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c
index c82b401a294d9..e3f7d09e55120 100644
--- a/kernel/trace/trace_events_synth.c
+++ b/kernel/trace/trace_events_synth.c
@@ -49,16 +49,11 @@ static char *last_cmd;
static int errpos(const char *str)
{
- int ret = 0;
-
- mutex_lock(&lastcmd_mutex);
+ guard(mutex)(&lastcmd_mutex);
if (!str || !last_cmd)
- goto out;
+ return 0;
- ret = err_pos(last_cmd, str);
- out:
- mutex_unlock(&lastcmd_mutex);
- return ret;
+ return err_pos(last_cmd, str);
}
static void last_cmd_set(const char *str)
@@ -74,14 +69,12 @@ static void last_cmd_set(const char *str)
static void synth_err(u8 err_type, u16 err_pos)
{
- mutex_lock(&lastcmd_mutex);
+ guard(mutex)(&lastcmd_mutex);
if (!last_cmd)
- goto out;
+ return;
tracing_log_err(NULL, "synthetic_events", last_cmd, err_text,
err_type, err_pos);
- out:
- mutex_unlock(&lastcmd_mutex);
}
static int create_synth_event(const char *raw_command);
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index a5e3d6acf1e1e..d454489470946 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -211,12 +211,10 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file)
if (ret)
return ret;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
- if (unlikely(!event_file_file(file))) {
- mutex_unlock(&event_mutex);
+ if (unlikely(!event_file_file(file)))
return -ENODEV;
- }
if ((file->f_mode & FMODE_WRITE) &&
(file->f_flags & O_TRUNC)) {
@@ -239,8 +237,6 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file)
}
}
- mutex_unlock(&event_mutex);
-
return ret;
}
@@ -248,7 +244,6 @@ int trigger_process_regex(struct trace_event_file *file, char *buff)
{
char *command, *next;
struct event_command *p;
- int ret = -EINVAL;
next = buff = skip_spaces(buff);
command = strsep(&next, ": \t");
@@ -259,17 +254,14 @@ int trigger_process_regex(struct trace_event_file *file, char *buff)
}
command = (command[0] != '!') ? command : command + 1;
- mutex_lock(&trigger_cmd_mutex);
+ guard(mutex)(&trigger_cmd_mutex);
+
list_for_each_entry(p, &trigger_commands, list) {
- if (strcmp(p->name, command) == 0) {
- ret = p->parse(p, file, buff, command, next);
- goto out_unlock;
- }
+ if (strcmp(p->name, command) == 0)
+ return p->parse(p, file, buff, command, next);
}
- out_unlock:
- mutex_unlock(&trigger_cmd_mutex);
- return ret;
+ return -EINVAL;
}
static ssize_t event_trigger_regex_write(struct file *file,
@@ -278,7 +270,7 @@ static ssize_t event_trigger_regex_write(struct file *file,
{
struct trace_event_file *event_file;
ssize_t ret;
- char *buf;
+ char *buf __free(kfree) = NULL;
if (!cnt)
return 0;
@@ -292,24 +284,18 @@ static ssize_t event_trigger_regex_write(struct file *file,
strim(buf);
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
+
event_file = event_file_file(file);
- if (unlikely(!event_file)) {
- mutex_unlock(&event_mutex);
- kfree(buf);
+ if (unlikely(!event_file))
return -ENODEV;
- }
- ret = trigger_process_regex(event_file, buf);
- mutex_unlock(&event_mutex);
- kfree(buf);
+ ret = trigger_process_regex(event_file, buf);
if (ret < 0)
- goto out;
+ return ret;
*ppos += cnt;
- ret = cnt;
- out:
- return ret;
+ return cnt;
}
static int event_trigger_regex_release(struct inode *inode, struct file *file)
@@ -359,20 +345,16 @@ const struct file_operations event_trigger_fops = {
__init int register_event_command(struct event_command *cmd)
{
struct event_command *p;
- int ret = 0;
- mutex_lock(&trigger_cmd_mutex);
+ guard(mutex)(&trigger_cmd_mutex);
+
list_for_each_entry(p, &trigger_commands, list) {
- if (strcmp(cmd->name, p->name) == 0) {
- ret = -EBUSY;
- goto out_unlock;
- }
+ if (strcmp(cmd->name, p->name) == 0)
+ return -EBUSY;
}
list_add(&cmd->list, &trigger_commands);
- out_unlock:
- mutex_unlock(&trigger_cmd_mutex);
- return ret;
+ return 0;
}
/*
@@ -382,20 +364,17 @@ __init int register_event_command(struct event_command *cmd)
__init int unregister_event_command(struct event_command *cmd)
{
struct event_command *p, *n;
- int ret = -ENODEV;
- mutex_lock(&trigger_cmd_mutex);
+ guard(mutex)(&trigger_cmd_mutex);
+
list_for_each_entry_safe(p, n, &trigger_commands, list) {
if (strcmp(cmd->name, p->name) == 0) {
- ret = 0;
list_del_init(&p->list);
- goto out_unlock;
+ return 0;
}
}
- out_unlock:
- mutex_unlock(&trigger_cmd_mutex);
- return ret;
+ return -ENODEV;
}
/**
diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c
index 17bcad8f79de7..97325fbd62836 100644
--- a/kernel/trace/trace_events_user.c
+++ b/kernel/trace/trace_events_user.c
@@ -2899,7 +2899,7 @@ static int set_max_user_events_sysctl(const struct ctl_table *table, int write,
return ret;
}
-static struct ctl_table user_event_sysctls[] = {
+static const struct ctl_table user_event_sysctls[] = {
{
.procname = "user_events_max",
.data = &max_user_events,
diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c
index c62d1629cffec..b8f3c4ba309b6 100644
--- a/kernel/trace/trace_fprobe.c
+++ b/kernel/trace/trace_fprobe.c
@@ -134,7 +134,7 @@ static int
process_fetch_insn(struct fetch_insn *code, void *rec, void *edata,
void *dest, void *base)
{
- struct pt_regs *regs = rec;
+ struct ftrace_regs *fregs = rec;
unsigned long val;
int ret;
@@ -142,17 +142,17 @@ retry:
/* 1st stage: get value from context */
switch (code->op) {
case FETCH_OP_STACK:
- val = regs_get_kernel_stack_nth(regs, code->param);
+ val = ftrace_regs_get_kernel_stack_nth(fregs, code->param);
break;
case FETCH_OP_STACKP:
- val = kernel_stack_pointer(regs);
+ val = ftrace_regs_get_stack_pointer(fregs);
break;
case FETCH_OP_RETVAL:
- val = regs_return_value(regs);
+ val = ftrace_regs_get_return_value(fregs);
break;
#ifdef CONFIG_HAVE_FUNCTION_ARG_ACCESS_API
case FETCH_OP_ARG:
- val = regs_get_kernel_argument(regs, code->param);
+ val = ftrace_regs_get_argument(fregs, code->param);
break;
case FETCH_OP_EDATA:
val = *(unsigned long *)((unsigned long)edata + code->offset);
@@ -175,7 +175,7 @@ NOKPROBE_SYMBOL(process_fetch_insn)
/* function entry handler */
static nokprobe_inline void
__fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
- struct pt_regs *regs,
+ struct ftrace_regs *fregs,
struct trace_event_file *trace_file)
{
struct fentry_trace_entry_head *entry;
@@ -189,41 +189,71 @@ __fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
if (trace_trigger_soft_disabled(trace_file))
return;
- dsize = __get_data_size(&tf->tp, regs, NULL);
+ dsize = __get_data_size(&tf->tp, fregs, NULL);
entry = trace_event_buffer_reserve(&fbuffer, trace_file,
sizeof(*entry) + tf->tp.size + dsize);
if (!entry)
return;
- fbuffer.regs = regs;
+ fbuffer.regs = ftrace_get_regs(fregs);
entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
entry->ip = entry_ip;
- store_trace_args(&entry[1], &tf->tp, regs, NULL, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tf->tp, fregs, NULL, sizeof(*entry), dsize);
trace_event_buffer_commit(&fbuffer);
}
static void
fentry_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
- struct pt_regs *regs)
+ struct ftrace_regs *fregs)
{
struct event_file_link *link;
trace_probe_for_each_link_rcu(link, &tf->tp)
- __fentry_trace_func(tf, entry_ip, regs, link->file);
+ __fentry_trace_func(tf, entry_ip, fregs, link->file);
}
NOKPROBE_SYMBOL(fentry_trace_func);
+static nokprobe_inline
+void store_fprobe_entry_data(void *edata, struct trace_probe *tp, struct ftrace_regs *fregs)
+{
+ struct probe_entry_arg *earg = tp->entry_arg;
+ unsigned long val = 0;
+ int i;
+
+ if (!earg)
+ return;
+
+ for (i = 0; i < earg->size; i++) {
+ struct fetch_insn *code = &earg->code[i];
+
+ switch (code->op) {
+ case FETCH_OP_ARG:
+ val = ftrace_regs_get_argument(fregs, code->param);
+ break;
+ case FETCH_OP_ST_EDATA:
+ *(unsigned long *)((unsigned long)edata + code->offset) = val;
+ break;
+ case FETCH_OP_END:
+ goto end;
+ default:
+ break;
+ }
+ }
+end:
+ return;
+}
+
/* function exit handler */
static int trace_fprobe_entry_handler(struct fprobe *fp, unsigned long entry_ip,
- unsigned long ret_ip, struct pt_regs *regs,
+ unsigned long ret_ip, struct ftrace_regs *fregs,
void *entry_data)
{
struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp);
if (tf->tp.entry_arg)
- store_trace_entry_data(entry_data, &tf->tp, regs);
+ store_fprobe_entry_data(entry_data, &tf->tp, fregs);
return 0;
}
@@ -231,7 +261,7 @@ NOKPROBE_SYMBOL(trace_fprobe_entry_handler)
static nokprobe_inline void
__fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
- unsigned long ret_ip, struct pt_regs *regs,
+ unsigned long ret_ip, struct ftrace_regs *fregs,
void *entry_data, struct trace_event_file *trace_file)
{
struct fexit_trace_entry_head *entry;
@@ -245,60 +275,63 @@ __fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
if (trace_trigger_soft_disabled(trace_file))
return;
- dsize = __get_data_size(&tf->tp, regs, entry_data);
+ dsize = __get_data_size(&tf->tp, fregs, entry_data);
entry = trace_event_buffer_reserve(&fbuffer, trace_file,
sizeof(*entry) + tf->tp.size + dsize);
if (!entry)
return;
- fbuffer.regs = regs;
+ fbuffer.regs = ftrace_get_regs(fregs);
entry = fbuffer.entry = ring_buffer_event_data(fbuffer.event);
entry->func = entry_ip;
entry->ret_ip = ret_ip;
- store_trace_args(&entry[1], &tf->tp, regs, entry_data, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tf->tp, fregs, entry_data, sizeof(*entry), dsize);
trace_event_buffer_commit(&fbuffer);
}
static void
fexit_trace_func(struct trace_fprobe *tf, unsigned long entry_ip,
- unsigned long ret_ip, struct pt_regs *regs, void *entry_data)
+ unsigned long ret_ip, struct ftrace_regs *fregs, void *entry_data)
{
struct event_file_link *link;
trace_probe_for_each_link_rcu(link, &tf->tp)
- __fexit_trace_func(tf, entry_ip, ret_ip, regs, entry_data, link->file);
+ __fexit_trace_func(tf, entry_ip, ret_ip, fregs, entry_data, link->file);
}
NOKPROBE_SYMBOL(fexit_trace_func);
#ifdef CONFIG_PERF_EVENTS
static int fentry_perf_func(struct trace_fprobe *tf, unsigned long entry_ip,
- struct pt_regs *regs)
+ struct ftrace_regs *fregs)
{
struct trace_event_call *call = trace_probe_event_call(&tf->tp);
struct fentry_trace_entry_head *entry;
struct hlist_head *head;
int size, __size, dsize;
+ struct pt_regs *regs;
int rctx;
head = this_cpu_ptr(call->perf_events);
if (hlist_empty(head))
return 0;
- dsize = __get_data_size(&tf->tp, regs, NULL);
+ dsize = __get_data_size(&tf->tp, fregs, NULL);
__size = sizeof(*entry) + tf->tp.size + dsize;
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- entry = perf_trace_buf_alloc(size, NULL, &rctx);
+ entry = perf_trace_buf_alloc(size, &regs, &rctx);
if (!entry)
return 0;
+ regs = ftrace_fill_perf_regs(fregs, regs);
+
entry->ip = entry_ip;
memset(&entry[1], 0, dsize);
- store_trace_args(&entry[1], &tf->tp, regs, NULL, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tf->tp, fregs, NULL, sizeof(*entry), dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
return 0;
@@ -307,31 +340,34 @@ NOKPROBE_SYMBOL(fentry_perf_func);
static void
fexit_perf_func(struct trace_fprobe *tf, unsigned long entry_ip,
- unsigned long ret_ip, struct pt_regs *regs,
+ unsigned long ret_ip, struct ftrace_regs *fregs,
void *entry_data)
{
struct trace_event_call *call = trace_probe_event_call(&tf->tp);
struct fexit_trace_entry_head *entry;
struct hlist_head *head;
int size, __size, dsize;
+ struct pt_regs *regs;
int rctx;
head = this_cpu_ptr(call->perf_events);
if (hlist_empty(head))
return;
- dsize = __get_data_size(&tf->tp, regs, entry_data);
+ dsize = __get_data_size(&tf->tp, fregs, entry_data);
__size = sizeof(*entry) + tf->tp.size + dsize;
size = ALIGN(__size + sizeof(u32), sizeof(u64));
size -= sizeof(u32);
- entry = perf_trace_buf_alloc(size, NULL, &rctx);
+ entry = perf_trace_buf_alloc(size, &regs, &rctx);
if (!entry)
return;
+ regs = ftrace_fill_perf_regs(fregs, regs);
+
entry->func = entry_ip;
entry->ret_ip = ret_ip;
- store_trace_args(&entry[1], &tf->tp, regs, entry_data, sizeof(*entry), dsize);
+ store_trace_args(&entry[1], &tf->tp, fregs, entry_data, sizeof(*entry), dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
head, NULL);
}
@@ -339,33 +375,34 @@ NOKPROBE_SYMBOL(fexit_perf_func);
#endif /* CONFIG_PERF_EVENTS */
static int fentry_dispatcher(struct fprobe *fp, unsigned long entry_ip,
- unsigned long ret_ip, struct pt_regs *regs,
+ unsigned long ret_ip, struct ftrace_regs *fregs,
void *entry_data)
{
struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp);
int ret = 0;
if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE))
- fentry_trace_func(tf, entry_ip, regs);
+ fentry_trace_func(tf, entry_ip, fregs);
+
#ifdef CONFIG_PERF_EVENTS
if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE))
- ret = fentry_perf_func(tf, entry_ip, regs);
+ ret = fentry_perf_func(tf, entry_ip, fregs);
#endif
return ret;
}
NOKPROBE_SYMBOL(fentry_dispatcher);
static void fexit_dispatcher(struct fprobe *fp, unsigned long entry_ip,
- unsigned long ret_ip, struct pt_regs *regs,
+ unsigned long ret_ip, struct ftrace_regs *fregs,
void *entry_data)
{
struct trace_fprobe *tf = container_of(fp, struct trace_fprobe, fp);
if (trace_probe_test_flag(&tf->tp, TP_FLAG_TRACE))
- fexit_trace_func(tf, entry_ip, ret_ip, regs, entry_data);
+ fexit_trace_func(tf, entry_ip, ret_ip, fregs, entry_data);
#ifdef CONFIG_PERF_EVENTS
if (trace_probe_test_flag(&tf->tp, TP_FLAG_PROFILE))
- fexit_perf_func(tf, entry_ip, ret_ip, regs, entry_data);
+ fexit_perf_func(tf, entry_ip, ret_ip, fregs, entry_data);
#endif
}
NOKPROBE_SYMBOL(fexit_dispatcher);
@@ -379,6 +416,9 @@ static void free_trace_fprobe(struct trace_fprobe *tf)
}
}
+/* Since alloc_trace_fprobe() can return error, check the pointer is ERR too. */
+DEFINE_FREE(free_trace_fprobe, struct trace_fprobe *, if (!IS_ERR_OR_NULL(_T)) free_trace_fprobe(_T))
+
/*
* Allocate new trace_probe and initialize it (including fprobe).
*/
@@ -387,10 +427,9 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group,
const char *symbol,
struct tracepoint *tpoint,
struct module *mod,
- int maxactive,
int nargs, bool is_return)
{
- struct trace_fprobe *tf;
+ struct trace_fprobe *tf __free(free_trace_fprobe) = NULL;
int ret = -ENOMEM;
tf = kzalloc(struct_size(tf, tp.args, nargs), GFP_KERNEL);
@@ -399,7 +438,7 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group,
tf->symbol = kstrdup(symbol, GFP_KERNEL);
if (!tf->symbol)
- goto error;
+ return ERR_PTR(-ENOMEM);
if (is_return)
tf->fp.exit_handler = fexit_dispatcher;
@@ -408,17 +447,13 @@ static struct trace_fprobe *alloc_trace_fprobe(const char *group,
tf->tpoint = tpoint;
tf->mod = mod;
- tf->fp.nr_maxactive = maxactive;
ret = trace_probe_init(&tf->tp, event, group, false, nargs);
if (ret < 0)
- goto error;
+ return ERR_PTR(ret);
dyn_event_init(&tf->devent, &trace_fprobe_ops);
- return tf;
-error:
- free_trace_fprobe(tf);
- return ERR_PTR(ret);
+ return_ptr(tf);
}
static struct trace_fprobe *find_trace_fprobe(const char *event,
@@ -845,14 +880,12 @@ static int register_trace_fprobe(struct trace_fprobe *tf)
struct trace_fprobe *old_tf;
int ret;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
old_tf = find_trace_fprobe(trace_probe_name(&tf->tp),
trace_probe_group_name(&tf->tp));
- if (old_tf) {
- ret = append_trace_fprobe(tf, old_tf);
- goto end;
- }
+ if (old_tf)
+ return append_trace_fprobe(tf, old_tf);
/* Register new event */
ret = register_fprobe_event(tf);
@@ -862,7 +895,7 @@ static int register_trace_fprobe(struct trace_fprobe *tf)
trace_probe_log_err(0, EVENT_EXIST);
} else
pr_warn("Failed to register probe event(%d)\n", ret);
- goto end;
+ return ret;
}
/* Register fprobe */
@@ -872,8 +905,6 @@ static int register_trace_fprobe(struct trace_fprobe *tf)
else
dyn_event_add(&tf->devent, trace_probe_event_call(&tf->tp));
-end:
- mutex_unlock(&event_mutex);
return ret;
}
@@ -1034,7 +1065,10 @@ static int parse_symbol_and_return(int argc, const char *argv[],
return 0;
}
-static int __trace_fprobe_create(int argc, const char *argv[])
+DEFINE_FREE(module_put, struct module *, if (_T) module_put(_T))
+
+static int trace_fprobe_create_internal(int argc, const char *argv[],
+ struct traceprobe_parse_context *ctx)
{
/*
* Argument syntax:
@@ -1060,24 +1094,20 @@ static int __trace_fprobe_create(int argc, const char *argv[])
* Type of args:
* FETCHARG:TYPE : use TYPE instead of unsigned long.
*/
- struct trace_fprobe *tf = NULL;
- int i, len, new_argc = 0, ret = 0;
+ struct trace_fprobe *tf __free(free_trace_fprobe) = NULL;
+ int i, new_argc = 0, ret = 0;
bool is_return = false;
- char *symbol = NULL;
+ char *symbol __free(kfree) = NULL;
const char *event = NULL, *group = FPROBE_EVENT_SYSTEM;
- const char **new_argv = NULL;
- int maxactive = 0;
+ const char **new_argv __free(kfree) = NULL;
char buf[MAX_EVENT_NAME_LEN];
char gbuf[MAX_EVENT_NAME_LEN];
char sbuf[KSYM_NAME_LEN];
char abuf[MAX_BTF_ARGS_LEN];
- char *dbuf = NULL;
+ char *dbuf __free(kfree) = NULL;
bool is_tracepoint = false;
- struct module *tp_mod = NULL;
+ struct module *tp_mod __free(module_put) = NULL;
struct tracepoint *tpoint = NULL;
- struct traceprobe_parse_context ctx = {
- .flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE,
- };
if ((argv[0][0] != 'f' && argv[0][0] != 't') || argc < 2)
return -ECANCELED;
@@ -1087,35 +1117,13 @@ static int __trace_fprobe_create(int argc, const char *argv[])
group = TRACEPOINT_EVENT_SYSTEM;
}
- trace_probe_log_init("trace_fprobe", argc, argv);
-
- event = strchr(&argv[0][1], ':');
- if (event)
- event++;
-
- if (isdigit(argv[0][1])) {
- if (event)
- len = event - &argv[0][1] - 1;
- else
- len = strlen(&argv[0][1]);
- if (len > MAX_EVENT_NAME_LEN - 1) {
- trace_probe_log_err(1, BAD_MAXACT);
- goto parse_error;
- }
- memcpy(buf, &argv[0][1], len);
- buf[len] = '\0';
- ret = kstrtouint(buf, 0, &maxactive);
- if (ret || !maxactive) {
+ if (argv[0][1] != '\0') {
+ if (argv[0][1] != ':') {
+ trace_probe_log_set_index(0);
trace_probe_log_err(1, BAD_MAXACT);
- goto parse_error;
- }
- /* fprobe rethook instances are iterated over via a list. The
- * maximum should stay reasonable.
- */
- if (maxactive > RETHOOK_MAXACTIVE_MAX) {
- trace_probe_log_err(1, MAXACT_TOO_BIG);
- goto parse_error;
+ return -EINVAL;
}
+ event = &argv[0][2];
}
trace_probe_log_set_index(1);
@@ -1123,20 +1131,14 @@ static int __trace_fprobe_create(int argc, const char *argv[])
/* a symbol(or tracepoint) must be specified */
ret = parse_symbol_and_return(argc, argv, &symbol, &is_return, is_tracepoint);
if (ret < 0)
- goto parse_error;
-
- if (!is_return && maxactive) {
- trace_probe_log_set_index(0);
- trace_probe_log_err(1, BAD_MAXACT_TYPE);
- goto parse_error;
- }
+ return -EINVAL;
trace_probe_log_set_index(0);
if (event) {
ret = traceprobe_parse_event_name(&event, &group, gbuf,
event - argv[0]);
if (ret)
- goto parse_error;
+ return -EINVAL;
}
if (!event) {
@@ -1152,67 +1154,62 @@ static int __trace_fprobe_create(int argc, const char *argv[])
}
if (is_return)
- ctx.flags |= TPARG_FL_RETURN;
+ ctx->flags |= TPARG_FL_RETURN;
else
- ctx.flags |= TPARG_FL_FENTRY;
+ ctx->flags |= TPARG_FL_FENTRY;
if (is_tracepoint) {
- ctx.flags |= TPARG_FL_TPOINT;
+ ctx->flags |= TPARG_FL_TPOINT;
tpoint = find_tracepoint(symbol, &tp_mod);
if (tpoint) {
- ctx.funcname = kallsyms_lookup(
+ ctx->funcname = kallsyms_lookup(
(unsigned long)tpoint->probestub,
NULL, NULL, NULL, sbuf);
} else if (IS_ENABLED(CONFIG_MODULES)) {
/* This *may* be loaded afterwards */
tpoint = TRACEPOINT_STUB;
- ctx.funcname = symbol;
+ ctx->funcname = symbol;
} else {
trace_probe_log_set_index(1);
trace_probe_log_err(0, NO_TRACEPOINT);
- goto parse_error;
+ return -EINVAL;
}
} else
- ctx.funcname = symbol;
+ ctx->funcname = symbol;
argc -= 2; argv += 2;
new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc,
- abuf, MAX_BTF_ARGS_LEN, &ctx);
- if (IS_ERR(new_argv)) {
- ret = PTR_ERR(new_argv);
- new_argv = NULL;
- goto out;
- }
+ abuf, MAX_BTF_ARGS_LEN, ctx);
+ if (IS_ERR(new_argv))
+ return PTR_ERR(new_argv);
if (new_argv) {
argc = new_argc;
argv = new_argv;
}
- if (argc > MAX_TRACE_ARGS) {
- ret = -E2BIG;
- goto out;
- }
+ if (argc > MAX_TRACE_ARGS)
+ return -E2BIG;
ret = traceprobe_expand_dentry_args(argc, argv, &dbuf);
if (ret)
- goto out;
+ return ret;
/* setup a probe */
tf = alloc_trace_fprobe(group, event, symbol, tpoint, tp_mod,
- maxactive, argc, is_return);
+ argc, is_return);
if (IS_ERR(tf)) {
ret = PTR_ERR(tf);
/* This must return -ENOMEM, else there is a bug */
WARN_ON_ONCE(ret != -ENOMEM);
- goto out; /* We know tf is not allocated */
+ return ret;
}
/* parse arguments */
for (i = 0; i < argc; i++) {
trace_probe_log_set_index(i + 2);
- ctx.offset = 0;
- ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], &ctx);
+ ctx->offset = 0;
+ ret = traceprobe_parse_probe_arg(&tf->tp, i, argv[i], ctx);
if (ret)
- goto error; /* This can be -ENOMEM */
+ return ret; /* This can be -ENOMEM */
}
if (is_return && tf->tp.entry_arg) {
@@ -1223,7 +1220,7 @@ static int __trace_fprobe_create(int argc, const char *argv[])
ret = traceprobe_set_print_fmt(&tf->tp,
is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL);
if (ret < 0)
- goto error;
+ return ret;
ret = register_trace_fprobe(tf);
if (ret) {
@@ -1234,29 +1231,32 @@ static int __trace_fprobe_create(int argc, const char *argv[])
trace_probe_log_err(0, BAD_PROBE_ADDR);
else if (ret != -ENOMEM && ret != -EEXIST)
trace_probe_log_err(0, FAIL_REG_PROBE);
- goto error;
+ return -EINVAL;
}
-out:
- if (tp_mod)
- module_put(tp_mod);
+ /* 'tf' is successfully registered. To avoid freeing, assign NULL. */
+ tf = NULL;
+
+ return 0;
+}
+
+static int trace_fprobe_create_cb(int argc, const char *argv[])
+{
+ struct traceprobe_parse_context ctx = {
+ .flags = TPARG_FL_KERNEL | TPARG_FL_FPROBE,
+ };
+ int ret;
+
+ trace_probe_log_init("trace_fprobe", argc, argv);
+ ret = trace_fprobe_create_internal(argc, argv, &ctx);
traceprobe_finish_parse(&ctx);
trace_probe_log_clear();
- kfree(new_argv);
- kfree(symbol);
- kfree(dbuf);
return ret;
-
-parse_error:
- ret = -EINVAL;
-error:
- free_trace_fprobe(tf);
- goto out;
}
static int trace_fprobe_create(const char *raw_command)
{
- return trace_probe_create(raw_command, __trace_fprobe_create);
+ return trace_probe_create(raw_command, trace_fprobe_create_cb);
}
static int trace_fprobe_release(struct dyn_event *ev)
@@ -1278,8 +1278,6 @@ static int trace_fprobe_show(struct seq_file *m, struct dyn_event *ev)
seq_putc(m, 't');
else
seq_putc(m, 'f');
- if (trace_fprobe_is_return(tf) && tf->fp.nr_maxactive)
- seq_printf(m, "%d", tf->fp.nr_maxactive);
seq_printf(m, ":%s/%s", trace_probe_group_name(&tf->tp),
trace_probe_name(&tf->tp));
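
alloc_trace_fprobe() above (and alloc_trace_kprobe() later in this series) now relies on the DEFINE_FREE()/__free() machinery from <linux/cleanup.h> to release the half-built object on every error return, with return_ptr() disarming the cleanup once construction succeeds. A minimal sketch under hypothetical names (struct thing, thing_free, MAX_LEN):

struct thing {
	size_t len;
};

DEFINE_FREE(thing_free, struct thing *, if (!IS_ERR_OR_NULL(_T)) kfree(_T))

static struct thing *make_thing(size_t len)
{
	struct thing *t __free(thing_free) = kzalloc(sizeof(*t), GFP_KERNEL);

	if (!t)
		return ERR_PTR(-ENOMEM);

	if (len > MAX_LEN)
		return ERR_PTR(-EINVAL);	/* 't' is freed automatically here */

	t->len = len;
	return_ptr(t);		/* disarm the cleanup; the caller now owns 't' */
}
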
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 74c353164ca12..df56f9b760109 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -176,7 +176,8 @@ static void function_trace_start(struct trace_array *tr)
tracing_reset_online_cpus(&tr->array_buffer);
}
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+/* fregs are guaranteed not to be NULL if HAVE_DYNAMIC_FTRACE_WITH_ARGS is set */
+#if defined(CONFIG_FUNCTION_GRAPH_TRACER) && defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS)
static __always_inline unsigned long
function_get_true_parent_ip(unsigned long parent_ip, struct ftrace_regs *fregs)
{
@@ -215,7 +216,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
parent_ip = function_get_true_parent_ip(parent_ip, fregs);
- trace_ctx = tracing_gen_ctx();
+ trace_ctx = tracing_gen_ctx_dec();
data = this_cpu_ptr(tr->array_buffer.data);
if (!atomic_read(&data->disabled))
@@ -320,7 +321,6 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
struct trace_array *tr = op->private;
struct trace_array_cpu *data;
unsigned int trace_ctx;
- unsigned long flags;
int bit;
if (unlikely(!tr->function_enabled))
@@ -346,8 +346,7 @@ function_no_repeats_trace_call(unsigned long ip, unsigned long parent_ip,
if (is_repeat_check(tr, last_info, ip, parent_ip))
goto out;
- local_save_flags(flags);
- trace_ctx = tracing_gen_ctx_flags(flags);
+ trace_ctx = tracing_gen_ctx_dec();
process_repeats(tr, ip, parent_ip, last_info, trace_ctx);
trace_function(tr, ip, parent_ip, trace_ctx);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 5504b5e4e7b41..136c750b0b4da 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -175,16 +175,16 @@ struct fgraph_times {
};
int trace_graph_entry(struct ftrace_graph_ent *trace,
- struct fgraph_ops *gops)
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
unsigned long *task_var = fgraph_get_task_var(gops);
struct trace_array *tr = gops->private;
struct trace_array_cpu *data;
struct fgraph_times *ftimes;
- unsigned long flags;
unsigned int trace_ctx;
long disabled;
- int ret;
+ int ret = 0;
int cpu;
if (*task_var & TRACE_GRAPH_NOTRACE)
@@ -198,7 +198,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace,
* returning from the function.
*/
if (ftrace_graph_notrace_addr(trace->func)) {
- *task_var |= TRACE_GRAPH_NOTRACE_BIT;
+ *task_var |= TRACE_GRAPH_NOTRACE;
/*
* Need to return 1 to have the return called
* that will clear the NOTRACE bit.
@@ -235,25 +235,21 @@ int trace_graph_entry(struct ftrace_graph_ent *trace,
if (tracing_thresh)
return 1;
- local_irq_save(flags);
+ preempt_disable_notrace();
cpu = raw_smp_processor_id();
data = per_cpu_ptr(tr->array_buffer.data, cpu);
- disabled = atomic_inc_return(&data->disabled);
- if (likely(disabled == 1)) {
- trace_ctx = tracing_gen_ctx_flags(flags);
- if (unlikely(IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) &&
- tracer_flags_is_set(TRACE_GRAPH_PRINT_RETADDR))) {
+ disabled = atomic_read(&data->disabled);
+ if (likely(!disabled)) {
+ trace_ctx = tracing_gen_ctx();
+ if (IS_ENABLED(CONFIG_FUNCTION_GRAPH_RETADDR) &&
+ tracer_flags_is_set(TRACE_GRAPH_PRINT_RETADDR)) {
unsigned long retaddr = ftrace_graph_top_ret_addr(current);
-
ret = __trace_graph_retaddr_entry(tr, trace, trace_ctx, retaddr);
- } else
+ } else {
ret = __trace_graph_entry(tr, trace, trace_ctx);
- } else {
- ret = 0;
+ }
}
-
- atomic_dec(&data->disabled);
- local_irq_restore(flags);
+ preempt_enable_notrace();
return ret;
}
@@ -270,12 +266,10 @@ __trace_graph_function(struct trace_array *tr,
struct ftrace_graph_ret ret = {
.func = ip,
.depth = 0,
- .calltime = time,
- .rettime = time,
};
__trace_graph_entry(tr, &ent, trace_ctx);
- __trace_graph_return(tr, &ret, trace_ctx);
+ __trace_graph_return(tr, &ret, trace_ctx, time, time);
}
void
@@ -287,8 +281,9 @@ trace_graph_function(struct trace_array *tr,
}
void __trace_graph_return(struct trace_array *tr,
- struct ftrace_graph_ret *trace,
- unsigned int trace_ctx)
+ struct ftrace_graph_ret *trace,
+ unsigned int trace_ctx,
+ u64 calltime, u64 rettime)
{
struct ring_buffer_event *event;
struct trace_buffer *buffer = tr->array_buffer.buffer;
@@ -300,6 +295,8 @@ void __trace_graph_return(struct trace_array *tr,
return;
entry = ring_buffer_event_data(event);
entry->ret = *trace;
+ entry->calltime = calltime;
+ entry->rettime = rettime;
trace_buffer_unlock_commit_nostack(buffer, event);
}
@@ -314,18 +311,20 @@ static void handle_nosleeptime(struct ftrace_graph_ret *trace,
}
void trace_graph_return(struct ftrace_graph_ret *trace,
- struct fgraph_ops *gops)
+ struct fgraph_ops *gops, struct ftrace_regs *fregs)
{
unsigned long *task_var = fgraph_get_task_var(gops);
struct trace_array *tr = gops->private;
struct trace_array_cpu *data;
struct fgraph_times *ftimes;
- unsigned long flags;
unsigned int trace_ctx;
+ u64 calltime, rettime;
long disabled;
int size;
int cpu;
+ rettime = trace_clock_local();
+
ftrace_graph_addr_finish(gops, trace);
if (*task_var & TRACE_GRAPH_NOTRACE) {
@@ -339,22 +338,22 @@ void trace_graph_return(struct ftrace_graph_ret *trace,
handle_nosleeptime(trace, ftimes, size);
- trace->calltime = ftimes->calltime;
+ calltime = ftimes->calltime;
- local_irq_save(flags);
+ preempt_disable_notrace();
cpu = raw_smp_processor_id();
data = per_cpu_ptr(tr->array_buffer.data, cpu);
- disabled = atomic_inc_return(&data->disabled);
- if (likely(disabled == 1)) {
- trace_ctx = tracing_gen_ctx_flags(flags);
- __trace_graph_return(tr, trace, trace_ctx);
+ disabled = atomic_read(&data->disabled);
+ if (likely(!disabled)) {
+ trace_ctx = tracing_gen_ctx();
+ __trace_graph_return(tr, trace, trace_ctx, calltime, rettime);
}
- atomic_dec(&data->disabled);
- local_irq_restore(flags);
+ preempt_enable_notrace();
}
static void trace_graph_thresh_return(struct ftrace_graph_ret *trace,
- struct fgraph_ops *gops)
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
struct fgraph_times *ftimes;
int size;
@@ -372,13 +371,11 @@ static void trace_graph_thresh_return(struct ftrace_graph_ret *trace,
handle_nosleeptime(trace, ftimes, size);
- trace->calltime = ftimes->calltime;
-
if (tracing_thresh &&
- (trace->rettime - ftimes->calltime < tracing_thresh))
+ (trace_clock_local() - ftimes->calltime < tracing_thresh))
return;
else
- trace_graph_return(trace, gops);
+ trace_graph_return(trace, gops, fregs);
}
static struct fgraph_ops funcgraph_ops = {
@@ -861,7 +858,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
graph_ret = &ret_entry->ret;
call = &entry->graph_ent;
- duration = graph_ret->rettime - graph_ret->calltime;
+ duration = ret_entry->rettime - ret_entry->calltime;
func = call->func + iter->tr->text_delta;
@@ -1142,11 +1139,14 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
}
static enum print_line_t
-print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
+print_graph_return(struct ftrace_graph_ret_entry *retentry, struct trace_seq *s,
struct trace_entry *ent, struct trace_iterator *iter,
u32 flags)
{
- unsigned long long duration = trace->rettime - trace->calltime;
+ struct ftrace_graph_ret *trace = &retentry->ret;
+ u64 calltime = retentry->calltime;
+ u64 rettime = retentry->rettime;
+ unsigned long long duration = rettime - calltime;
struct fgraph_data *data = iter->private;
struct trace_array *tr = iter->tr;
unsigned long func;
@@ -1347,7 +1347,7 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
case TRACE_GRAPH_RET: {
struct ftrace_graph_ret_entry *field;
trace_assign_type(field, entry);
- return print_graph_return(&field->ret, s, entry, iter, flags);
+ return print_graph_return(field, s, entry, iter, flags);
}
case TRACE_STACK:
case TRACE_FN:
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index fce064e205706..7294ad676379a 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -176,12 +176,14 @@ static int irqsoff_display_graph(struct trace_array *tr, int set)
}
static int irqsoff_graph_entry(struct ftrace_graph_ent *trace,
- struct fgraph_ops *gops)
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
unsigned long flags;
unsigned int trace_ctx;
+ u64 *calltime;
int ret;
if (ftrace_graph_ignore_func(gops, trace))
@@ -199,6 +201,12 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace,
if (!func_prolog_dec(tr, &data, &flags))
return 0;
+ calltime = fgraph_reserve_data(gops->idx, sizeof(*calltime));
+ if (!calltime)
+ return 0;
+
+ *calltime = trace_clock_local();
+
trace_ctx = tracing_gen_ctx_flags(flags);
ret = __trace_graph_entry(tr, trace, trace_ctx);
atomic_dec(&data->disabled);
@@ -207,20 +215,29 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace,
}
static void irqsoff_graph_return(struct ftrace_graph_ret *trace,
- struct fgraph_ops *gops)
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
struct trace_array *tr = irqsoff_trace;
struct trace_array_cpu *data;
unsigned long flags;
unsigned int trace_ctx;
+ u64 *calltime;
+ u64 rettime;
+ int size;
ftrace_graph_addr_finish(gops, trace);
if (!func_prolog_dec(tr, &data, &flags))
return;
+ rettime = trace_clock_local();
+ calltime = fgraph_retrieve_data(gops->idx, &size);
+ if (!calltime)
+ return;
+
trace_ctx = tracing_gen_ctx_flags(flags);
- __trace_graph_return(tr, trace, trace_ctx);
+ __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime);
atomic_dec(&data->disabled);
}
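
The irqsoff tracer (and the wakeup tracer further down) no longer reads calltime from struct ftrace_graph_ret; instead the entry callback reserves per-call storage on the fgraph shadow stack with fgraph_reserve_data() and the return callback fetches it with fgraph_retrieve_data(). A hedged sketch of that pattern with hypothetical callbacks my_entry()/my_return():

static int my_entry(struct ftrace_graph_ent *trace,
		    struct fgraph_ops *gops, struct ftrace_regs *fregs)
{
	u64 *calltime;

	calltime = fgraph_reserve_data(gops->idx, sizeof(*calltime));
	if (!calltime)
		return 0;		/* no shadow-stack space: skip this call */

	*calltime = trace_clock_local();
	return 1;			/* request the return callback */
}

static void my_return(struct ftrace_graph_ret *trace,
		      struct fgraph_ops *gops, struct ftrace_regs *fregs)
{
	u64 *calltime, duration;
	int size;

	calltime = fgraph_retrieve_data(gops->idx, &size);
	if (!calltime)
		return;

	duration = trace_clock_local() - *calltime;
	trace_printk("%pS took %llu ns\n", (void *)trace->func, duration);
}
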
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 263fac44d3ca3..d8d5f18a141ad 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -8,6 +8,7 @@
#define pr_fmt(fmt) "trace_kprobe: " fmt
#include <linux/bpf-cgroup.h>
+#include <linux/cleanup.h>
#include <linux/security.h>
#include <linux/module.h>
#include <linux/uaccess.h>
@@ -257,6 +258,9 @@ static void free_trace_kprobe(struct trace_kprobe *tk)
}
}
+DEFINE_FREE(free_trace_kprobe, struct trace_kprobe *,
+ if (!IS_ERR_OR_NULL(_T)) free_trace_kprobe(_T))
+
/*
* Allocate new trace_probe and initialize it (including kprobes).
*/
@@ -268,7 +272,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
int maxactive,
int nargs, bool is_return)
{
- struct trace_kprobe *tk;
+ struct trace_kprobe *tk __free(free_trace_kprobe) = NULL;
int ret = -ENOMEM;
tk = kzalloc(struct_size(tk, tp.args, nargs), GFP_KERNEL);
@@ -277,12 +281,12 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
tk->nhit = alloc_percpu(unsigned long);
if (!tk->nhit)
- goto error;
+ return ERR_PTR(ret);
if (symbol) {
tk->symbol = kstrdup(symbol, GFP_KERNEL);
if (!tk->symbol)
- goto error;
+ return ERR_PTR(ret);
tk->rp.kp.symbol_name = tk->symbol;
tk->rp.kp.offset = offs;
} else
@@ -299,13 +303,10 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
ret = trace_probe_init(&tk->tp, event, group, false, nargs);
if (ret < 0)
- goto error;
+ return ERR_PTR(ret);
dyn_event_init(&tk->devent, &trace_kprobe_ops);
- return tk;
-error:
- free_trace_kprobe(tk);
- return ERR_PTR(ret);
+ return_ptr(tk);
}
static struct trace_kprobe *find_trace_kprobe(const char *event,
@@ -634,7 +635,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
struct trace_kprobe *old_tk;
int ret;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
old_tk = find_trace_kprobe(trace_probe_name(&tk->tp),
trace_probe_group_name(&tk->tp));
@@ -642,11 +643,9 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
if (trace_kprobe_is_return(tk) != trace_kprobe_is_return(old_tk)) {
trace_probe_log_set_index(0);
trace_probe_log_err(0, DIFF_PROBE_TYPE);
- ret = -EEXIST;
- } else {
- ret = append_trace_kprobe(tk, old_tk);
+ return -EEXIST;
}
- goto end;
+ return append_trace_kprobe(tk, old_tk);
}
/* Register new event */
@@ -657,7 +656,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
trace_probe_log_err(0, EVENT_EXIST);
} else
pr_warn("Failed to register probe event(%d)\n", ret);
- goto end;
+ return ret;
}
/* Register k*probe */
@@ -672,8 +671,6 @@ static int register_trace_kprobe(struct trace_kprobe *tk)
else
dyn_event_add(&tk->devent, trace_probe_event_call(&tk->tp));
-end:
- mutex_unlock(&event_mutex);
return ret;
}
@@ -706,7 +703,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb,
return NOTIFY_DONE;
/* Update probes on coming module */
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
for_each_trace_kprobe(tk, pos) {
if (trace_kprobe_within_module(tk, mod)) {
/* Don't need to check busy - this should have gone. */
@@ -718,14 +715,13 @@ static int trace_kprobe_module_callback(struct notifier_block *nb,
module_name(mod), ret);
}
}
- mutex_unlock(&event_mutex);
return NOTIFY_DONE;
}
static struct notifier_block trace_kprobe_module_nb = {
.notifier_call = trace_kprobe_module_callback,
- .priority = 1 /* Invoked after kprobe module callback */
+ .priority = 2 /* Invoked after kprobe and jump_label module callback */
};
static int trace_kprobe_register_module_notifier(void)
{
@@ -840,7 +836,8 @@ out:
static int trace_kprobe_entry_handler(struct kretprobe_instance *ri,
struct pt_regs *regs);
-static int __trace_kprobe_create(int argc, const char *argv[])
+static int trace_kprobe_create_internal(int argc, const char *argv[],
+ struct traceprobe_parse_context *ctx)
{
/*
* Argument syntax:
@@ -866,11 +863,12 @@ static int __trace_kprobe_create(int argc, const char *argv[])
* Type of args:
* FETCHARG:TYPE : use TYPE instead of unsigned long.
*/
- struct trace_kprobe *tk = NULL;
+ struct trace_kprobe *tk __free(free_trace_kprobe) = NULL;
int i, len, new_argc = 0, ret = 0;
bool is_return = false;
- char *symbol = NULL, *tmp = NULL;
- const char **new_argv = NULL;
+ char *symbol __free(kfree) = NULL;
+ char *tmp = NULL;
+ const char **new_argv __free(kfree) = NULL;
const char *event = NULL, *group = KPROBE_EVENT_SYSTEM;
enum probe_print_type ptype;
int maxactive = 0;
@@ -879,8 +877,7 @@ static int __trace_kprobe_create(int argc, const char *argv[])
char buf[MAX_EVENT_NAME_LEN];
char gbuf[MAX_EVENT_NAME_LEN];
char abuf[MAX_BTF_ARGS_LEN];
- char *dbuf = NULL;
- struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL };
+ char *dbuf __free(kfree) = NULL;
switch (argv[0][0]) {
case 'r':
@@ -894,8 +891,6 @@ static int __trace_kprobe_create(int argc, const char *argv[])
if (argc < 2)
return -ECANCELED;
- trace_probe_log_init("trace_kprobe", argc, argv);
-
event = strchr(&argv[0][1], ':');
if (event)
event++;
@@ -903,7 +898,7 @@ static int __trace_kprobe_create(int argc, const char *argv[])
if (isdigit(argv[0][1])) {
if (!is_return) {
trace_probe_log_err(1, BAD_MAXACT_TYPE);
- goto parse_error;
+ return -EINVAL;
}
if (event)
len = event - &argv[0][1] - 1;
@@ -911,21 +906,21 @@ static int __trace_kprobe_create(int argc, const char *argv[])
len = strlen(&argv[0][1]);
if (len > MAX_EVENT_NAME_LEN - 1) {
trace_probe_log_err(1, BAD_MAXACT);
- goto parse_error;
+ return -EINVAL;
}
memcpy(buf, &argv[0][1], len);
buf[len] = '\0';
ret = kstrtouint(buf, 0, &maxactive);
if (ret || !maxactive) {
trace_probe_log_err(1, BAD_MAXACT);
- goto parse_error;
+ return -EINVAL;
}
/* kretprobes instances are iterated over via a list. The
* maximum should stay reasonable.
*/
if (maxactive > KRETPROBE_MAXACTIVE_MAX) {
trace_probe_log_err(1, MAXACT_TOO_BIG);
- goto parse_error;
+ return -EINVAL;
}
}
@@ -934,10 +929,9 @@ static int __trace_kprobe_create(int argc, const char *argv[])
if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) {
trace_probe_log_set_index(1);
/* Check whether uprobe event specified */
- if (strchr(argv[1], '/') && strchr(argv[1], ':')) {
- ret = -ECANCELED;
- goto error;
- }
+ if (strchr(argv[1], '/') && strchr(argv[1], ':'))
+ return -ECANCELED;
+
/* a symbol specified */
symbol = kstrdup(argv[1], GFP_KERNEL);
if (!symbol)
@@ -950,7 +944,7 @@ static int __trace_kprobe_create(int argc, const char *argv[])
is_return = true;
} else {
trace_probe_log_err(tmp - symbol, BAD_ADDR_SUFFIX);
- goto parse_error;
+ return -EINVAL;
}
}
@@ -958,7 +952,7 @@ static int __trace_kprobe_create(int argc, const char *argv[])
ret = traceprobe_split_symbol_offset(symbol, &offset);
if (ret || offset < 0 || offset > UINT_MAX) {
trace_probe_log_err(0, BAD_PROBE_ADDR);
- goto parse_error;
+ return -EINVAL;
}
ret = validate_probe_symbol(symbol);
if (ret) {
@@ -966,17 +960,17 @@ static int __trace_kprobe_create(int argc, const char *argv[])
trace_probe_log_err(0, NON_UNIQ_SYMBOL);
else
trace_probe_log_err(0, BAD_PROBE_ADDR);
- goto parse_error;
+ return -EINVAL;
}
if (is_return)
- ctx.flags |= TPARG_FL_RETURN;
+ ctx->flags |= TPARG_FL_RETURN;
ret = kprobe_on_func_entry(NULL, symbol, offset);
if (ret == 0 && !is_return)
- ctx.flags |= TPARG_FL_FENTRY;
+ ctx->flags |= TPARG_FL_FENTRY;
/* Defer the ENOENT case until register kprobe */
if (ret == -EINVAL && is_return) {
trace_probe_log_err(0, BAD_RETPROBE);
- goto parse_error;
+ return -EINVAL;
}
}
@@ -985,7 +979,7 @@ static int __trace_kprobe_create(int argc, const char *argv[])
ret = traceprobe_parse_event_name(&event, &group, gbuf,
event - argv[0]);
if (ret)
- goto parse_error;
+ return ret;
}
if (!event) {
@@ -1001,26 +995,24 @@ static int __trace_kprobe_create(int argc, const char *argv[])
}
argc -= 2; argv += 2;
- ctx.funcname = symbol;
+ ctx->funcname = symbol;
new_argv = traceprobe_expand_meta_args(argc, argv, &new_argc,
- abuf, MAX_BTF_ARGS_LEN, &ctx);
+ abuf, MAX_BTF_ARGS_LEN, ctx);
if (IS_ERR(new_argv)) {
ret = PTR_ERR(new_argv);
new_argv = NULL;
- goto out;
+ return ret;
}
if (new_argv) {
argc = new_argc;
argv = new_argv;
}
- if (argc > MAX_TRACE_ARGS) {
- ret = -E2BIG;
- goto out;
- }
+ if (argc > MAX_TRACE_ARGS)
+ return -E2BIG;
ret = traceprobe_expand_dentry_args(argc, argv, &dbuf);
if (ret)
- goto out;
+ return ret;
/* setup a probe */
tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive,
@@ -1029,16 +1021,16 @@ static int __trace_kprobe_create(int argc, const char *argv[])
ret = PTR_ERR(tk);
/* This must return -ENOMEM, else there is a bug */
WARN_ON_ONCE(ret != -ENOMEM);
- goto out; /* We know tk is not allocated */
+ return ret; /* We know tk is not allocated */
}
/* parse arguments */
for (i = 0; i < argc; i++) {
trace_probe_log_set_index(i + 2);
- ctx.offset = 0;
- ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], &ctx);
+ ctx->offset = 0;
+ ret = traceprobe_parse_probe_arg(&tk->tp, i, argv[i], ctx);
if (ret)
- goto error; /* This can be -ENOMEM */
+ return ret; /* This can be -ENOMEM */
}
/* entry handler for kretprobe */
if (is_return && tk->tp.entry_arg) {
@@ -1049,7 +1041,7 @@ static int __trace_kprobe_create(int argc, const char *argv[])
ptype = is_return ? PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL;
ret = traceprobe_set_print_fmt(&tk->tp, ptype);
if (ret < 0)
- goto error;
+ return ret;
ret = register_trace_kprobe(tk);
if (ret) {
@@ -1060,27 +1052,34 @@ static int __trace_kprobe_create(int argc, const char *argv[])
trace_probe_log_err(0, BAD_PROBE_ADDR);
else if (ret != -ENOMEM && ret != -EEXIST)
trace_probe_log_err(0, FAIL_REG_PROBE);
- goto error;
+ return ret;
}
+ /*
+ * Here, 'tk' has been registered to the list successfully,
+ * so we don't need to free it.
+ */
+ tk = NULL;
+
+ return 0;
+}
+
+static int trace_kprobe_create_cb(int argc, const char *argv[])
+{
+ struct traceprobe_parse_context ctx = { .flags = TPARG_FL_KERNEL };
+ int ret;
+
+ trace_probe_log_init("trace_kprobe", argc, argv);
+
+ ret = trace_kprobe_create_internal(argc, argv, &ctx);
-out:
traceprobe_finish_parse(&ctx);
trace_probe_log_clear();
- kfree(new_argv);
- kfree(symbol);
- kfree(dbuf);
return ret;
-
-parse_error:
- ret = -EINVAL;
-error:
- free_trace_kprobe(tk);
- goto out;
}
static int trace_kprobe_create(const char *raw_command)
{
- return trace_probe_create(raw_command, __trace_kprobe_create);
+ return trace_probe_create(raw_command, trace_kprobe_create_cb);
}
static int create_or_delete_trace_kprobe(const char *raw_command)
@@ -1896,7 +1895,7 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
bool is_return)
{
enum probe_print_type ptype;
- struct trace_kprobe *tk;
+ struct trace_kprobe *tk __free(free_trace_kprobe) = NULL;
int ret;
char *event;
@@ -1927,19 +1926,14 @@ create_local_trace_kprobe(char *func, void *addr, unsigned long offs,
ptype = trace_kprobe_is_return(tk) ?
PROBE_PRINT_RETURN : PROBE_PRINT_NORMAL;
- if (traceprobe_set_print_fmt(&tk->tp, ptype) < 0) {
- ret = -ENOMEM;
- goto error;
- }
+ if (traceprobe_set_print_fmt(&tk->tp, ptype) < 0)
+ return ERR_PTR(-ENOMEM);
ret = __register_trace_kprobe(tk);
if (ret < 0)
- goto error;
+ return ERR_PTR(ret);
- return trace_probe_event_call(&tk->tp);
-error:
- free_trace_kprobe(tk);
- return ERR_PTR(ret);
+ return trace_probe_event_call(&(no_free_ptr(tk)->tp));
}
void destroy_local_trace_kprobe(struct trace_event_call *event_call)
@@ -1968,13 +1962,12 @@ static __init void enable_boot_kprobe_events(void)
struct trace_kprobe *tk;
struct dyn_event *pos;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
for_each_trace_kprobe(tk, pos) {
list_for_each_entry(file, &tr->events, list)
if (file->event_call == trace_probe_event_call(&tk->tp))
trace_event_enable_disable(file, 1, 0);
}
- mutex_unlock(&event_mutex);
}
static __init void setup_boot_kprobe_events(void)
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index b9f96c77527db..f3a2722ee4c07 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -1229,6 +1229,8 @@ static void trace_sched_migrate_callback(void *data, struct task_struct *p, int
}
}
+static bool monitor_enabled;
+
static int register_migration_monitor(void)
{
int ret = 0;
@@ -1237,16 +1239,25 @@ static int register_migration_monitor(void)
* Timerlat thread migration check is only required when running timerlat in user-space.
* Thus, enable callback only if timerlat is set with no workload.
*/
- if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options))
+ if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options)) {
+ if (WARN_ON_ONCE(monitor_enabled))
+ return 0;
+
ret = register_trace_sched_migrate_task(trace_sched_migrate_callback, NULL);
+ if (!ret)
+ monitor_enabled = true;
+ }
return ret;
}
static void unregister_migration_monitor(void)
{
- if (timerlat_enabled() && !test_bit(OSN_WORKLOAD, &osnoise_options))
- unregister_trace_sched_migrate_task(trace_sched_migrate_callback, NULL);
+ if (!monitor_enabled)
+ return;
+
+ unregister_trace_sched_migrate_task(trace_sched_migrate_callback, NULL);
+ monitor_enabled = false;
}
#else
static int register_migration_monitor(void)
@@ -2083,26 +2094,21 @@ static void osnoise_hotplug_workfn(struct work_struct *dummy)
{
unsigned int cpu = smp_processor_id();
- mutex_lock(&trace_types_lock);
+ guard(mutex)(&trace_types_lock);
if (!osnoise_has_registered_instances())
- goto out_unlock_trace;
+ return;
- mutex_lock(&interface_lock);
- cpus_read_lock();
+ guard(mutex)(&interface_lock);
+ guard(cpus_read_lock)();
if (!cpu_online(cpu))
- goto out_unlock;
+ return;
+
if (!cpumask_test_cpu(cpu, &osnoise_cpumask))
- goto out_unlock;
+ return;
start_kthread(cpu);
-
-out_unlock:
- cpus_read_unlock();
- mutex_unlock(&interface_lock);
-out_unlock_trace:
- mutex_unlock(&trace_types_lock);
}
static DECLARE_WORK(osnoise_hotplug_work, osnoise_hotplug_workfn);
@@ -2300,31 +2306,22 @@ static ssize_t
osnoise_cpus_read(struct file *filp, char __user *ubuf, size_t count,
loff_t *ppos)
{
- char *mask_str;
+ char *mask_str __free(kfree) = NULL;
int len;
- mutex_lock(&interface_lock);
+ guard(mutex)(&interface_lock);
len = snprintf(NULL, 0, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask)) + 1;
mask_str = kmalloc(len, GFP_KERNEL);
- if (!mask_str) {
- count = -ENOMEM;
- goto out_unlock;
- }
+ if (!mask_str)
+ return -ENOMEM;
len = snprintf(mask_str, len, "%*pbl\n", cpumask_pr_args(&osnoise_cpumask));
- if (len >= count) {
- count = -EINVAL;
- goto out_free;
- }
+ if (len >= count)
+ return -EINVAL;
count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len);
-out_free:
- kfree(mask_str);
-out_unlock:
- mutex_unlock(&interface_lock);
-
return count;
}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index da748b7cbc4d5..03d56f711ad14 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -317,10 +317,14 @@ EXPORT_SYMBOL(trace_raw_output_prep);
void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...)
{
+ struct trace_seq *s = &iter->seq;
va_list ap;
+ if (ignore_event(iter))
+ return;
+
va_start(ap, fmt);
- trace_check_vprintf(iter, trace_event_format(iter, fmt), ap);
+ trace_seq_vprintf(s, trace_event_format(iter, fmt), ap);
va_end(ap);
}
EXPORT_SYMBOL(trace_event_printf);
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 16a5e368e7b77..8f58ee1e8858a 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -1409,7 +1409,7 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
struct traceprobe_parse_context *ctx)
{
struct fetch_insn *code, *tmp = NULL;
- char *type, *arg;
+ char *type, *arg __free(kfree) = NULL;
int ret, len;
len = strlen(argv);
@@ -1426,22 +1426,16 @@ static int traceprobe_parse_probe_arg_body(const char *argv, ssize_t *size,
return -ENOMEM;
parg->comm = kstrdup(arg, GFP_KERNEL);
- if (!parg->comm) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!parg->comm)
+ return -ENOMEM;
type = parse_probe_arg_type(arg, parg, ctx);
- if (IS_ERR(type)) {
- ret = PTR_ERR(type);
- goto out;
- }
+ if (IS_ERR(type))
+ return PTR_ERR(type);
code = tmp = kcalloc(FETCH_INSN_MAX, sizeof(*code), GFP_KERNEL);
- if (!code) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!code)
+ return -ENOMEM;
code[FETCH_INSN_MAX - 1].op = FETCH_OP_END;
ctx->last_type = NULL;
@@ -1497,8 +1491,6 @@ fail:
kfree(code->data);
}
kfree(tmp);
-out:
- kfree(arg);
return ret;
}
@@ -1668,7 +1660,7 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[],
{
const struct btf_param *params = NULL;
int i, j, n, used, ret, args_idx = -1;
- const char **new_argv = NULL;
+ const char **new_argv __free(kfree) = NULL;
ret = argv_has_var_arg(argc, argv, &args_idx, ctx);
if (ret < 0)
@@ -1707,7 +1699,7 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[],
ret = sprint_nth_btf_arg(n, "", buf + used,
bufsize - used, ctx);
if (ret < 0)
- goto error;
+ return ERR_PTR(ret);
new_argv[j++] = buf + used;
used += ret + 1;
@@ -1721,25 +1713,20 @@ const char **traceprobe_expand_meta_args(int argc, const char *argv[],
n = simple_strtoul(argv[i] + 4, &type, 10);
if (type && !(*type == ':' || *type == '\0')) {
trace_probe_log_err(0, BAD_VAR);
- ret = -ENOENT;
- goto error;
+ return ERR_PTR(-ENOENT);
}
/* Note: $argN starts from $arg1 */
ret = sprint_nth_btf_arg(n - 1, type, buf + used,
bufsize - used, ctx);
if (ret < 0)
- goto error;
+ return ERR_PTR(ret);
new_argv[j++] = buf + used;
used += ret + 1;
} else
new_argv[j++] = argv[i];
}
- return new_argv;
-
-error:
- kfree(new_argv);
- return ERR_PTR(ret);
+ return_ptr(new_argv);
}
/* @buf: *buf must be equal to NULL. Caller must to free *buf */
@@ -1747,14 +1734,14 @@ int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf)
{
int i, used, ret;
const int bufsize = MAX_DENTRY_ARGS_LEN;
- char *tmpbuf = NULL;
+ char *tmpbuf __free(kfree) = NULL;
if (*buf)
return -EINVAL;
used = 0;
for (i = 0; i < argc; i++) {
- char *tmp;
+ char *tmp __free(kfree) = NULL;
char *equal;
size_t arg_len;
@@ -1769,7 +1756,7 @@ int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf)
tmp = kstrdup(argv[i], GFP_KERNEL);
if (!tmp)
- goto nomem;
+ return -ENOMEM;
equal = strchr(tmp, '=');
if (equal)
@@ -1790,18 +1777,14 @@ int traceprobe_expand_dentry_args(int argc, const char *argv[], char **buf)
offsetof(struct file, f_path.dentry),
equal ? equal + 1 : tmp);
- kfree(tmp);
if (ret >= bufsize - used)
- goto nomem;
+ return -ENOMEM;
argv[i] = tmpbuf + used;
used += ret + 1;
}
- *buf = tmpbuf;
+ *buf = no_free_ptr(tmpbuf);
return 0;
-nomem:
- kfree(tmpbuf);
- return -ENOMEM;
}
void traceprobe_finish_parse(struct traceprobe_parse_context *ctx)
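
traceprobe_expand_dentry_args() above shows the companion idiom to return_ptr(): when the auto-freed buffer must outlive the function through an out-parameter rather than a return value, no_free_ptr() disarms the cleanup at the point of ownership transfer. A short sketch; make_label() is a hypothetical helper:

static int make_label(const char *name, char **out)
{
	char *tmp __free(kfree) = kasprintf(GFP_KERNEL, "label:%s", name);

	if (!tmp)
		return -ENOMEM;

	*out = no_free_ptr(tmp);	/* ownership moves to the caller; no kfree at scope exit */
	return 0;
}
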
diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h
index 2caf0d2afb322..f39b37fcdb3b5 100644
--- a/kernel/trace/trace_probe_tmpl.h
+++ b/kernel/trace/trace_probe_tmpl.h
@@ -232,7 +232,7 @@ array:
/* Sum up total data length for dynamic arrays (strings) */
static nokprobe_inline int
-__get_data_size(struct trace_probe *tp, struct pt_regs *regs, void *edata)
+__get_data_size(struct trace_probe *tp, void *regs, void *edata)
{
struct probe_arg *arg;
int i, len, ret = 0;
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 573b5d8e8a28e..cb49f7279dc80 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -442,7 +442,7 @@ int trace_alloc_tgid_map(void)
if (tgid_map)
return 0;
- tgid_map_max = pid_max;
+ tgid_map_max = init_pid_ns.pid_max;
map = kvcalloc(tgid_map_max + 1, sizeof(*tgid_map),
GFP_KERNEL);
if (!map)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index d6c7f18daa15a..af30586f1aeac 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -113,11 +113,13 @@ static int wakeup_display_graph(struct trace_array *tr, int set)
}
static int wakeup_graph_entry(struct ftrace_graph_ent *trace,
- struct fgraph_ops *gops)
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
struct trace_array *tr = wakeup_trace;
struct trace_array_cpu *data;
unsigned int trace_ctx;
+ u64 *calltime;
int ret = 0;
if (ftrace_graph_ignore_func(gops, trace))
@@ -135,6 +137,12 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace,
if (!func_prolog_preempt_disable(tr, &data, &trace_ctx))
return 0;
+ calltime = fgraph_reserve_data(gops->idx, sizeof(*calltime));
+ if (!calltime)
+ return 0;
+
+ *calltime = trace_clock_local();
+
ret = __trace_graph_entry(tr, trace, trace_ctx);
atomic_dec(&data->disabled);
preempt_enable_notrace();
@@ -143,18 +151,28 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace,
}
static void wakeup_graph_return(struct ftrace_graph_ret *trace,
- struct fgraph_ops *gops)
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
struct trace_array *tr = wakeup_trace;
struct trace_array_cpu *data;
unsigned int trace_ctx;
+ u64 *calltime;
+ u64 rettime;
+ int size;
ftrace_graph_addr_finish(gops, trace);
if (!func_prolog_preempt_disable(tr, &data, &trace_ctx))
return;
- __trace_graph_return(tr, trace, trace_ctx);
+ rettime = trace_clock_local();
+
+ calltime = fgraph_retrieve_data(gops->idx, &size);
+ if (!calltime)
+ return;
+
+ __trace_graph_return(tr, trace, trace_ctx, *calltime, rettime);
atomic_dec(&data->disabled);
preempt_enable_notrace();
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 38b5754790c95..d88c44f1dfa55 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -774,7 +774,8 @@ struct fgraph_fixture {
};
static __init int store_entry(struct ftrace_graph_ent *trace,
- struct fgraph_ops *gops)
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
struct fgraph_fixture *fixture = container_of(gops, struct fgraph_fixture, gops);
const char *type = fixture->store_type_name;
@@ -807,7 +808,8 @@ static __init int store_entry(struct ftrace_graph_ent *trace,
}
static __init void store_return(struct ftrace_graph_ret *trace,
- struct fgraph_ops *gops)
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
struct fgraph_fixture *fixture = container_of(gops, struct fgraph_fixture, gops);
const char *type = fixture->store_type_name;
@@ -1025,7 +1027,8 @@ static unsigned int graph_hang_thresh;
/* Wrap the real function entry probe to avoid possible hanging */
static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace,
- struct fgraph_ops *gops)
+ struct fgraph_ops *gops,
+ struct ftrace_regs *fregs)
{
/* This is harmlessly racy, we want to approximately detect a hang */
if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {
@@ -1039,7 +1042,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace,
return 0;
}
- return trace_graph_entry(trace, gops);
+ return trace_graph_entry(trace, gops, fregs);
}
static struct fgraph_ops fgraph_ops __initdata = {
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 7f9572a373335..14c6f272c4d8a 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -520,20 +520,18 @@ stack_trace_sysctl(const struct ctl_table *table, int write, void *buffer,
int was_enabled;
int ret;
- mutex_lock(&stack_sysctl_mutex);
+ guard(mutex)(&stack_sysctl_mutex);
was_enabled = !!stack_tracer_enabled;
ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret || !write || (was_enabled == !!stack_tracer_enabled))
- goto out;
+ return ret;
if (stack_tracer_enabled)
register_ftrace_function(&trace_ops);
else
unregister_ftrace_function(&trace_ops);
- out:
- mutex_unlock(&stack_sysctl_mutex);
return ret;
}
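This is the same guard(mutex)() conversion applied throughout the tracing code in this series: the guard from the <linux/cleanup.h> infrastructure releases the mutex automatically when the enclosing scope ends, so the goto-out/unlock tail can be dropped and error paths can simply return. A minimal sketch of the shape, using a hypothetical my_lock and my_update():

#include <linux/cleanup.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(my_lock);
static int my_state;

static int my_update(int new_state)
{
	guard(mutex)(&my_lock);	/* unlocked automatically on any return */

	if (new_state == my_state)
		return 0;	/* early return: no explicit unlock needed */

	my_state = new_state;
	return 1;
}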
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index bb247beec4470..b3b5586f104de 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -128,7 +128,7 @@ static int stat_seq_init(struct stat_session *session)
int ret = 0;
int i;
- mutex_lock(&session->stat_mutex);
+ guard(mutex)(&session->stat_mutex);
__reset_stat_session(session);
if (!ts->stat_cmp)
@@ -136,11 +136,11 @@ static int stat_seq_init(struct stat_session *session)
stat = ts->stat_start(ts);
if (!stat)
- goto exit;
+ return 0;
ret = insert_stat(root, stat, ts->stat_cmp);
if (ret)
- goto exit;
+ return ret;
/*
* Iterate over the tracer stat entries and store them in an rbtree.
@@ -157,13 +157,10 @@ static int stat_seq_init(struct stat_session *session)
goto exit_free_rbtree;
}
-exit:
- mutex_unlock(&session->stat_mutex);
return ret;
exit_free_rbtree:
__reset_stat_session(session);
- mutex_unlock(&session->stat_mutex);
return ret;
}
@@ -308,7 +305,7 @@ static int init_stat_file(struct stat_session *session)
int register_stat_tracer(struct tracer_stat *trace)
{
struct stat_session *session, *node;
- int ret = -EINVAL;
+ int ret;
if (!trace)
return -EINVAL;
@@ -316,18 +313,18 @@ int register_stat_tracer(struct tracer_stat *trace)
if (!trace->stat_start || !trace->stat_next || !trace->stat_show)
return -EINVAL;
+ guard(mutex)(&all_stat_sessions_mutex);
+
/* Already registered? */
- mutex_lock(&all_stat_sessions_mutex);
list_for_each_entry(node, &all_stat_sessions, session_list) {
if (node->ts == trace)
- goto out;
+ return -EINVAL;
}
- ret = -ENOMEM;
/* Init the session */
session = kzalloc(sizeof(*session), GFP_KERNEL);
if (!session)
- goto out;
+ return -ENOMEM;
session->ts = trace;
INIT_LIST_HEAD(&session->session_list);
@@ -336,16 +333,13 @@ int register_stat_tracer(struct tracer_stat *trace)
ret = init_stat_file(session);
if (ret) {
destroy_session(session);
- goto out;
+ return ret;
}
- ret = 0;
/* Register */
list_add_tail(&session->session_list, &all_stat_sessions);
- out:
- mutex_unlock(&all_stat_sessions_mutex);
- return ret;
+ return 0;
}
void unregister_stat_tracer(struct tracer_stat *trace)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 4875e7f5de3db..ccc762fbb69cd 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -498,11 +498,11 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
struct trace_uprobe *old_tu;
int ret;
- mutex_lock(&event_mutex);
+ guard(mutex)(&event_mutex);
ret = validate_ref_ctr_offset(tu);
if (ret)
- goto end;
+ return ret;
/* register as an event */
old_tu = find_probe_event(trace_probe_name(&tu->tp),
@@ -511,11 +511,9 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
if (is_ret_probe(tu) != is_ret_probe(old_tu)) {
trace_probe_log_set_index(0);
trace_probe_log_err(0, DIFF_PROBE_TYPE);
- ret = -EEXIST;
- } else {
- ret = append_trace_uprobe(tu, old_tu);
+ return -EEXIST;
}
- goto end;
+ return append_trace_uprobe(tu, old_tu);
}
ret = register_uprobe_event(tu);
@@ -525,14 +523,11 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
trace_probe_log_err(0, EVENT_EXIST);
} else
pr_warn("Failed to register probe event(%d)\n", ret);
- goto end;
+ return ret;
}
dyn_event_add(&tu->devent, trace_probe_event_call(&tu->tp));
-end:
- mutex_unlock(&event_mutex);
-
return ret;
}
diff --git a/kernel/ucount.c b/kernel/ucount.c
index f950b5e59d638..86c5f1c0bad90 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -164,8 +164,8 @@ struct ucounts *get_ucounts(struct ucounts *ucounts)
struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
{
struct hlist_head *hashent = ucounts_hashentry(ns, uid);
- struct ucounts *ucounts, *new;
bool wrapped;
+ struct ucounts *ucounts, *new = NULL;
spin_lock_irq(&ucounts_lock);
ucounts = find_ucounts(ns, uid, hashent);
@@ -182,17 +182,17 @@ struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
spin_lock_irq(&ucounts_lock);
ucounts = find_ucounts(ns, uid, hashent);
- if (ucounts) {
- kfree(new);
- } else {
+ if (!ucounts) {
hlist_add_head(&new->node, hashent);
get_user_ns(new->ns);
spin_unlock_irq(&ucounts_lock);
return new;
}
}
+
wrapped = !get_ucounts_or_wrap(ucounts);
spin_unlock_irq(&ucounts_lock);
+ kfree(new);
if (wrapped) {
put_ucounts(ucounts);
return NULL;
diff --git a/kernel/umh.c b/kernel/umh.c
index be92342707773..b4da45a3a7cfc 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -544,7 +544,7 @@ static int proc_cap_handler(const struct ctl_table *table, int write,
return 0;
}
-static struct ctl_table usermodehelper_table[] = {
+static const struct ctl_table usermodehelper_table[] = {
{
.procname = "bset",
.data = &usermodehelper_bset,
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 7282f61a8650f..bfbaaecb1dd43 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -75,7 +75,7 @@ static DEFINE_CTL_TABLE_POLL(hostname_poll);
static DEFINE_CTL_TABLE_POLL(domainname_poll);
// Note: update 'enum uts_proc' to match any changes to this table
-static struct ctl_table uts_kern_table[] = {
+static const struct ctl_table uts_kern_table[] = {
{
.procname = "arch",
.data = init_uts_ns.name.machine,
@@ -129,7 +129,7 @@ static struct ctl_table uts_kern_table[] = {
*/
void uts_proc_notify(enum uts_proc proc)
{
- struct ctl_table *table = &uts_kern_table[proc];
+ const struct ctl_table *table = &uts_kern_table[proc];
proc_sys_poll_notify(table->poll);
}
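The const struct ctl_table conversions in umh.c, utsname_sysctl.c and watchdog.c rely on the sysctl registration interfaces now accepting const tables, so the arrays can be placed in read-only data. A minimal sketch of registering such a table (my_sysctl_value and the "kernel/my_feature" knob are illustrative, not from this diff):

#include <linux/sysctl.h>
#include <linux/init.h>

static int my_sysctl_value;

static const struct ctl_table my_table[] = {
	{
		.procname	= "my_feature",
		.data		= &my_sysctl_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
};

static int __init my_sysctl_init(void)
{
	/* Registers /proc/sys/kernel/my_feature; the table stays const. */
	register_sysctl_init("kernel", my_table);
	return 0;
}
late_initcall(my_sysctl_init);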
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index 1895fbc32bcb9..5267adeaa4034 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -71,7 +71,7 @@ static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe,
bit /= WATCH_QUEUE_NOTE_SIZE;
page = buf->page;
- bit += page->index;
+ bit += page->private;
set_bit(bit, wqueue->notes_bitmap);
generic_pipe_buf_release(pipe, buf);
@@ -278,7 +278,7 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
pages[i] = alloc_page(GFP_KERNEL);
if (!pages[i])
goto error_p;
- pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE;
+ pages[i]->private = i * WATCH_QUEUE_NOTES_PER_PAGE;
}
bitmap = bitmap_alloc(nr_notes, GFP_KERNEL);
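The watch_queue change stops borrowing page->index, which is reserved for pagecache use, and keeps the per-page note offset in page->private, the field meant for the page owner's private data; the set_page_private()/page_private() accessors express the same thing. A small hedged sketch of the idea (alloc_tagged_page() is illustrative only):

#include <linux/mm.h>
#include <linux/gfp.h>

/* Illustrative only: stash an owner-defined offset in a freshly
 * allocated page, as watch_queue now does with its note offsets. */
static struct page *alloc_tagged_page(unsigned long offset)
{
	struct page *page = alloc_page(GFP_KERNEL);

	if (!page)
		return NULL;

	set_page_private(page, offset);	/* equivalent to page->private = offset */
	return page;
}

static unsigned long tagged_page_offset(struct page *page)
{
	return page_private(page);
}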
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 41e0f7e9fa353..b2da7de39d06d 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -190,7 +190,7 @@ void watchdog_hardlockup_check(unsigned int cpu, struct pt_regs *regs)
* with printk_cpu_sync_get_irqsave() that we can still at least
* get the message about the lockup out.
*/
- pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", cpu);
+ pr_emerg("CPU%u: Watchdog detected hard LOCKUP on cpu %u\n", this_cpu, cpu);
printk_cpu_sync_get_irqsave(flags);
print_modules();
@@ -1094,7 +1094,7 @@ static int proc_watchdog_cpumask(const struct ctl_table *table, int write,
static const int sixty = 60;
-static struct ctl_table watchdog_sysctls[] = {
+static const struct ctl_table watchdog_sysctls[] = {
{
.procname = "watchdog",
.data = &watchdog_user_enabled,
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 8b07576814a58..97152f2250fe7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2180,7 +2180,7 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
debug_work_activate(work);
/* record the work call stack in order to print it in KASAN reports */
- kasan_record_aux_stack_noalloc(work);
+ kasan_record_aux_stack(work);
/* we own @work, set data and link */
set_work_pwq(work, pwq, extra_flags);
@@ -2508,6 +2508,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
return;
}
+ WARN_ON_ONCE(cpu != WORK_CPU_UNBOUND && !cpu_online(cpu));
dwork->wq = wq;
dwork->cpu = cpu;
timer->expires = jiffies + delay;
@@ -2533,6 +2534,12 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
* @dwork: work to queue
* @delay: number of jiffies to wait before queueing
*
+ * We queue the delayed_work to a specific CPU; for non-zero delays the
+ * caller must ensure that CPU is online and can't go away. Callers that
+ * fail to ensure this may get @dwork->timer queued to an offlined CPU,
+ * which will prevent queueing of @dwork->work unless the offlined CPU
+ * becomes online again.
+ *
* Return: %false if @work was already on a queue, %true otherwise. If
* @delay is zero and @dwork is idle, it will be scheduled for immediate
* execution.
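Together with the WARN_ON_ONCE() added in __queue_delayed_work(), the new comment makes the contract explicit: with a non-zero delay the target CPU must be kept online by the caller, otherwise the timer may end up on an offlined CPU and the work never runs. A hedged usage sketch of one way a caller might check before queueing (my_wq/my_dwork are illustrative, and holding cpus_read_lock() only covers the check-and-queue window, not the whole delay):

#include <linux/workqueue.h>
#include <linux/cpu.h>

static struct workqueue_struct *my_wq;
static struct delayed_work my_dwork;

static void queue_on_cpu_if_online(int cpu, unsigned long delay)
{
	/* Block CPU hotplug while we check and queue. This alone does not
	 * keep the CPU online until the timer fires; real users need a
	 * hotplug callback or should fall back to an unbound queueing. */
	cpus_read_lock();
	if (cpu_online(cpu))
		queue_delayed_work_on(cpu, my_wq, &my_dwork, delay);
	else
		queue_delayed_work(my_wq, &my_dwork, delay);	/* let the wq pick a CPU */
	cpus_read_unlock();
}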
@@ -3510,12 +3517,6 @@ repeat:
}
/*
- * Put the reference grabbed by send_mayday(). @pool won't
- * go away while we're still attached to it.
- */
- put_pwq(pwq);
-
- /*
* Leave this pool. Notify regular workers; otherwise, we end up
* with 0 concurrency and stalling the execution.
*/
@@ -3525,6 +3526,12 @@ repeat:
worker_detach_from_pool(rescuer);
+ /*
+ * Put the reference grabbed by send_mayday(). @pool might
+ * go away any time after it.
+ */
+ put_pwq_unlocked(pwq);
+
raw_spin_lock_irq(&wq_mayday_lock);
}
@@ -3680,23 +3687,27 @@ void workqueue_softirq_dead(unsigned int cpu)
* check_flush_dependency - check for flush dependency sanity
* @target_wq: workqueue being flushed
* @target_work: work item being flushed (NULL for workqueue flushes)
+ * @from_cancel: are we called from the work cancel path
*
* %current is trying to flush the whole @target_wq or @target_work on it.
- * If @target_wq doesn't have %WQ_MEM_RECLAIM, verify that %current is not
- * reclaiming memory or running on a workqueue which doesn't have
- * %WQ_MEM_RECLAIM as that can break forward-progress guarantee leading to
- * a deadlock.
+ * If this is not the cancel path (which implies the work being flushed is
+ * either already running or will not run at all), and @target_wq doesn't have
+ * %WQ_MEM_RECLAIM, verify that %current is not reclaiming memory or running on
+ * a workqueue which doesn't have %WQ_MEM_RECLAIM, as that can break the
+ * forward-progress guarantee and lead to a deadlock.
*/
static void check_flush_dependency(struct workqueue_struct *target_wq,
- struct work_struct *target_work)
+ struct work_struct *target_work,
+ bool from_cancel)
{
- work_func_t target_func = target_work ? target_work->func : NULL;
+ work_func_t target_func;
struct worker *worker;
- if (target_wq->flags & WQ_MEM_RECLAIM)
+ if (from_cancel || target_wq->flags & WQ_MEM_RECLAIM)
return;
worker = current_wq_worker();
+ target_func = target_work ? target_work->func : NULL;
WARN_ONCE(current->flags & PF_MEMALLOC,
"workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps",
@@ -3980,7 +3991,7 @@ void __flush_workqueue(struct workqueue_struct *wq)
list_add_tail(&this_flusher.list, &wq->flusher_overflow);
}
- check_flush_dependency(wq, NULL);
+ check_flush_dependency(wq, NULL, false);
mutex_unlock(&wq->mutex);
@@ -4155,7 +4166,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
}
wq = pwq->wq;
- check_flush_dependency(wq, work);
+ check_flush_dependency(wq, work, from_cancel);
insert_wq_barrier(pwq, barr, work, worker);
raw_spin_unlock_irq(&pool->lock);
@@ -5641,6 +5652,7 @@ static void wq_adjust_max_active(struct workqueue_struct *wq)
} while (activated);
}
+__printf(1, 0)
static struct workqueue_struct *__alloc_workqueue(const char *fmt,
unsigned int flags,
int max_active, va_list args)
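__printf(1, 0) is the kernel spelling of GCC's format attribute: argument 1 is a printf-style format string, and the 0 says there is no variadic argument list to type-check because the values arrive as a va_list, exactly the situation for __alloc_workqueue(), which receives args from its alloc_workqueue() callers. A small sketch of the convention (my_log_vprintf/my_log_printf are illustrative):

#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/stdarg.h>

/* Takes a va_list: argument checking happens in the varargs wrapper
 * below, so the second attribute index is 0. */
__printf(1, 0)
static void my_log_vprintf(const char *fmt, va_list args)
{
	vprintk(fmt, args);
}

__printf(1, 2)
static void my_log_printf(const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	my_log_vprintf(fmt, args);
	va_end(args);
}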
@@ -7828,7 +7840,7 @@ static void __init wq_cpu_intensive_thresh_init(void)
unsigned long thresh;
unsigned long bogo;
- pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release");
+ pwq_release_worker = kthread_run_worker(0, "pool_workqueue_release");
BUG_ON(IS_ERR(pwq_release_worker));
/* if the user set it to a specific value, keep it */