| | | |
|---|---|---|
| author | Ingo Molnar <mingo@kernel.org> | 2016-07-07 10:35:28 +0200 |
| committer | Ingo Molnar <mingo@kernel.org> | 2016-07-07 10:35:28 +0200 |
| commit | 4b4b20852d1009c5e8bc357b22353b62e3a241c7 (patch) | |
| tree | 1026418471fe10c5b9f2fdff8a6b49bf070938fc /kernel | |
| parent | 5130213721d01b6632c255d4295a8102cbb58379 (diff) | |
| parent | f00c0afdfa625165a609513bc74164d56752ec3e (diff) | |
Merge branch 'timers/fast-wheel' into timers/core
Diffstat (limited to 'kernel')
| mode | path | lines |
|---|---|---|
| -rw-r--r-- | kernel/audit.c | 17 |
| -rw-r--r-- | kernel/audit.h | 4 |
| -rw-r--r-- | kernel/auditsc.c | 8 |
| -rw-r--r-- | kernel/bpf/verifier.c | 41 |
| -rw-r--r-- | kernel/cgroup.c | 148 |
| -rw-r--r-- | kernel/events/core.c | 8 |
| -rw-r--r-- | kernel/fork.c | 50 |
| -rw-r--r-- | kernel/futex.c | 14 |
| -rw-r--r-- | kernel/jump_label.c | 36 |
| -rw-r--r-- | kernel/kcov.c | 7 |
| -rw-r--r-- | kernel/locking/mutex-debug.c | 12 |
| -rw-r--r-- | kernel/locking/mutex-debug.h | 4 |
| -rw-r--r-- | kernel/locking/mutex.c | 15 |
| -rw-r--r-- | kernel/locking/mutex.h | 2 |
| -rw-r--r-- | kernel/locking/qspinlock.c | 60 |
| -rw-r--r-- | kernel/power/process.c | 12 |
| -rw-r--r-- | kernel/relay.c | 1 |
| -rw-r--r-- | kernel/sched/core.c | 42 |
| -rw-r--r-- | kernel/sched/debug.c | 15 |
| -rw-r--r-- | kernel/sched/fair.c | 72 |
| -rw-r--r-- | kernel/sched/idle.c | 2 |
| -rw-r--r-- | kernel/sched/sched.h | 2 |
| -rw-r--r-- | kernel/sched/stats.h | 3 |
| -rw-r--r-- | kernel/signal.c | 24 |
| -rw-r--r-- | kernel/time/tick-internal.h | 1 |
| -rw-r--r-- | kernel/time/tick-sched.c | 45 |
| -rw-r--r-- | kernel/time/timer.c | 1103 |
| -rw-r--r-- | kernel/trace/bpf_trace.c | 14 |
| -rw-r--r-- | kernel/trace/trace_printk.c | 7 |
29 files changed, 1077 insertions, 692 deletions
| diff --git a/kernel/audit.c b/kernel/audit.c index 22bb4f24f071..8d528f9930da 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1883,6 +1883,23 @@ out_null:  	audit_log_format(ab, " exe=(null)");  } +struct tty_struct *audit_get_tty(struct task_struct *tsk) +{ +	struct tty_struct *tty = NULL; +	unsigned long flags; + +	spin_lock_irqsave(&tsk->sighand->siglock, flags); +	if (tsk->signal) +		tty = tty_kref_get(tsk->signal->tty); +	spin_unlock_irqrestore(&tsk->sighand->siglock, flags); +	return tty; +} + +void audit_put_tty(struct tty_struct *tty) +{ +	tty_kref_put(tty); +} +  void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)  {  	const struct cred *cred; diff --git a/kernel/audit.h b/kernel/audit.h index cbbe6bb6496e..a492f4c4e710 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -23,6 +23,7 @@  #include <linux/audit.h>  #include <linux/skbuff.h>  #include <uapi/linux/mqueue.h> +#include <linux/tty.h>  /* AUDIT_NAMES is the number of slots we reserve in the audit_context   * for saving names from getname().  If we get more names we will allocate @@ -262,6 +263,9 @@ extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);  extern void audit_log_d_path_exe(struct audit_buffer *ab,  				 struct mm_struct *mm); +extern struct tty_struct *audit_get_tty(struct task_struct *tsk); +extern void audit_put_tty(struct tty_struct *tty); +  /* audit watch functions */  #ifdef CONFIG_AUDIT_WATCH  extern void audit_put_watch(struct audit_watch *watch); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 62ab53d7619c..2672d105cffc 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -63,7 +63,6 @@  #include <asm/unistd.h>  #include <linux/security.h>  #include <linux/list.h> -#include <linux/tty.h>  #include <linux/binfmts.h>  #include <linux/highmem.h>  #include <linux/syscalls.h> @@ -1985,14 +1984,15 @@ static void audit_log_set_loginuid(kuid_t koldloginuid, kuid_t kloginuid,  	if (!audit_enabled)  		return; +	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); +	if (!ab) +		return; +  	uid = from_kuid(&init_user_ns, task_uid(current));  	oldloginuid = from_kuid(&init_user_ns, koldloginuid);  	loginuid = from_kuid(&init_user_ns, kloginuid),  	tty = audit_get_tty(current); -	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); -	if (!ab) -		return;  	audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid);  	audit_log_task_context(ab);  	audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d", diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 668e07903c8f..eec9f90ba030 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -126,31 +126,6 @@   * are set to NOT_INIT to indicate that they are no longer readable.   
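The audit_get_tty()/audit_put_tty() helpers added to kernel/audit.c above pair a siglock-protected tty_kref_get() with a tty_kref_put(). A minimal sketch of the intended calling pattern; the wrapper function and its format string are invented for illustration, only the two helpers come from the patch:

```c
/* Illustrative caller; log_tsk_tty() is hypothetical, not part of the patch. */
static void log_tsk_tty(struct audit_buffer *ab, struct task_struct *tsk)
{
	struct tty_struct *tty = audit_get_tty(tsk);	/* takes a tty reference under siglock */

	audit_log_format(ab, " tty=%s", tty ? tty_name(tty) : "(none)");
	audit_put_tty(tty);				/* tty_kref_put() tolerates NULL */
}
```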
*/ -/* types of values stored in eBPF registers */ -enum bpf_reg_type { -	NOT_INIT = 0,		 /* nothing was written into register */ -	UNKNOWN_VALUE,		 /* reg doesn't contain a valid pointer */ -	PTR_TO_CTX,		 /* reg points to bpf_context */ -	CONST_PTR_TO_MAP,	 /* reg points to struct bpf_map */ -	PTR_TO_MAP_VALUE,	 /* reg points to map element value */ -	PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ -	FRAME_PTR,		 /* reg == frame_pointer */ -	PTR_TO_STACK,		 /* reg == frame_pointer + imm */ -	CONST_IMM,		 /* constant integer value */ - -	/* PTR_TO_PACKET represents: -	 * skb->data -	 * skb->data + imm -	 * skb->data + (u16) var -	 * skb->data + (u16) var + imm -	 * if (range > 0) then [ptr, ptr + range - off) is safe to access -	 * if (id > 0) means that some 'var' was added -	 * if (off > 0) menas that 'imm' was added -	 */ -	PTR_TO_PACKET, -	PTR_TO_PACKET_END,	 /* skb->data + headlen */ -}; -  struct reg_state {  	enum bpf_reg_type type;  	union { @@ -695,10 +670,10 @@ static int check_packet_access(struct verifier_env *env, u32 regno, int off,  /* check access to 'struct bpf_context' fields */  static int check_ctx_access(struct verifier_env *env, int off, int size, -			    enum bpf_access_type t) +			    enum bpf_access_type t, enum bpf_reg_type *reg_type)  {  	if (env->prog->aux->ops->is_valid_access && -	    env->prog->aux->ops->is_valid_access(off, size, t)) { +	    env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) {  		/* remember the offset of last byte accessed in ctx */  		if (env->prog->aux->max_ctx_offset < off + size)  			env->prog->aux->max_ctx_offset = off + size; @@ -798,21 +773,19 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,  			mark_reg_unknown_value(state->regs, value_regno);  	} else if (reg->type == PTR_TO_CTX) { +		enum bpf_reg_type reg_type = UNKNOWN_VALUE; +  		if (t == BPF_WRITE && value_regno >= 0 &&  		    is_pointer_value(env, value_regno)) {  			verbose("R%d leaks addr into ctx\n", value_regno);  			return -EACCES;  		} -		err = check_ctx_access(env, off, size, t); +		err = check_ctx_access(env, off, size, t, ®_type);  		if (!err && t == BPF_READ && value_regno >= 0) {  			mark_reg_unknown_value(state->regs, value_regno); -			if (off == offsetof(struct __sk_buff, data) && -			    env->allow_ptr_leaks) +			if (env->allow_ptr_leaks)  				/* note that reg.[id|off|range] == 0 */ -				state->regs[value_regno].type = PTR_TO_PACKET; -			else if (off == offsetof(struct __sk_buff, data_end) && -				 env->allow_ptr_leaks) -				state->regs[value_regno].type = PTR_TO_PACKET_END; +				state->regs[value_regno].type = reg_type;  		}  	} else if (reg->type == FRAME_PTR || reg->type == PTR_TO_STACK) { diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 86cb5c6e8932..75c0ff00aca6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -837,6 +837,8 @@ static void put_css_set_locked(struct css_set *cset)  static void put_css_set(struct css_set *cset)  { +	unsigned long flags; +  	/*  	 * Ensure that the refcount doesn't hit zero while any readers  	 * can see it. 
Similar to atomic_dec_and_lock(), but for an @@ -845,9 +847,9 @@ static void put_css_set(struct css_set *cset)  	if (atomic_add_unless(&cset->refcount, -1, 1))  		return; -	spin_lock_bh(&css_set_lock); +	spin_lock_irqsave(&css_set_lock, flags);  	put_css_set_locked(cset); -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irqrestore(&css_set_lock, flags);  }  /* @@ -1070,11 +1072,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,  	/* First see if we already have a cgroup group that matches  	 * the desired set */ -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	cset = find_existing_css_set(old_cset, cgrp, template);  	if (cset)  		get_css_set(cset); -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	if (cset)  		return cset; @@ -1102,7 +1104,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,  	 * find_existing_css_set() */  	memcpy(cset->subsys, template, sizeof(cset->subsys)); -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	/* Add reference counts and links from the new css_set. */  	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {  		struct cgroup *c = link->cgrp; @@ -1128,7 +1130,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,  		css_get(css);  	} -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	return cset;  } @@ -1192,7 +1194,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)  	 * Release all the links from cset_links to this hierarchy's  	 * root cgroup  	 */ -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {  		list_del(&link->cset_link); @@ -1200,7 +1202,7 @@ static void cgroup_destroy_root(struct cgroup_root *root)  		kfree(link);  	} -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	if (!list_empty(&root->root_list)) {  		list_del(&root->root_list); @@ -1600,11 +1602,11 @@ static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)  		ss->root = dst_root;  		css->cgroup = dcgrp; -		spin_lock_bh(&css_set_lock); +		spin_lock_irq(&css_set_lock);  		hash_for_each(css_set_table, i, cset, hlist)  			list_move_tail(&cset->e_cset_node[ss->id],  				       &dcgrp->e_csets[ss->id]); -		spin_unlock_bh(&css_set_lock); +		spin_unlock_irq(&css_set_lock);  		/* default hierarchy doesn't enable controllers by default */  		dst_root->subsys_mask |= 1 << ssid; @@ -1640,10 +1642,10 @@ static int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,  	if (!buf)  		return -ENOMEM; -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	ns_cgroup = current_cgns_cgroup_from_root(kf_cgroot);  	len = kernfs_path_from_node(kf_node, ns_cgroup->kn, buf, PATH_MAX); -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	if (len >= PATH_MAX)  		len = -ERANGE; @@ -1897,7 +1899,7 @@ static void cgroup_enable_task_cg_lists(void)  {  	struct task_struct *p, *g; -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	if (use_task_css_set_links)  		goto out_unlock; @@ -1922,8 +1924,12 @@ static void cgroup_enable_task_cg_lists(void)  		 * entry won't be deleted though the process has exited.  		 * Do it while holding siglock so that we don't end up  		 * racing against cgroup_exit(). +		 * +		 * Interrupts were already disabled while acquiring +		 * the css_set_lock, so we do not need to disable it +		 * again when acquiring the sighand->siglock here.  		 
*/ -		spin_lock_irq(&p->sighand->siglock); +		spin_lock(&p->sighand->siglock);  		if (!(p->flags & PF_EXITING)) {  			struct css_set *cset = task_css_set(p); @@ -1932,11 +1938,11 @@ static void cgroup_enable_task_cg_lists(void)  			list_add_tail(&p->cg_list, &cset->tasks);  			get_css_set(cset);  		} -		spin_unlock_irq(&p->sighand->siglock); +		spin_unlock(&p->sighand->siglock);  	} while_each_thread(g, p);  	read_unlock(&tasklist_lock);  out_unlock: -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  }  static void init_cgroup_housekeeping(struct cgroup *cgrp) @@ -2043,13 +2049,13 @@ static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)  	 * Link the root cgroup in this hierarchy into all the css_set  	 * objects.  	 */ -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	hash_for_each(css_set_table, i, cset, hlist) {  		link_css_set(&tmp_links, cset, root_cgrp);  		if (css_set_populated(cset))  			cgroup_update_populated(root_cgrp, true);  	} -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	BUG_ON(!list_empty(&root_cgrp->self.children));  	BUG_ON(atomic_read(&root->nr_cgrps) != 1); @@ -2256,11 +2262,11 @@ out_mount:  		struct cgroup *cgrp;  		mutex_lock(&cgroup_mutex); -		spin_lock_bh(&css_set_lock); +		spin_lock_irq(&css_set_lock);  		cgrp = cset_cgroup_from_root(ns->root_cset, root); -		spin_unlock_bh(&css_set_lock); +		spin_unlock_irq(&css_set_lock);  		mutex_unlock(&cgroup_mutex);  		nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb); @@ -2337,11 +2343,11 @@ char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,  	char *ret;  	mutex_lock(&cgroup_mutex); -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns); -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	mutex_unlock(&cgroup_mutex);  	return ret; @@ -2369,7 +2375,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)  	char *path = NULL;  	mutex_lock(&cgroup_mutex); -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id); @@ -2382,7 +2388,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)  			path = buf;  	} -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	mutex_unlock(&cgroup_mutex);  	return path;  } @@ -2557,7 +2563,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,  	 * the new cgroup.  There are no failure cases after here, so this  	 * is the commit point.  	 */ -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	list_for_each_entry(cset, &tset->src_csets, mg_node) {  		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {  			struct css_set *from_cset = task_css_set(task); @@ -2568,7 +2574,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,  			put_css_set_locked(from_cset);  		}  	} -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	/*  	 * Migration is committed, all target tasks are now on dst_csets. 
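The comment above captures a general locking rule: once an outer lock has been taken with interrupts disabled, nested spinlocks taken on the same CPU can use the plain spin_lock()/spin_unlock() variants. A minimal sketch of that nesting, with made-up lock names standing in for css_set_lock and sighand->siglock:

```c
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(outer_lock);	/* stands in for css_set_lock */
static DEFINE_SPINLOCK(inner_lock);	/* stands in for sighand->siglock */

static void nested_lock_example(void)
{
	spin_lock_irq(&outer_lock);	/* disables interrupts on this CPU */

	/*
	 * Interrupts are already disabled by the outer _irq acquisition, so
	 * the nested lock can use the plain variant, exactly as the patch
	 * does for sighand->siglock under css_set_lock.
	 */
	spin_lock(&inner_lock);
	/* ... work on data protected by both locks ... */
	spin_unlock(&inner_lock);

	spin_unlock_irq(&outer_lock);	/* re-enables interrupts */
}
```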
@@ -2597,13 +2603,13 @@ out_cancel_attach:  		}  	} while_each_subsys_mask();  out_release_tset: -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	list_splice_init(&tset->dst_csets, &tset->src_csets);  	list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {  		list_splice_tail_init(&cset->mg_tasks, &cset->tasks);  		list_del_init(&cset->mg_node);  	} -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	return ret;  } @@ -2634,7 +2640,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)  	lockdep_assert_held(&cgroup_mutex); -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {  		cset->mg_src_cgrp = NULL;  		cset->mg_dst_cgrp = NULL; @@ -2642,7 +2648,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)  		list_del_init(&cset->mg_preload_node);  		put_css_set_locked(cset);  	} -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  }  /** @@ -2783,7 +2789,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,  	 * already PF_EXITING could be freed from underneath us unless we  	 * take an rcu_read_lock.  	 */ -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	rcu_read_lock();  	task = leader;  	do { @@ -2792,7 +2798,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,  			break;  	} while_each_thread(leader, task);  	rcu_read_unlock(); -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	return cgroup_taskset_migrate(&tset, root);  } @@ -2816,7 +2822,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,  		return -EBUSY;  	/* look up all src csets */ -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	rcu_read_lock();  	task = leader;  	do { @@ -2826,7 +2832,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,  			break;  	} while_each_thread(leader, task);  	rcu_read_unlock(); -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	/* prepare dst csets and commit */  	ret = cgroup_migrate_prepare_dst(&preloaded_csets); @@ -2859,9 +2865,9 @@ static int cgroup_procs_write_permission(struct task_struct *task,  		struct cgroup *cgrp;  		struct inode *inode; -		spin_lock_bh(&css_set_lock); +		spin_lock_irq(&css_set_lock);  		cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); -		spin_unlock_bh(&css_set_lock); +		spin_unlock_irq(&css_set_lock);  		while (!cgroup_is_descendant(dst_cgrp, cgrp))  			cgrp = cgroup_parent(cgrp); @@ -2962,9 +2968,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)  		if (root == &cgrp_dfl_root)  			continue; -		spin_lock_bh(&css_set_lock); +		spin_lock_irq(&css_set_lock);  		from_cgrp = task_cgroup_from_root(from, root); -		spin_unlock_bh(&css_set_lock); +		spin_unlock_irq(&css_set_lock);  		retval = cgroup_attach_task(from_cgrp, tsk, false);  		if (retval) @@ -3080,7 +3086,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)  	percpu_down_write(&cgroup_threadgroup_rwsem);  	/* look up all csses currently attached to @cgrp's subtree */ -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {  		struct cgrp_cset_link *link; @@ -3088,14 +3094,14 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)  			cgroup_migrate_add_src(link->cset, dsct,  					       &preloaded_csets);  	} -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	/* NULL dst indicates 
self on default hierarchy */  	ret = cgroup_migrate_prepare_dst(&preloaded_csets);  	if (ret)  		goto out_finish; -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {  		struct task_struct *task, *ntask; @@ -3107,7 +3113,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)  		list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)  			cgroup_taskset_add(task, &tset);  	} -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	ret = cgroup_taskset_migrate(&tset, cgrp->root);  out_finish: @@ -3908,10 +3914,10 @@ static int cgroup_task_count(const struct cgroup *cgrp)  	int count = 0;  	struct cgrp_cset_link *link; -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	list_for_each_entry(link, &cgrp->cset_links, cset_link)  		count += atomic_read(&link->cset->refcount); -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	return count;  } @@ -4249,7 +4255,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css,  	memset(it, 0, sizeof(*it)); -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	it->ss = css->ss; @@ -4262,7 +4268,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css,  	css_task_iter_advance_css_set(it); -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  }  /** @@ -4280,7 +4286,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)  		it->cur_task = NULL;  	} -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	if (it->task_pos) {  		it->cur_task = list_entry(it->task_pos, struct task_struct, @@ -4289,7 +4295,7 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)  		css_task_iter_advance(it);  	} -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	return it->cur_task;  } @@ -4303,10 +4309,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)  void css_task_iter_end(struct css_task_iter *it)  {  	if (it->cur_cset) { -		spin_lock_bh(&css_set_lock); +		spin_lock_irq(&css_set_lock);  		list_del(&it->iters_node);  		put_css_set_locked(it->cur_cset); -		spin_unlock_bh(&css_set_lock); +		spin_unlock_irq(&css_set_lock);  	}  	if (it->cur_task) @@ -4338,10 +4344,10 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)  	mutex_lock(&cgroup_mutex);  	/* all tasks in @from are being moved, all csets are source */ -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	list_for_each_entry(link, &from->cset_links, cset_link)  		cgroup_migrate_add_src(link->cset, to, &preloaded_csets); -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	ret = cgroup_migrate_prepare_dst(&preloaded_csets);  	if (ret) @@ -5063,6 +5069,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,  	memset(css, 0, sizeof(*css));  	css->cgroup = cgrp;  	css->ss = ss; +	css->id = -1;  	INIT_LIST_HEAD(&css->sibling);  	INIT_LIST_HEAD(&css->children);  	css->serial_nr = css_serial_nr_next++; @@ -5150,7 +5157,7 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,  	err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);  	if (err < 0) -		goto err_free_percpu_ref; +		goto err_free_css;  	css->id = err;  	/* @css is ready to be brought online now, make it visible */ @@ -5174,9 +5181,6 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,  err_list_del:  	list_del_rcu(&css->sibling); -	cgroup_idr_remove(&ss->css_idr, css->id); -err_free_percpu_ref: -	percpu_ref_exit(&css->refcnt);  
err_free_css:  	call_rcu(&css->rcu_head, css_free_rcu_fn);  	return ERR_PTR(err); @@ -5451,10 +5455,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)  	 */  	cgrp->self.flags &= ~CSS_ONLINE; -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	list_for_each_entry(link, &cgrp->cset_links, cset_link)  		link->cset->dead = true; -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	/* initiate massacre of all css's */  	for_each_css(css, ssid, cgrp) @@ -5725,7 +5729,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,  		goto out;  	mutex_lock(&cgroup_mutex); -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	for_each_root(root) {  		struct cgroup_subsys *ss; @@ -5778,7 +5782,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,  	retval = 0;  out_unlock: -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	mutex_unlock(&cgroup_mutex);  	kfree(buf);  out: @@ -5923,13 +5927,13 @@ void cgroup_post_fork(struct task_struct *child)  	if (use_task_css_set_links) {  		struct css_set *cset; -		spin_lock_bh(&css_set_lock); +		spin_lock_irq(&css_set_lock);  		cset = task_css_set(current);  		if (list_empty(&child->cg_list)) {  			get_css_set(cset);  			css_set_move_task(child, NULL, cset, false);  		} -		spin_unlock_bh(&css_set_lock); +		spin_unlock_irq(&css_set_lock);  	}  	/* @@ -5974,9 +5978,9 @@ void cgroup_exit(struct task_struct *tsk)  	cset = task_css_set(tsk);  	if (!list_empty(&tsk->cg_list)) { -		spin_lock_bh(&css_set_lock); +		spin_lock_irq(&css_set_lock);  		css_set_move_task(tsk, cset, NULL, false); -		spin_unlock_bh(&css_set_lock); +		spin_unlock_irq(&css_set_lock);  	} else {  		get_css_set(cset);  	} @@ -6044,9 +6048,9 @@ static void cgroup_release_agent(struct work_struct *work)  	if (!pathbuf || !agentbuf)  		goto out; -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	if (!path)  		goto out; @@ -6306,12 +6310,12 @@ struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,  		return ERR_PTR(-EPERM);  	mutex_lock(&cgroup_mutex); -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	cset = task_css_set(current);  	get_css_set(cset); -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	mutex_unlock(&cgroup_mutex);  	new_ns = alloc_cgroup_ns(); @@ -6435,7 +6439,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)  	if (!name_buf)  		return -ENOMEM; -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	rcu_read_lock();  	cset = rcu_dereference(current->cgroups);  	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { @@ -6446,7 +6450,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)  			   c->root->hierarchy_id, name_buf);  	}  	rcu_read_unlock(); -	spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	kfree(name_buf);  	return 0;  } @@ -6457,7 +6461,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)  	struct cgroup_subsys_state *css = seq_css(seq);  	struct cgrp_cset_link *link; -	spin_lock_bh(&css_set_lock); +	spin_lock_irq(&css_set_lock);  	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {  		struct css_set *cset = link->cset;  		struct task_struct *task; @@ -6480,7 +6484,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)  	overflow:  		seq_puts(seq, "  ...\n");  	} -	
spin_unlock_bh(&css_set_lock); +	spin_unlock_irq(&css_set_lock);  	return 0;  } diff --git a/kernel/events/core.c b/kernel/events/core.c index 274450efea90..85cd41878a74 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3862,10 +3862,8 @@ static void _free_event(struct perf_event *event)  	if (event->ctx)  		put_ctx(event->ctx); -	if (event->pmu) { -		exclusive_event_destroy(event); -		module_put(event->pmu->module); -	} +	exclusive_event_destroy(event); +	module_put(event->pmu->module);  	call_rcu(&event->rcu_head, free_event_rcu);  } @@ -7531,7 +7529,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event)  	prog = event->tp_event->prog;  	if (prog) {  		event->tp_event->prog = NULL; -		bpf_prog_put(prog); +		bpf_prog_put_rcu(prog);  	}  } diff --git a/kernel/fork.c b/kernel/fork.c index 5c2c355aa97f..4a7ec0c6c88c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -148,18 +148,18 @@ static inline void free_task_struct(struct task_struct *tsk)  }  #endif -void __weak arch_release_thread_info(struct thread_info *ti) +void __weak arch_release_thread_stack(unsigned long *stack)  {  } -#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR +#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR  /*   * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a   * kmemcache based allocator.   */  # if THREAD_SIZE >= PAGE_SIZE -static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,  						  int node)  {  	struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, @@ -172,33 +172,33 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,  	return page ? page_address(page) : NULL;  } -static inline void free_thread_info(struct thread_info *ti) +static inline void free_thread_stack(unsigned long *stack)  { -	struct page *page = virt_to_page(ti); +	struct page *page = virt_to_page(stack);  	memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK,  				    -(1 << THREAD_SIZE_ORDER));  	__free_kmem_pages(page, THREAD_SIZE_ORDER);  }  # else -static struct kmem_cache *thread_info_cache; +static struct kmem_cache *thread_stack_cache; -static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,  						  int node)  { -	return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node); +	return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);  } -static void free_thread_info(struct thread_info *ti) +static void free_thread_stack(unsigned long *stack)  { -	kmem_cache_free(thread_info_cache, ti); +	kmem_cache_free(thread_stack_cache, stack);  } -void thread_info_cache_init(void) +void thread_stack_cache_init(void)  { -	thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE, +	thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE,  					      THREAD_SIZE, 0, NULL); -	BUG_ON(thread_info_cache == NULL); +	BUG_ON(thread_stack_cache == NULL);  }  # endif  #endif @@ -221,9 +221,9 @@ struct kmem_cache *vm_area_cachep;  /* SLAB cache for mm_struct structures (tsk->mm) */  static struct kmem_cache *mm_cachep; -static void account_kernel_stack(struct thread_info *ti, int account) +static void account_kernel_stack(unsigned long *stack, int account)  { -	struct zone *zone = page_zone(virt_to_page(ti)); +	struct zone *zone = page_zone(virt_to_page(stack));  	mod_zone_page_state(zone, NR_KERNEL_STACK, account);  } @@ -231,8 +231,8 @@ static void account_kernel_stack(struct 
thread_info *ti, int account)  void free_task(struct task_struct *tsk)  {  	account_kernel_stack(tsk->stack, -1); -	arch_release_thread_info(tsk->stack); -	free_thread_info(tsk->stack); +	arch_release_thread_stack(tsk->stack); +	free_thread_stack(tsk->stack);  	rt_mutex_debug_task_free(tsk);  	ftrace_graph_exit_task(tsk);  	put_seccomp_filter(tsk); @@ -343,7 +343,7 @@ void set_task_stack_end_magic(struct task_struct *tsk)  static struct task_struct *dup_task_struct(struct task_struct *orig, int node)  {  	struct task_struct *tsk; -	struct thread_info *ti; +	unsigned long *stack;  	int err;  	if (node == NUMA_NO_NODE) @@ -352,15 +352,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)  	if (!tsk)  		return NULL; -	ti = alloc_thread_info_node(tsk, node); -	if (!ti) +	stack = alloc_thread_stack_node(tsk, node); +	if (!stack)  		goto free_tsk;  	err = arch_dup_task_struct(tsk, orig);  	if (err) -		goto free_ti; +		goto free_stack; -	tsk->stack = ti; +	tsk->stack = stack;  #ifdef CONFIG_SECCOMP  	/*  	 * We must handle setting up seccomp filters once we're under @@ -392,14 +392,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)  	tsk->task_frag.page = NULL;  	tsk->wake_q.next = NULL; -	account_kernel_stack(ti, 1); +	account_kernel_stack(stack, 1);  	kcov_task_init(tsk);  	return tsk; -free_ti: -	free_thread_info(ti); +free_stack: +	free_thread_stack(stack);  free_tsk:  	free_task_struct(tsk);  	return NULL; diff --git a/kernel/futex.c b/kernel/futex.c index ee25f5ba4aca..33664f70e2d2 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -469,7 +469,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)  {  	unsigned long address = (unsigned long)uaddr;  	struct mm_struct *mm = current->mm; -	struct page *page; +	struct page *page, *tail;  	struct address_space *mapping;  	int err, ro = 0; @@ -530,7 +530,15 @@ again:  	 * considered here and page lock forces unnecessarily serialization  	 * From this point on, mapping will be re-verified if necessary and  	 * page lock will be acquired only if it is unavoidable -	 */ +	 * +	 * Mapping checks require the head page for any compound page so the +	 * head page and mapping is looked up now. For anonymous pages, it +	 * does not matter if the page splits in the future as the key is +	 * based on the address. For filesystem-backed pages, the tail is +	 * required as the index of the page determines the key. For +	 * base pages, there is no tail page and tail == page. +	 */ +	tail = page;  	page = compound_head(page);  	mapping = READ_ONCE(page->mapping); @@ -654,7 +662,7 @@ again:  		key->both.offset |= FUT_OFF_INODE; /* inode-based key */  		key->shared.inode = inode; -		key->shared.pgoff = basepage_index(page); +		key->shared.pgoff = basepage_index(tail);  		rcu_read_unlock();  	} diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 05254eeb4b4e..4b353e0be121 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -58,13 +58,36 @@ static void jump_label_update(struct static_key *key);  void static_key_slow_inc(struct static_key *key)  { +	int v, v1; +  	STATIC_KEY_CHECK_USE(); -	if (atomic_inc_not_zero(&key->enabled)) -		return; + +	/* +	 * Careful if we get concurrent static_key_slow_inc() calls; +	 * later calls must wait for the first one to _finish_ the +	 * jump_label_update() process.  At the same time, however, +	 * the jump_label_update() call below wants to see +	 * static_key_enabled(&key) for jumps to be updated properly. 
+	 * +	 * So give a special meaning to negative key->enabled: it sends +	 * static_key_slow_inc() down the slow path, and it is non-zero +	 * so it counts as "enabled" in jump_label_update().  Note that +	 * atomic_inc_unless_negative() checks >= 0, so roll our own. +	 */ +	for (v = atomic_read(&key->enabled); v > 0; v = v1) { +		v1 = atomic_cmpxchg(&key->enabled, v, v + 1); +		if (likely(v1 == v)) +			return; +	}  	jump_label_lock(); -	if (atomic_inc_return(&key->enabled) == 1) +	if (atomic_read(&key->enabled) == 0) { +		atomic_set(&key->enabled, -1);  		jump_label_update(key); +		atomic_set(&key->enabled, 1); +	} else { +		atomic_inc(&key->enabled); +	}  	jump_label_unlock();  }  EXPORT_SYMBOL_GPL(static_key_slow_inc); @@ -72,6 +95,13 @@ EXPORT_SYMBOL_GPL(static_key_slow_inc);  static void __static_key_slow_dec(struct static_key *key,  		unsigned long rate_limit, struct delayed_work *work)  { +	/* +	 * The negative count check is valid even when a negative +	 * key->enabled is in use by static_key_slow_inc(); a +	 * __static_key_slow_dec() before the first static_key_slow_inc() +	 * returns is unbalanced, because all other static_key_slow_inc() +	 * instances block while the update is in progress. +	 */  	if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) {  		WARN(atomic_read(&key->enabled) < 0,  		     "jump label: negative count!\n"); diff --git a/kernel/kcov.c b/kernel/kcov.c index a02f2dddd1d7..8d44b3fea9d0 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -264,7 +264,12 @@ static const struct file_operations kcov_fops = {  static int __init kcov_init(void)  { -	if (!debugfs_create_file("kcov", 0600, NULL, NULL, &kcov_fops)) { +	/* +	 * The kcov debugfs file won't ever get removed and thus, +	 * there is no need to protect it against removal races. The +	 * use of debugfs_create_file_unsafe() is actually safe here. 
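The jump_label change above relies on an "increment only while strictly positive" loop, because atomic_inc_unless_negative() would also accept the transient -1 state used to mark an update in progress. A compilable userspace C11 analogue of that cmpxchg loop, purely to illustrate the pattern (the function name is invented for this sketch):

```c
#include <stdatomic.h>
#include <stdbool.h>

/* Returns true if the counter was > 0 and has been incremented. */
static bool inc_if_positive(atomic_int *enabled)
{
	int v = atomic_load(enabled);

	while (v > 0) {
		/*
		 * Try to move v -> v + 1. On failure, v is reloaded with the
		 * current value and the loop re-checks the > 0 condition,
		 * mirroring the atomic_cmpxchg() loop in static_key_slow_inc().
		 */
		if (atomic_compare_exchange_weak(enabled, &v, v + 1))
			return true;
	}
	return false;	/* zero or negative: caller must take the slow path */
}
```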
+	 */ +	if (!debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops)) {  		pr_err("failed to create kcov in debugfs\n");  		return -ENOMEM;  	} diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 3ef3736002d8..9c951fade415 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -49,21 +49,21 @@ void debug_mutex_free_waiter(struct mutex_waiter *waiter)  }  void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, -			    struct thread_info *ti) +			    struct task_struct *task)  {  	SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));  	/* Mark the current thread as blocked on the lock: */ -	ti->task->blocked_on = waiter; +	task->blocked_on = waiter;  }  void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, -			 struct thread_info *ti) +			 struct task_struct *task)  {  	DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); -	DEBUG_LOCKS_WARN_ON(waiter->task != ti->task); -	DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter); -	ti->task->blocked_on = NULL; +	DEBUG_LOCKS_WARN_ON(waiter->task != task); +	DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter); +	task->blocked_on = NULL;  	list_del_init(&waiter->list);  	waiter->task = NULL; diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index 0799fd3e4cfa..d06ae3bb46c5 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h @@ -20,9 +20,9 @@ extern void debug_mutex_wake_waiter(struct mutex *lock,  extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);  extern void debug_mutex_add_waiter(struct mutex *lock,  				   struct mutex_waiter *waiter, -				   struct thread_info *ti); +				   struct task_struct *task);  extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, -				struct thread_info *ti); +				struct task_struct *task);  extern void debug_mutex_unlock(struct mutex *lock);  extern void debug_mutex_init(struct mutex *lock, const char *name,  			     struct lock_class_key *key); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index e364b424b019..a70b90db3909 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -486,9 +486,6 @@ __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)  	if (!hold_ctx)  		return 0; -	if (unlikely(ctx == hold_ctx)) -		return -EALREADY; -  	if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&  	    (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {  #ifdef CONFIG_DEBUG_MUTEXES @@ -514,6 +511,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  	unsigned long flags;  	int ret; +	if (use_ww_ctx) { +		struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); +		if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) +			return -EALREADY; +	} +  	preempt_disable();  	mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); @@ -534,7 +537,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  		goto skip_wait;  	debug_mutex_lock_common(lock, &waiter); -	debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); +	debug_mutex_add_waiter(lock, &waiter, task);  	/* add waiting tasks to the end of the waitqueue (FIFO): */  	list_add_tail(&waiter.list, &lock->wait_list); @@ -581,7 +584,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,  	}  	__set_task_state(task, TASK_RUNNING); -	mutex_remove_waiter(lock, &waiter, current_thread_info()); +	mutex_remove_waiter(lock, &waiter, task);  	/* set it to 0 if there are no 
waiters left: */  	if (likely(list_empty(&lock->wait_list)))  		atomic_set(&lock->count, 0); @@ -602,7 +605,7 @@ skip_wait:  	return 0;  err: -	mutex_remove_waiter(lock, &waiter, task_thread_info(task)); +	mutex_remove_waiter(lock, &waiter, task);  	spin_unlock_mutex(&lock->wait_lock, flags);  	debug_mutex_free_waiter(&waiter);  	mutex_release(&lock->dep_map, 1, ip); diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 5cda397607f2..a68bae5e852a 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -13,7 +13,7 @@  		do { spin_lock(lock); (void)(flags); } while (0)  #define spin_unlock_mutex(lock, flags) \  		do { spin_unlock(lock); (void)(flags); } while (0) -#define mutex_remove_waiter(lock, waiter, ti) \ +#define mutex_remove_waiter(lock, waiter, task) \  		__list_del((waiter)->list.prev, (waiter)->list.next)  #ifdef CONFIG_MUTEX_SPIN_ON_OWNER diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index ce2f75e32ae1..5fc8c311b8fe 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -267,6 +267,66 @@ static __always_inline u32  __pv_wait_head_or_lock(struct qspinlock *lock,  #define queued_spin_lock_slowpath	native_queued_spin_lock_slowpath  #endif +/* + * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before + * issuing an _unordered_ store to set _Q_LOCKED_VAL. + * + * This means that the store can be delayed, but no later than the + * store-release from the unlock. This means that simply observing + * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired. + * + * There are two paths that can issue the unordered store: + * + *  (1) clear_pending_set_locked():	*,1,0 -> *,0,1 + * + *  (2) set_locked():			t,0,0 -> t,0,1 ; t != 0 + *      atomic_cmpxchg_relaxed():	t,0,0 -> 0,0,1 + * + * However, in both cases we have other !0 state we've set before to queue + * ourseves: + * + * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our + * load is constrained by that ACQUIRE to not pass before that, and thus must + * observe the store. + * + * For (2) we have a more intersting scenario. We enqueue ourselves using + * xchg_tail(), which ends up being a RELEASE. This in itself is not + * sufficient, however that is followed by an smp_cond_acquire() on the same + * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and + * guarantees we must observe that store. + * + * Therefore both cases have other !0 state that is observable before the + * unordered locked byte store comes through. This means we can use that to + * wait for the lock store, and then wait for an unlock. 
+ */ +#ifndef queued_spin_unlock_wait +void queued_spin_unlock_wait(struct qspinlock *lock) +{ +	u32 val; + +	for (;;) { +		val = atomic_read(&lock->val); + +		if (!val) /* not locked, we're done */ +			goto done; + +		if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */ +			break; + +		/* not locked, but pending, wait until we observe the lock */ +		cpu_relax(); +	} + +	/* any unlock is good */ +	while (atomic_read(&lock->val) & _Q_LOCKED_MASK) +		cpu_relax(); + +done: +	smp_rmb(); /* CTRL + RMB -> ACQUIRE */ +} +EXPORT_SYMBOL(queued_spin_unlock_wait); +#endif +  #endif /* _GEN_PV_LOCK_SLOWPATH */  /** diff --git a/kernel/power/process.c b/kernel/power/process.c index df058bed53ce..0c2ee9761d57 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -146,6 +146,18 @@ int freeze_processes(void)  	if (!error && !oom_killer_disable())  		error = -EBUSY; +	/* +	 * There is a hard to fix race between oom_reaper kernel thread +	 * and oom_killer_disable. oom_reaper calls exit_oom_victim +	 * before the victim reaches exit_mm so try to freeze all the tasks +	 * again and catch such a left over task. +	 */ +	if (!error) { +		pr_info("Double checking all user space processes after OOM killer disable... "); +		error = try_to_freeze_tasks(true); +		pr_cont("\n"); +	} +  	if (error)  		thaw_processes();  	return error; diff --git a/kernel/relay.c b/kernel/relay.c index 074994bcfa9b..04d7cf3ef8cf 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -614,6 +614,7 @@ free_bufs:  	kref_put(&chan->kref, relay_destroy_channel);  	mutex_unlock(&relay_channels_mutex); +	kfree(chan);  	return NULL;  }  EXPORT_SYMBOL_GPL(relay_open); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f2cae4620c7..51d7105f529a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1536,7 +1536,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)  	for (;;) {  		/* Any allowed, online CPU? */  		for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) { -			if (!cpu_active(dest_cpu)) +			if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu)) +				continue; +			if (!cpu_online(dest_cpu))  				continue;  			goto out;  		} @@ -2253,9 +2255,11 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,  #endif  #endif +#ifdef CONFIG_SCHEDSTATS +  DEFINE_STATIC_KEY_FALSE(sched_schedstats); +static bool __initdata __sched_schedstats = false; -#ifdef CONFIG_SCHEDSTATS  static void set_schedstats(bool enabled)  {  	if (enabled) @@ -2278,11 +2282,16 @@ static int __init setup_schedstats(char *str)  	if (!str)  		goto out; +	/* +	 * This code is called before jump labels have been set up, so we can't +	 * change the static branch directly just yet.  Instead set a temporary +	 * variable so init_schedstats() can do it later. 
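The schedstats comment above describes a common early-boot pattern: __setup() handlers run before jump labels are initialized, so the boot parameter is only recorded in an __initdata variable and applied later (the patch does this from sched_init() via init_schedstats()). A minimal sketch of the same "record early, apply late" pattern with invented names:

```c
#include <linux/init.h>
#include <linux/jump_label.h>
#include <linux/string.h>

/* Hypothetical example; only the pattern mirrors the patch. */
static DEFINE_STATIC_KEY_FALSE(example_key);
static bool __initdata example_requested;

static int __init example_setup(char *str)
{
	/* Too early to touch static keys here; just remember the request. */
	example_requested = str && !strcmp(str, "enable");
	return 1;
}
__setup("example=", example_setup);

static void __init example_init(void)
{
	/* Called later in boot, once jump labels are functional. */
	if (example_requested)
		static_branch_enable(&example_key);
}
```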
+	 */  	if (!strcmp(str, "enable")) { -		set_schedstats(true); +		__sched_schedstats = true;  		ret = 1;  	} else if (!strcmp(str, "disable")) { -		set_schedstats(false); +		__sched_schedstats = false;  		ret = 1;  	}  out: @@ -2293,6 +2302,11 @@ out:  }  __setup("schedstats=", setup_schedstats); +static void __init init_schedstats(void) +{ +	set_schedstats(__sched_schedstats); +} +  #ifdef CONFIG_PROC_SYSCTL  int sysctl_schedstats(struct ctl_table *table, int write,  			 void __user *buffer, size_t *lenp, loff_t *ppos) @@ -2313,8 +2327,10 @@ int sysctl_schedstats(struct ctl_table *table, int write,  		set_schedstats(state);  	return err;  } -#endif -#endif +#endif /* CONFIG_PROC_SYSCTL */ +#else  /* !CONFIG_SCHEDSTATS */ +static inline void init_schedstats(void) {} +#endif /* CONFIG_SCHEDSTATS */  /*   * fork()/clone()-time setup: @@ -2521,10 +2537,9 @@ void wake_up_new_task(struct task_struct *p)  	 */  	set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));  #endif -	/* Post initialize new task's util average when its cfs_rq is set */ +	rq = __task_rq_lock(p, &rf);  	post_init_entity_util_avg(&p->se); -	rq = __task_rq_lock(p, &rf);  	activate_task(rq, p, 0);  	p->on_rq = TASK_ON_RQ_QUEUED;  	trace_sched_wakeup_new(p); @@ -3156,7 +3171,8 @@ static noinline void __schedule_bug(struct task_struct *prev)  static inline void schedule_debug(struct task_struct *prev)  {  #ifdef CONFIG_SCHED_STACK_END_CHECK -	BUG_ON(task_stack_end_corrupted(prev)); +	if (task_stack_end_corrupted(prev)) +		panic("corrupted stack end detected inside scheduler\n");  #endif  	if (unlikely(in_atomic_preempt_off())) { @@ -5133,14 +5149,16 @@ void show_state_filter(unsigned long state_filter)  		/*  		 * reset the NMI-timeout, listing all files on a slow  		 * console might take a lot of time: +		 * Also, reset softlockup watchdogs on all CPUs, because +		 * another CPU might be blocked waiting for us to process +		 * an IPI.  		 */  		touch_nmi_watchdog(); +		touch_all_softlockup_watchdogs();  		if (!state_filter || (p->state & state_filter))  			sched_show_task(p);  	} -	touch_all_softlockup_watchdogs(); -  #ifdef CONFIG_SCHED_DEBUG  	if (!state_filter)  		sysrq_sched_debug_show(); @@ -7487,6 +7505,8 @@ void __init sched_init(void)  #endif  	init_sched_fair_class(); +	init_schedstats(); +  	scheduler_running = 1;  } diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index cf905f655ba1..0368c393a336 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -427,19 +427,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)  		SPLIT_NS(p->se.vruntime),  		(long long)(p->nvcsw + p->nivcsw),  		p->prio); -#ifdef CONFIG_SCHEDSTATS -	if (schedstat_enabled()) { -		SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", -			SPLIT_NS(p->se.statistics.wait_sum), -			SPLIT_NS(p->se.sum_exec_runtime), -			SPLIT_NS(p->se.statistics.sum_sleep_runtime)); -	} -#else +  	SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", -		0LL, 0L, +		SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)),  		SPLIT_NS(p->se.sum_exec_runtime), -		0LL, 0L); -#endif +		SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime))); +  #ifdef CONFIG_NUMA_BALANCING  	SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));  #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 218f8e83db73..bdcbeea90c95 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2904,6 +2904,23 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)  	}  } +/* + * Unsigned subtract and clamp on underflow. 
+ * + * Explicitly do a load-store to ensure the intermediate value never hits + * memory. This allows lockless observations without ever seeing the negative + * values. + */ +#define sub_positive(_ptr, _val) do {				\ +	typeof(_ptr) ptr = (_ptr);				\ +	typeof(*ptr) val = (_val);				\ +	typeof(*ptr) res, var = READ_ONCE(*ptr);		\ +	res = var - val;					\ +	if (res > var)						\ +		res = 0;					\ +	WRITE_ONCE(*ptr, res);					\ +} while (0) +  /* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */  static inline int  update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) @@ -2913,15 +2930,15 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)  	if (atomic_long_read(&cfs_rq->removed_load_avg)) {  		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); -		sa->load_avg = max_t(long, sa->load_avg - r, 0); -		sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); +		sub_positive(&sa->load_avg, r); +		sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);  		removed_load = 1;  	}  	if (atomic_long_read(&cfs_rq->removed_util_avg)) {  		long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0); -		sa->util_avg = max_t(long, sa->util_avg - r, 0); -		sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0); +		sub_positive(&sa->util_avg, r); +		sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);  		removed_util = 1;  	} @@ -2994,10 +3011,10 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s  			  &se->avg, se->on_rq * scale_load_down(se->load.weight),  			  cfs_rq->curr == se, NULL); -	cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0); -	cfs_rq->avg.load_sum = max_t(s64,  cfs_rq->avg.load_sum - se->avg.load_sum, 0); -	cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0); -	cfs_rq->avg.util_sum = max_t(s32,  cfs_rq->avg.util_sum - se->avg.util_sum, 0); +	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); +	sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); +	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); +	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);  	cfs_rq_util_change(cfs_rq);  } @@ -3246,7 +3263,7 @@ static inline void check_schedstat_required(void)  			trace_sched_stat_iowait_enabled()  ||  			trace_sched_stat_blocked_enabled() ||  			trace_sched_stat_runtime_enabled())  { -		pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, " +		printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "  			     "stat_blocked and stat_runtime require the "  			     "kernel parameter schedstats=enabled or "  			     "kernel.sched_schedstats=1\n"); @@ -4185,6 +4202,26 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)  	if (!cfs_bandwidth_used())  		return; +	/* Synchronize hierarchical throttle counter: */ +	if (unlikely(!cfs_rq->throttle_uptodate)) { +		struct rq *rq = rq_of(cfs_rq); +		struct cfs_rq *pcfs_rq; +		struct task_group *tg; + +		cfs_rq->throttle_uptodate = 1; + +		/* Get closest up-to-date node, because leaves go first: */ +		for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) { +			pcfs_rq = tg->cfs_rq[cpu_of(rq)]; +			if (pcfs_rq->throttle_uptodate) +				break; +		} +		if (tg) { +			cfs_rq->throttle_count = pcfs_rq->throttle_count; +			cfs_rq->throttled_clock_task = rq_clock_task(rq); +		} +	} +  	/* an active group must be handled by the update_curr()->put() path */  	if (!cfs_rq->runtime_enabled || cfs_rq->curr)  		return; @@ -4500,15 +4537,14 @@ static void dequeue_task_fair(struct rq *rq, 
struct task_struct *p, int flags)  		/* Don't dequeue parent if it has other entities besides us */  		if (cfs_rq->load.weight) { +			/* Avoid re-evaluating load for this entity: */ +			se = parent_entity(se);  			/*  			 * Bias pick_next to pick a task from this cfs_rq, as  			 * p is sleeping when it is within its sched_slice.  			 */ -			if (task_sleep && parent_entity(se)) -				set_next_buddy(parent_entity(se)); - -			/* avoid re-evaluating load for this entity */ -			se = parent_entity(se); +			if (task_sleep && se && !throttled_hierarchy(cfs_rq)) +				set_next_buddy(se);  			break;  		}  		flags |= DEQUEUE_SLEEP; @@ -8496,8 +8532,9 @@ void free_fair_sched_group(struct task_group *tg)  int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)  { -	struct cfs_rq *cfs_rq;  	struct sched_entity *se; +	struct cfs_rq *cfs_rq; +	struct rq *rq;  	int i;  	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); @@ -8512,6 +8549,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)  	init_cfs_bandwidth(tg_cfs_bandwidth(tg));  	for_each_possible_cpu(i) { +		rq = cpu_rq(i); +  		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),  				      GFP_KERNEL, cpu_to_node(i));  		if (!cfs_rq) @@ -8525,7 +8564,10 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)  		init_cfs_rq(cfs_rq);  		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);  		init_entity_runnable_average(se); + +		raw_spin_lock_irq(&rq->lock);  		post_init_entity_util_avg(se); +		raw_spin_unlock_irq(&rq->lock);  	}  	return 1; diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index bd12c6c714ec..c5aeedf4e93a 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -127,7 +127,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,   */  static void cpuidle_idle_call(void)  { -	struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); +	struct cpuidle_device *dev = cpuidle_get_device();  	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);  	int next_state, entered_state; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 72f1f3087b04..7cbeb92a1cb9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -437,7 +437,7 @@ struct cfs_rq {  	u64 throttled_clock, throttled_clock_task;  	u64 throttled_clock_task_time; -	int throttled, throttle_count; +	int throttled, throttle_count, throttle_uptodate;  	struct list_head throttled_list;  #endif /* CONFIG_CFS_BANDWIDTH */  #endif /* CONFIG_FAIR_GROUP_SCHED */ diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 70b3b6a20fb0..78955cbea31c 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -33,6 +33,8 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)  # define schedstat_inc(rq, field)	do { if (schedstat_enabled()) { (rq)->field++; } } while (0)  # define schedstat_add(rq, field, amt)	do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0)  # define schedstat_set(var, val)	do { if (schedstat_enabled()) { var = (val); } } while (0) +# define schedstat_val(rq, field)	((schedstat_enabled()) ? 
(rq)->field : 0) +  #else /* !CONFIG_SCHEDSTATS */  static inline void  rq_sched_info_arrive(struct rq *rq, unsigned long long delta) @@ -47,6 +49,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)  # define schedstat_inc(rq, field)	do { } while (0)  # define schedstat_add(rq, field, amt)	do { } while (0)  # define schedstat_set(var, val)	do { } while (0) +# define schedstat_val(rq, field)	0  #endif  #ifdef CONFIG_SCHED_INFO diff --git a/kernel/signal.c b/kernel/signal.c index 96e9bc40667f..af21afc00d08 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2751,23 +2751,18 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)   *  @ts: upper bound on process time suspension   */  int do_sigtimedwait(const sigset_t *which, siginfo_t *info, -			const struct timespec *ts) +		    const struct timespec *ts)  { +	ktime_t *to = NULL, timeout = { .tv64 = KTIME_MAX };  	struct task_struct *tsk = current; -	long timeout = MAX_SCHEDULE_TIMEOUT;  	sigset_t mask = *which; -	int sig; +	int sig, ret = 0;  	if (ts) {  		if (!timespec_valid(ts))  			return -EINVAL; -		timeout = timespec_to_jiffies(ts); -		/* -		 * We can be close to the next tick, add another one -		 * to ensure we will wait at least the time asked for. -		 */ -		if (ts->tv_sec || ts->tv_nsec) -			timeout++; +		timeout = timespec_to_ktime(*ts); +		to = &timeout;  	}  	/* @@ -2778,7 +2773,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,  	spin_lock_irq(&tsk->sighand->siglock);  	sig = dequeue_signal(tsk, &mask, info); -	if (!sig && timeout) { +	if (!sig && timeout.tv64) {  		/*  		 * None ready, temporarily unblock those we're interested  		 * while we are sleeping in so that we'll be awakened when @@ -2790,8 +2785,9 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,  		recalc_sigpending();  		spin_unlock_irq(&tsk->sighand->siglock); -		timeout = freezable_schedule_timeout_interruptible(timeout); - +		__set_current_state(TASK_INTERRUPTIBLE); +		ret = freezable_schedule_hrtimeout_range(to, tsk->timer_slack_ns, +							 HRTIMER_MODE_REL);  		spin_lock_irq(&tsk->sighand->siglock);  		__set_task_blocked(tsk, &tsk->real_blocked);  		sigemptyset(&tsk->real_blocked); @@ -2801,7 +2797,7 @@ int do_sigtimedwait(const sigset_t *which, siginfo_t *info,  	if (sig)  		return sig; -	return timeout ? -EINTR : -EAGAIN; +	return ret ? -EINTR : -EAGAIN;  }  /** diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 966a5a6fdd0a..f738251000fe 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -164,3 +164,4 @@ static inline void timers_update_migration(bool update_nohz) { }  DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);  extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); +void timer_clear_idle(void); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index db57d1ba73eb..2ec7c00228f3 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -700,6 +700,12 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,  	delta = next_tick - basemono;  	if (delta <= (u64)TICK_NSEC) {  		tick.tv64 = 0; + +		/* +		 * Tell the timer code that the base is not idle, i.e. undo +		 * the effect of get_next_timer_interrupt(): +		 */ +		timer_clear_idle();  		/*  		 * We've not stopped the tick yet, and there's a timer in the  		 * next period, so no point in stopping it either, bail. 
@@ -809,6 +815,12 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)  	tick_do_update_jiffies64(now);  	cpu_load_update_nohz_stop(); +	/* +	 * Clear the timer idle flag, so we avoid IPIs on remote queueing and +	 * the clock forward checks in the enqueue path: +	 */ +	timer_clear_idle(); +  	calc_load_exit_idle();  	touch_softlockup_watchdog_sched();  	/* @@ -1092,35 +1104,6 @@ static void tick_nohz_switch_to_nohz(void)  	tick_nohz_activate(ts, NOHZ_MODE_LOWRES);  } -/* - * When NOHZ is enabled and the tick is stopped, we need to kick the - * tick timer from irq_enter() so that the jiffies update is kept - * alive during long running softirqs. That's ugly as hell, but - * correctness is key even if we need to fix the offending softirq in - * the first place. - * - * Note, this is different to tick_nohz_restart. We just kick the - * timer and do not touch the other magic bits which need to be done - * when idle is left. - */ -static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now) -{ -#if 0 -	/* Switch back to 2.6.27 behaviour */ -	ktime_t delta; - -	/* -	 * Do not touch the tick device, when the next expiry is either -	 * already reached or less/equal than the tick period. -	 */ -	delta =	ktime_sub(hrtimer_get_expires(&ts->sched_timer), now); -	if (delta.tv64 <= tick_period.tv64) -		return; - -	tick_nohz_restart(ts, now); -#endif -} -  static inline void tick_nohz_irq_enter(void)  {  	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); @@ -1131,10 +1114,8 @@ static inline void tick_nohz_irq_enter(void)  	now = ktime_get();  	if (ts->idle_active)  		tick_nohz_stop_idle(ts, now); -	if (ts->tick_stopped) { +	if (ts->tick_stopped)  		tick_nohz_update_jiffies(now); -		tick_nohz_kick_tick(ts, now); -	}  }  #else diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 67dd6103003a..cb9ab401e2d9 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -59,43 +59,153 @@ __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;  EXPORT_SYMBOL(jiffies_64);  /* - * per-CPU timer vector definitions: + * The timer wheel has LVL_DEPTH array levels. Each level provides an array of + * LVL_SIZE buckets. Each level is driven by its own clock and therefor each + * level has a different granularity. + * + * The level granularity is:		LVL_CLK_DIV ^ lvl + * The level clock frequency is:	HZ / (LVL_CLK_DIV ^ level) + * + * The array level of a newly armed timer depends on the relative expiry + * time. The farther the expiry time is away the higher the array level and + * therefor the granularity becomes. + * + * Contrary to the original timer wheel implementation, which aims for 'exact' + * expiry of the timers, this implementation removes the need for recascading + * the timers into the lower array levels. The previous 'classic' timer wheel + * implementation of the kernel already violated the 'exact' expiry by adding + * slack to the expiry time to provide batched expiration. The granularity + * levels provide implicit batching. + * + * This is an optimization of the original timer wheel implementation for the + * majority of the timer wheel use cases: timeouts. The vast majority of + * timeout timers (networking, disk I/O ...) are canceled before expiry. If + * the timeout expires it indicates that normal operation is disturbed, so it + * does not matter much whether the timeout comes with a slight delay. + * + * The only exception to this are networking timers with a small expiry + * time. They rely on the granularity. 
Those fit into the first wheel level, + * which has HZ granularity. + * + * We don't have cascading anymore. timers with a expiry time above the + * capacity of the last wheel level are force expired at the maximum timeout + * value of the last wheel level. From data sampling we know that the maximum + * value observed is 5 days (network connection tracking), so this should not + * be an issue. + * + * The currently chosen array constants values are a good compromise between + * array size and granularity. + * + * This results in the following granularity and range levels: + * + * HZ 1000 steps + * Level Offset  Granularity            Range + *  0      0         1 ms                0 ms -         63 ms + *  1     64         8 ms               64 ms -        511 ms + *  2    128        64 ms              512 ms -       4095 ms (512ms - ~4s) + *  3    192       512 ms             4096 ms -      32767 ms (~4s - ~32s) + *  4    256      4096 ms (~4s)      32768 ms -     262143 ms (~32s - ~4m) + *  5    320     32768 ms (~32s)    262144 ms -    2097151 ms (~4m - ~34m) + *  6    384    262144 ms (~4m)    2097152 ms -   16777215 ms (~34m - ~4h) + *  7    448   2097152 ms (~34m)  16777216 ms -  134217727 ms (~4h - ~1d) + *  8    512  16777216 ms (~4h)  134217728 ms - 1073741822 ms (~1d - ~12d) + * + * HZ  300 + * Level Offset  Granularity            Range + *  0	   0         3 ms                0 ms -        210 ms + *  1	  64        26 ms              213 ms -       1703 ms (213ms - ~1s) + *  2	 128       213 ms             1706 ms -      13650 ms (~1s - ~13s) + *  3	 192      1706 ms (~1s)      13653 ms -     109223 ms (~13s - ~1m) + *  4	 256     13653 ms (~13s)    109226 ms -     873810 ms (~1m - ~14m) + *  5	 320    109226 ms (~1m)     873813 ms -    6990503 ms (~14m - ~1h) + *  6	 384    873813 ms (~14m)   6990506 ms -   55924050 ms (~1h - ~15h) + *  7	 448   6990506 ms (~1h)   55924053 ms -  447392423 ms (~15h - ~5d) + *  8    512  55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d) + * + * HZ  250 + * Level Offset  Granularity            Range + *  0	   0         4 ms                0 ms -        255 ms + *  1	  64        32 ms              256 ms -       2047 ms (256ms - ~2s) + *  2	 128       256 ms             2048 ms -      16383 ms (~2s - ~16s) + *  3	 192      2048 ms (~2s)      16384 ms -     131071 ms (~16s - ~2m) + *  4	 256     16384 ms (~16s)    131072 ms -    1048575 ms (~2m - ~17m) + *  5	 320    131072 ms (~2m)    1048576 ms -    8388607 ms (~17m - ~2h) + *  6	 384   1048576 ms (~17m)   8388608 ms -   67108863 ms (~2h - ~18h) + *  7	 448   8388608 ms (~2h)   67108864 ms -  536870911 ms (~18h - ~6d) + *  8    512  67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d) + * + * HZ  100 + * Level Offset  Granularity            Range + *  0	   0         10 ms               0 ms -        630 ms + *  1	  64         80 ms             640 ms -       5110 ms (640ms - ~5s) + *  2	 128        640 ms            5120 ms -      40950 ms (~5s - ~40s) + *  3	 192       5120 ms (~5s)     40960 ms -     327670 ms (~40s - ~5m) + *  4	 256      40960 ms (~40s)   327680 ms -    2621430 ms (~5m - ~43m) + *  5	 320     327680 ms (~5m)   2621440 ms -   20971510 ms (~43m - ~5h) + *  6	 384    2621440 ms (~43m) 20971520 ms -  167772150 ms (~5h - ~1d) + *  7	 448   20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d)   */ -#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) -#define TVR_BITS (CONFIG_BASE_SMALL ? 
6 : 8) -#define TVN_SIZE (1 << TVN_BITS) -#define TVR_SIZE (1 << TVR_BITS) -#define TVN_MASK (TVN_SIZE - 1) -#define TVR_MASK (TVR_SIZE - 1) -#define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1)) - -struct tvec { -	struct hlist_head vec[TVN_SIZE]; -}; -struct tvec_root { -	struct hlist_head vec[TVR_SIZE]; -}; +/* Clock divisor for the next level */ +#define LVL_CLK_SHIFT	3 +#define LVL_CLK_DIV	(1UL << LVL_CLK_SHIFT) +#define LVL_CLK_MASK	(LVL_CLK_DIV - 1) +#define LVL_SHIFT(n)	((n) * LVL_CLK_SHIFT) +#define LVL_GRAN(n)	(1UL << LVL_SHIFT(n)) -struct tvec_base { -	spinlock_t lock; -	struct timer_list *running_timer; -	unsigned long timer_jiffies; -	unsigned long next_timer; -	unsigned long active_timers; -	unsigned long all_timers; -	int cpu; -	bool migration_enabled; -	bool nohz_active; -	struct tvec_root tv1; -	struct tvec tv2; -	struct tvec tv3; -	struct tvec tv4; -	struct tvec tv5; -} ____cacheline_aligned; +/* + * The time start value for each level to select the bucket at enqueue + * time. + */ +#define LVL_START(n)	((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT)) + +/* Size of each clock level */ +#define LVL_BITS	6 +#define LVL_SIZE	(1UL << LVL_BITS) +#define LVL_MASK	(LVL_SIZE - 1) +#define LVL_OFFS(n)	((n) * LVL_SIZE) + +/* Level depth */ +#if HZ > 100 +# define LVL_DEPTH	9 +# else +# define LVL_DEPTH	8 +#endif + +/* The cutoff (max. capacity of the wheel) */ +#define WHEEL_TIMEOUT_CUTOFF	(LVL_START(LVL_DEPTH)) +#define WHEEL_TIMEOUT_MAX	(WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1)) + +/* + * The resulting wheel size. If NOHZ is configured we allocate two + * wheels so we have a separate storage for the deferrable timers. + */ +#define WHEEL_SIZE	(LVL_SIZE * LVL_DEPTH) + +#ifdef CONFIG_NO_HZ_COMMON +# define NR_BASES	2 +# define BASE_STD	0 +# define BASE_DEF	1 +#else +# define NR_BASES	1 +# define BASE_STD	0 +# define BASE_DEF	0 +#endif +struct timer_base { +	spinlock_t		lock; +	struct timer_list	*running_timer; +	unsigned long		clk; +	unsigned long		next_expiry; +	unsigned int		cpu; +	bool			migration_enabled; +	bool			nohz_active; +	bool			is_idle; +	DECLARE_BITMAP(pending_map, WHEEL_SIZE); +	struct hlist_head	vectors[WHEEL_SIZE]; +} ____cacheline_aligned; -static DEFINE_PER_CPU(struct tvec_base, tvec_bases); +static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);  #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)  unsigned int sysctl_timer_migration = 1; @@ -106,15 +216,17 @@ void timers_update_migration(bool update_nohz)  	unsigned int cpu;  	/* Avoid the loop, if nothing to update */ -	if (this_cpu_read(tvec_bases.migration_enabled) == on) +	if (this_cpu_read(timer_bases[BASE_STD].migration_enabled) == on)  		return;  	for_each_possible_cpu(cpu) { -		per_cpu(tvec_bases.migration_enabled, cpu) = on; +		per_cpu(timer_bases[BASE_STD].migration_enabled, cpu) = on; +		per_cpu(timer_bases[BASE_DEF].migration_enabled, cpu) = on;  		per_cpu(hrtimer_bases.migration_enabled, cpu) = on;  		if (!update_nohz)  			continue; -		per_cpu(tvec_bases.nohz_active, cpu) = true; +		per_cpu(timer_bases[BASE_STD].nohz_active, cpu) = true; +		per_cpu(timer_bases[BASE_DEF].nohz_active, cpu) = true;  		per_cpu(hrtimer_bases.nohz_active, cpu) = true;  	}  } @@ -133,20 +245,6 @@ int timer_migration_handler(struct ctl_table *table, int write,  	mutex_unlock(&mutex);  	return ret;  } - -static inline struct tvec_base *get_target_base(struct tvec_base *base, -						int pinned) -{ -	if (pinned || !base->migration_enabled) -		return this_cpu_ptr(&tvec_bases); -	return 
per_cpu_ptr(&tvec_bases, get_nohz_timer_target()); -} -#else -static inline struct tvec_base *get_target_base(struct tvec_base *base, -						int pinned) -{ -	return this_cpu_ptr(&tvec_bases); -}  #endif  static unsigned long round_jiffies_common(unsigned long j, int cpu, @@ -351,101 +449,126 @@ unsigned long round_jiffies_up_relative(unsigned long j)  }  EXPORT_SYMBOL_GPL(round_jiffies_up_relative); -/** - * set_timer_slack - set the allowed slack for a timer - * @timer: the timer to be modified - * @slack_hz: the amount of time (in jiffies) allowed for rounding - * - * Set the amount of time, in jiffies, that a certain timer has - * in terms of slack. By setting this value, the timer subsystem - * will schedule the actual timer somewhere between - * the time mod_timer() asks for, and that time plus the slack. - * - * By setting the slack to -1, a percentage of the delay is used - * instead. - */ -void set_timer_slack(struct timer_list *timer, int slack_hz) + +static inline unsigned int timer_get_idx(struct timer_list *timer)  { -	timer->slack = slack_hz; +	return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT;  } -EXPORT_SYMBOL_GPL(set_timer_slack); -static void -__internal_add_timer(struct tvec_base *base, struct timer_list *timer) +static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)  { -	unsigned long expires = timer->expires; -	unsigned long idx = expires - base->timer_jiffies; -	struct hlist_head *vec; +	timer->flags = (timer->flags & ~TIMER_ARRAYMASK) | +			idx << TIMER_ARRAYSHIFT; +} -	if (idx < TVR_SIZE) { -		int i = expires & TVR_MASK; -		vec = base->tv1.vec + i; -	} else if (idx < 1 << (TVR_BITS + TVN_BITS)) { -		int i = (expires >> TVR_BITS) & TVN_MASK; -		vec = base->tv2.vec + i; -	} else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { -		int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; -		vec = base->tv3.vec + i; -	} else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { -		int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; -		vec = base->tv4.vec + i; -	} else if ((signed long) idx < 0) { -		/* -		 * Can happen if you add a timer with expires == jiffies, -		 * or you set a timer to go off in the past -		 */ -		vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); +/* + * Helper function to calculate the array index for a given expiry + * time. + */ +static inline unsigned calc_index(unsigned expires, unsigned lvl) +{ +	expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl); +	return LVL_OFFS(lvl) + (expires & LVL_MASK); +} + +static int calc_wheel_index(unsigned long expires, unsigned long clk) +{ +	unsigned long delta = expires - clk; +	unsigned int idx; + +	if (delta < LVL_START(1)) { +		idx = calc_index(expires, 0); +	} else if (delta < LVL_START(2)) { +		idx = calc_index(expires, 1); +	} else if (delta < LVL_START(3)) { +		idx = calc_index(expires, 2); +	} else if (delta < LVL_START(4)) { +		idx = calc_index(expires, 3); +	} else if (delta < LVL_START(5)) { +		idx = calc_index(expires, 4); +	} else if (delta < LVL_START(6)) { +		idx = calc_index(expires, 5); +	} else if (delta < LVL_START(7)) { +		idx = calc_index(expires, 6); +	} else if (LVL_DEPTH > 8 && delta < LVL_START(8)) { +		idx = calc_index(expires, 7); +	} else if ((long) delta < 0) { +		idx = clk & LVL_MASK;  	} else { -		int i; -		/* If the timeout is larger than MAX_TVAL (on 64-bit -		 * architectures or with CONFIG_BASE_SMALL=1) then we -		 * use the maximum timeout. +		/* +		 * Force expire obscene large timeouts to expire at the +		 * capacity limit of the wheel.  		 
*/ -		if (idx > MAX_TVAL) { -			idx = MAX_TVAL; -			expires = idx + base->timer_jiffies; -		} -		i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; -		vec = base->tv5.vec + i; +		if (expires >= WHEEL_TIMEOUT_CUTOFF) +			expires = WHEEL_TIMEOUT_MAX; + +		idx = calc_index(expires, LVL_DEPTH - 1);  	} +	return idx; +} -	hlist_add_head(&timer->entry, vec); +/* + * Enqueue the timer into the hash bucket, mark it pending in + * the bitmap and store the index in the timer flags. + */ +static void enqueue_timer(struct timer_base *base, struct timer_list *timer, +			  unsigned int idx) +{ +	hlist_add_head(&timer->entry, base->vectors + idx); +	__set_bit(idx, base->pending_map); +	timer_set_idx(timer, idx);  } -static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) +static void +__internal_add_timer(struct timer_base *base, struct timer_list *timer)  { -	/* Advance base->jiffies, if the base is empty */ -	if (!base->all_timers++) -		base->timer_jiffies = jiffies; +	unsigned int idx; + +	idx = calc_wheel_index(timer->expires, base->clk); +	enqueue_timer(base, timer, idx); +} + +static void +trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) +{ +	if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) +		return; -	__internal_add_timer(base, timer);  	/* -	 * Update base->active_timers and base->next_timer +	 * TODO: This wants some optimizing similar to the code below, but we +	 * will do that when we switch from push to pull for deferrable timers.  	 */ -	if (!(timer->flags & TIMER_DEFERRABLE)) { -		if (!base->active_timers++ || -		    time_before(timer->expires, base->next_timer)) -			base->next_timer = timer->expires; +	if (timer->flags & TIMER_DEFERRABLE) { +		if (tick_nohz_full_cpu(base->cpu)) +			wake_up_nohz_cpu(base->cpu); +		return;  	}  	/* -	 * Check whether the other CPU is in dynticks mode and needs -	 * to be triggered to reevaluate the timer wheel. -	 * We are protected against the other CPU fiddling -	 * with the timer by holding the timer base lock. This also -	 * makes sure that a CPU on the way to stop its tick can not -	 * evaluate the timer wheel. -	 * -	 * Spare the IPI for deferrable timers on idle targets though. -	 * The next busy ticks will take care of it. Except full dynticks -	 * require special care against races with idle_cpu(), lets deal -	 * with that later. +	 * We might have to IPI the remote CPU if the base is idle and the +	 * timer is not deferrable. 
If the other CPU is on the way to idle +	 * then it can't set base->is_idle as we hold the base lock:  	 */ -	if (base->nohz_active) { -		if (!(timer->flags & TIMER_DEFERRABLE) || -		    tick_nohz_full_cpu(base->cpu)) -			wake_up_nohz_cpu(base->cpu); -	} +	if (!base->is_idle) +		return; + +	/* Check whether this is the new first expiring timer: */ +	if (time_after_eq(timer->expires, base->next_expiry)) +		return; + +	/* +	 * Set the next expiry time and kick the CPU so it can reevaluate the +	 * wheel: +	 */ +	base->next_expiry = timer->expires; +		wake_up_nohz_cpu(base->cpu); +} + +static void +internal_add_timer(struct timer_base *base, struct timer_list *timer) +{ +	__internal_add_timer(base, timer); +	trigger_dyntick_cpu(base, timer);  }  #ifdef CONFIG_TIMER_STATS @@ -666,7 +789,6 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags,  {  	timer->entry.pprev = NULL;  	timer->flags = flags | raw_smp_processor_id(); -	timer->slack = -1;  #ifdef CONFIG_TIMER_STATS  	timer->start_site = NULL;  	timer->start_pid = -1; @@ -706,54 +828,125 @@ static inline void detach_timer(struct timer_list *timer, bool clear_pending)  	entry->next = LIST_POISON2;  } -static inline void -detach_expired_timer(struct timer_list *timer, struct tvec_base *base) -{ -	detach_timer(timer, true); -	if (!(timer->flags & TIMER_DEFERRABLE)) -		base->active_timers--; -	base->all_timers--; -} - -static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, +static int detach_if_pending(struct timer_list *timer, struct timer_base *base,  			     bool clear_pending)  { +	unsigned idx = timer_get_idx(timer); +  	if (!timer_pending(timer))  		return 0; +	if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) +		__clear_bit(idx, base->pending_map); +  	detach_timer(timer, clear_pending); -	if (!(timer->flags & TIMER_DEFERRABLE)) { -		base->active_timers--; -		if (timer->expires == base->next_timer) -			base->next_timer = base->timer_jiffies; -	} -	/* If this was the last timer, advance base->jiffies */ -	if (!--base->all_timers) -		base->timer_jiffies = jiffies;  	return 1;  } +static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu) +{ +	struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu); + +	/* +	 * If the timer is deferrable and nohz is active then we need to use +	 * the deferrable base. +	 */ +	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && +	    (tflags & TIMER_DEFERRABLE)) +		base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu); +	return base; +} + +static inline struct timer_base *get_timer_this_cpu_base(u32 tflags) +{ +	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + +	/* +	 * If the timer is deferrable and nohz is active then we need to use +	 * the deferrable base. 
+	 */ +	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && +	    (tflags & TIMER_DEFERRABLE)) +		base = this_cpu_ptr(&timer_bases[BASE_DEF]); +	return base; +} + +static inline struct timer_base *get_timer_base(u32 tflags) +{ +	return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK); +} + +#ifdef CONFIG_NO_HZ_COMMON +static inline struct timer_base * +__get_target_base(struct timer_base *base, unsigned tflags) +{ +#ifdef CONFIG_SMP +	if ((tflags & TIMER_PINNED) || !base->migration_enabled) +		return get_timer_this_cpu_base(tflags); +	return get_timer_cpu_base(tflags, get_nohz_timer_target()); +#else +	return get_timer_this_cpu_base(tflags); +#endif +} + +static inline void forward_timer_base(struct timer_base *base) +{ +	/* +	 * We only forward the base when it's idle and we have a delta between +	 * base clock and jiffies. +	 */ +	if (!base->is_idle || (long) (jiffies - base->clk) < 2) +		return; + +	/* +	 * If the next expiry value is > jiffies, then we fast forward to +	 * jiffies otherwise we forward to the next expiry value. +	 */ +	if (time_after(base->next_expiry, jiffies)) +		base->clk = jiffies; +	else +		base->clk = base->next_expiry; +} +#else +static inline struct timer_base * +__get_target_base(struct timer_base *base, unsigned tflags) +{ +	return get_timer_this_cpu_base(tflags); +} + +static inline void forward_timer_base(struct timer_base *base) { } +#endif + +static inline struct timer_base * +get_target_base(struct timer_base *base, unsigned tflags) +{ +	struct timer_base *target = __get_target_base(base, tflags); + +	forward_timer_base(target); +	return target; +} +  /* - * We are using hashed locking: holding per_cpu(tvec_bases).lock - * means that all timers which are tied to this base via timer->base are - * locked, and the base itself is locked too. + * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means + * that all timers which are tied to this base are locked, and the base itself + * is locked too.   *   * So __run_timers/migrate_timers can safely modify all timers which could - * be found on ->tvX lists. + * be found in the base->vectors array.   * - * When the timer's base is locked and removed from the list, the - * TIMER_MIGRATING flag is set, FIXME + * When a timer is migrating then the TIMER_MIGRATING flag is set and we need + * to wait until the migration is done.   
*/ -static struct tvec_base *lock_timer_base(struct timer_list *timer, -					unsigned long *flags) +static struct timer_base *lock_timer_base(struct timer_list *timer, +					  unsigned long *flags)  	__acquires(timer->base->lock)  {  	for (;;) { +		struct timer_base *base;  		u32 tf = timer->flags; -		struct tvec_base *base;  		if (!(tf & TIMER_MIGRATING)) { -			base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK); +			base = get_timer_base(tf);  			spin_lock_irqsave(&base->lock, *flags);  			if (timer->flags == tf)  				return base; @@ -764,13 +957,41 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,  }  static inline int -__mod_timer(struct timer_list *timer, unsigned long expires, -	    bool pending_only, int pinned) +__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)  { -	struct tvec_base *base, *new_base; -	unsigned long flags; +	struct timer_base *base, *new_base; +	unsigned int idx = UINT_MAX; +	unsigned long clk = 0, flags;  	int ret = 0; +	/* +	 * This is a common optimization triggered by the networking code - if +	 * the timer is re-modified to have the same timeout or ends up in the +	 * same array bucket then just return: +	 */ +	if (timer_pending(timer)) { +		if (timer->expires == expires) +			return 1; +		/* +		 * Take the current timer_jiffies of base, but without holding +		 * the lock! +		 */ +		base = get_timer_base(timer->flags); +		clk = base->clk; + +		idx = calc_wheel_index(expires, clk); + +		/* +		 * Retrieve and compare the array index of the pending +		 * timer. If it matches set the expiry to the new value so a +		 * subsequent call will exit in the expires check above. +		 */ +		if (idx == timer_get_idx(timer)) { +			timer->expires = expires; +			return 1; +		} +	} +  	timer_stats_timer_set_start_info(timer);  	BUG_ON(!timer->function); @@ -782,15 +1003,15 @@ __mod_timer(struct timer_list *timer, unsigned long expires,  	debug_activate(timer, expires); -	new_base = get_target_base(base, pinned); +	new_base = get_target_base(base, timer->flags);  	if (base != new_base) {  		/* -		 * We are trying to schedule the timer on the local CPU. +		 * We are trying to schedule the timer on the new base.  		 * However we can't change timer's base while it is running,  		 * otherwise del_timer_sync() can't detect that the timer's -		 * handler yet has not finished. This also guarantees that -		 * the timer is serialized wrt itself. +		 * handler yet has not finished. This also guarantees that the +		 * timer is serialized wrt itself.  		 */  		if (likely(base->running_timer != timer)) {  			/* See the comment in lock_timer_base() */ @@ -805,7 +1026,18 @@ __mod_timer(struct timer_list *timer, unsigned long expires,  	}  	timer->expires = expires; -	internal_add_timer(base, timer); +	/* +	 * If 'idx' was calculated above and the base time did not advance +	 * between calculating 'idx' and taking the lock, only enqueue_timer() +	 * and trigger_dyntick_cpu() is required. Otherwise we need to +	 * (re)calculate the wheel index via internal_add_timer(). 
+	 */ +	if (idx != UINT_MAX && clk == base->clk) { +		enqueue_timer(base, timer, idx); +		trigger_dyntick_cpu(base, timer); +	} else { +		internal_add_timer(base, timer); +	}  out_unlock:  	spin_unlock_irqrestore(&base->lock, flags); @@ -825,49 +1057,10 @@ out_unlock:   */  int mod_timer_pending(struct timer_list *timer, unsigned long expires)  { -	return __mod_timer(timer, expires, true, TIMER_NOT_PINNED); +	return __mod_timer(timer, expires, true);  }  EXPORT_SYMBOL(mod_timer_pending); -/* - * Decide where to put the timer while taking the slack into account - * - * Algorithm: - *   1) calculate the maximum (absolute) time - *   2) calculate the highest bit where the expires and new max are different - *   3) use this bit to make a mask - *   4) use the bitmask to round down the maximum time, so that all last - *      bits are zeros - */ -static inline -unsigned long apply_slack(struct timer_list *timer, unsigned long expires) -{ -	unsigned long expires_limit, mask; -	int bit; - -	if (timer->slack >= 0) { -		expires_limit = expires + timer->slack; -	} else { -		long delta = expires - jiffies; - -		if (delta < 256) -			return expires; - -		expires_limit = expires + delta / 256; -	} -	mask = expires ^ expires_limit; -	if (mask == 0) -		return expires; - -	bit = __fls(mask); - -	mask = (1UL << bit) - 1; - -	expires_limit = expires_limit & ~(mask); - -	return expires_limit; -} -  /**   * mod_timer - modify a timer's timeout   * @timer: the timer to be modified @@ -890,49 +1083,11 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires)   */  int mod_timer(struct timer_list *timer, unsigned long expires)  { -	expires = apply_slack(timer, expires); - -	/* -	 * This is a common optimization triggered by the -	 * networking code - if the timer is re-modified -	 * to be the same thing then just return: -	 */ -	if (timer_pending(timer) && timer->expires == expires) -		return 1; - -	return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); +	return __mod_timer(timer, expires, false);  }  EXPORT_SYMBOL(mod_timer);  /** - * mod_timer_pinned - modify a timer's timeout - * @timer: the timer to be modified - * @expires: new timeout in jiffies - * - * mod_timer_pinned() is a way to update the expire field of an - * active timer (if the timer is inactive it will be activated) - * and to ensure that the timer is scheduled on the current CPU. - * - * Note that this does not prevent the timer from being migrated - * when the current CPU goes offline.  If this is a problem for - * you, use CPU-hotplug notifiers to handle it correctly, for - * example, cancelling the timer when the corresponding CPU goes - * offline. 
- * - * mod_timer_pinned(timer, expires) is equivalent to: - * - *     del_timer(timer); timer->expires = expires; add_timer(timer); - */ -int mod_timer_pinned(struct timer_list *timer, unsigned long expires) -{ -	if (timer->expires == expires && timer_pending(timer)) -		return 1; - -	return __mod_timer(timer, expires, false, TIMER_PINNED); -} -EXPORT_SYMBOL(mod_timer_pinned); - -/**   * add_timer - start a timer   * @timer: the timer to be added   * @@ -962,13 +1117,14 @@ EXPORT_SYMBOL(add_timer);   */  void add_timer_on(struct timer_list *timer, int cpu)  { -	struct tvec_base *new_base = per_cpu_ptr(&tvec_bases, cpu); -	struct tvec_base *base; +	struct timer_base *new_base, *base;  	unsigned long flags;  	timer_stats_timer_set_start_info(timer);  	BUG_ON(timer_pending(timer) || !timer->function); +	new_base = get_timer_cpu_base(timer->flags, cpu); +  	/*  	 * If @timer was on a different CPU, it should be migrated with the  	 * old base locked to prevent other operations proceeding with the @@ -1004,7 +1160,7 @@ EXPORT_SYMBOL_GPL(add_timer_on);   */  int del_timer(struct timer_list *timer)  { -	struct tvec_base *base; +	struct timer_base *base;  	unsigned long flags;  	int ret = 0; @@ -1030,7 +1186,7 @@ EXPORT_SYMBOL(del_timer);   */  int try_to_del_timer_sync(struct timer_list *timer)  { -	struct tvec_base *base; +	struct timer_base *base;  	unsigned long flags;  	int ret = -1; @@ -1114,27 +1270,6 @@ int del_timer_sync(struct timer_list *timer)  EXPORT_SYMBOL(del_timer_sync);  #endif -static int cascade(struct tvec_base *base, struct tvec *tv, int index) -{ -	/* cascade all the timers from tv up one level */ -	struct timer_list *timer; -	struct hlist_node *tmp; -	struct hlist_head tv_list; - -	hlist_move_list(tv->vec + index, &tv_list); - -	/* -	 * We are removing _all_ timers from the list, so we -	 * don't have to detach them individually. -	 */ -	hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) { -		/* No accounting, while moving them */ -		__internal_add_timer(base, timer); -	} - -	return index; -} -  static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),  			  unsigned long data)  { @@ -1178,147 +1313,141 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),  	}  } -#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) - -/** - * __run_timers - run all expired timers (if any) on this CPU. - * @base: the timer vector to be processed. - * - * This function cascades all vectors and executes all expired timer - * vectors. 
- */ -static inline void __run_timers(struct tvec_base *base) +static void expire_timers(struct timer_base *base, struct hlist_head *head)  { -	struct timer_list *timer; +	while (!hlist_empty(head)) { +		struct timer_list *timer; +		void (*fn)(unsigned long); +		unsigned long data; -	spin_lock_irq(&base->lock); +		timer = hlist_entry(head->first, struct timer_list, entry); +		timer_stats_account_timer(timer); -	while (time_after_eq(jiffies, base->timer_jiffies)) { -		struct hlist_head work_list; -		struct hlist_head *head = &work_list; -		int index; +		base->running_timer = timer; +		detach_timer(timer, true); -		if (!base->all_timers) { -			base->timer_jiffies = jiffies; -			break; +		fn = timer->function; +		data = timer->data; + +		if (timer->flags & TIMER_IRQSAFE) { +			spin_unlock(&base->lock); +			call_timer_fn(timer, fn, data); +			spin_lock(&base->lock); +		} else { +			spin_unlock_irq(&base->lock); +			call_timer_fn(timer, fn, data); +			spin_lock_irq(&base->lock);  		} +	} +} -		index = base->timer_jiffies & TVR_MASK; +static int __collect_expired_timers(struct timer_base *base, +				    struct hlist_head *heads) +{ +	unsigned long clk = base->clk; +	struct hlist_head *vec; +	int i, levels = 0; +	unsigned int idx; -		/* -		 * Cascade timers: -		 */ -		if (!index && -			(!cascade(base, &base->tv2, INDEX(0))) && -				(!cascade(base, &base->tv3, INDEX(1))) && -					!cascade(base, &base->tv4, INDEX(2))) -			cascade(base, &base->tv5, INDEX(3)); -		++base->timer_jiffies; -		hlist_move_list(base->tv1.vec + index, head); -		while (!hlist_empty(head)) { -			void (*fn)(unsigned long); -			unsigned long data; -			bool irqsafe; - -			timer = hlist_entry(head->first, struct timer_list, entry); -			fn = timer->function; -			data = timer->data; -			irqsafe = timer->flags & TIMER_IRQSAFE; - -			timer_stats_account_timer(timer); - -			base->running_timer = timer; -			detach_expired_timer(timer, base); - -			if (irqsafe) { -				spin_unlock(&base->lock); -				call_timer_fn(timer, fn, data); -				spin_lock(&base->lock); -			} else { -				spin_unlock_irq(&base->lock); -				call_timer_fn(timer, fn, data); -				spin_lock_irq(&base->lock); -			} +	for (i = 0; i < LVL_DEPTH; i++) { +		idx = (clk & LVL_MASK) + i * LVL_SIZE; + +		if (__test_and_clear_bit(idx, base->pending_map)) { +			vec = base->vectors + idx; +			hlist_move_list(vec, heads++); +			levels++;  		} +		/* Is it time to look at the next level? */ +		if (clk & LVL_CLK_MASK) +			break; +		/* Shift clock for the next level granularity */ +		clk >>= LVL_CLK_SHIFT;  	} -	base->running_timer = NULL; -	spin_unlock_irq(&base->lock); +	return levels;  }  #ifdef CONFIG_NO_HZ_COMMON  /* - * Find out when the next timer event is due to happen. This - * is used on S/390 to stop all activity when a CPU is idle. - * This function needs to be called with interrupts disabled. + * Find the next pending bucket of a level. Search from level start (@offset) + * + @clk upwards and if nothing there, search from start of the level + * (@offset) up to @offset + clk.   */ -static unsigned long __next_timer_interrupt(struct tvec_base *base) -{ -	unsigned long timer_jiffies = base->timer_jiffies; -	unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; -	int index, slot, array, found = 0; -	struct timer_list *nte; -	struct tvec *varray[4]; - -	/* Look for timer events in tv1. 
*/ -	index = slot = timer_jiffies & TVR_MASK; -	do { -		hlist_for_each_entry(nte, base->tv1.vec + slot, entry) { -			if (nte->flags & TIMER_DEFERRABLE) -				continue; - -			found = 1; -			expires = nte->expires; -			/* Look at the cascade bucket(s)? */ -			if (!index || slot < index) -				goto cascade; -			return expires; +static int next_pending_bucket(struct timer_base *base, unsigned offset, +			       unsigned clk) +{ +	unsigned pos, start = offset + clk; +	unsigned end = offset + LVL_SIZE; + +	pos = find_next_bit(base->pending_map, end, start); +	if (pos < end) +		return pos - start; + +	pos = find_next_bit(base->pending_map, start, offset); +	return pos < start ? pos + LVL_SIZE - start : -1; +} + +/* + * Search the first expiring timer in the various clock levels. Caller must + * hold base->lock. + */ +static unsigned long __next_timer_interrupt(struct timer_base *base) +{ +	unsigned long clk, next, adj; +	unsigned lvl, offset = 0; + +	next = base->clk + NEXT_TIMER_MAX_DELTA; +	clk = base->clk; +	for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { +		int pos = next_pending_bucket(base, offset, clk & LVL_MASK); + +		if (pos >= 0) { +			unsigned long tmp = clk + (unsigned long) pos; + +			tmp <<= LVL_SHIFT(lvl); +			if (time_before(tmp, next)) +				next = tmp;  		} -		slot = (slot + 1) & TVR_MASK; -	} while (slot != index); - -cascade: -	/* Calculate the next cascade event */ -	if (index) -		timer_jiffies += TVR_SIZE - index; -	timer_jiffies >>= TVR_BITS; - -	/* Check tv2-tv5. */ -	varray[0] = &base->tv2; -	varray[1] = &base->tv3; -	varray[2] = &base->tv4; -	varray[3] = &base->tv5; - -	for (array = 0; array < 4; array++) { -		struct tvec *varp = varray[array]; - -		index = slot = timer_jiffies & TVN_MASK; -		do { -			hlist_for_each_entry(nte, varp->vec + slot, entry) { -				if (nte->flags & TIMER_DEFERRABLE) -					continue; - -				found = 1; -				if (time_before(nte->expires, expires)) -					expires = nte->expires; -			} -			/* -			 * Do we still search for the first timer or are -			 * we looking up the cascade buckets ? -			 */ -			if (found) { -				/* Look at the cascade bucket(s)? */ -				if (!index || slot < index) -					break; -				return expires; -			} -			slot = (slot + 1) & TVN_MASK; -		} while (slot != index); - -		if (index) -			timer_jiffies += TVN_SIZE - index; -		timer_jiffies >>= TVN_BITS; +		/* +		 * Clock for the next level. If the current level clock lower +		 * bits are zero, we look at the next level as is. If not we +		 * need to advance it by one because that's going to be the +		 * next expiring bucket in that level. base->clk is the next +		 * expiring jiffie. So in case of: +		 * +		 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 +		 *  0    0    0    0    0    0 +		 * +		 * we have to look at all levels @index 0. With +		 * +		 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 +		 *  0    0    0    0    0    2 +		 * +		 * LVL0 has the next expiring bucket @index 2. The upper +		 * levels have the next expiring bucket @index 1. +		 * +		 * In case that the propagation wraps the next level the same +		 * rules apply: +		 * +		 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 +		 *  0    0    0    0    F    2 +		 * +		 * So after looking at LVL0 we get: +		 * +		 * LVL5 LVL4 LVL3 LVL2 LVL1 +		 *  0    0    0    1    0 +		 * +		 * So no propagation from LVL1 to LVL2 because that happened +		 * with the add already, but then we need to propagate further +		 * from LVL2 to LVL3. +		 * +		 * So the simple check whether the lower bits of the current +		 * level are 0 or not is sufficient for all cases. 
+		 */ +		adj = clk & LVL_CLK_MASK ? 1 : 0; +		clk >>= LVL_CLK_SHIFT; +		clk += adj;  	} -	return expires; +	return next;  }  /* @@ -1364,7 +1493,7 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)   */  u64 get_next_timer_interrupt(unsigned long basej, u64 basem)  { -	struct tvec_base *base = this_cpu_ptr(&tvec_bases); +	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);  	u64 expires = KTIME_MAX;  	unsigned long nextevt; @@ -1376,19 +1505,80 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)  		return expires;  	spin_lock(&base->lock); -	if (base->active_timers) { -		if (time_before_eq(base->next_timer, base->timer_jiffies)) -			base->next_timer = __next_timer_interrupt(base); -		nextevt = base->next_timer; -		if (time_before_eq(nextevt, basej)) -			expires = basem; -		else -			expires = basem + (nextevt - basej) * TICK_NSEC; +	nextevt = __next_timer_interrupt(base); +	base->next_expiry = nextevt; +	/* +	 * We have a fresh next event. Check whether we can forward the base: +	 */ +	if (time_after(nextevt, jiffies)) +		base->clk = jiffies; +	else if (time_after(nextevt, base->clk)) +		base->clk = nextevt; + +	if (time_before_eq(nextevt, basej)) { +		expires = basem; +		base->is_idle = false; +	} else { +		expires = basem + (nextevt - basej) * TICK_NSEC; +		/* +		 * If we expect to sleep more than a tick, mark the base idle: +		 */ +		if ((expires - basem) > TICK_NSEC) +			base->is_idle = true;  	}  	spin_unlock(&base->lock);  	return cmp_next_hrtimer_event(basem, expires);  } + +/** + * timer_clear_idle - Clear the idle state of the timer base + * + * Called with interrupts disabled + */ +void timer_clear_idle(void) +{ +	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + +	/* +	 * We do this unlocked. The worst outcome is a remote enqueue sending +	 * a pointless IPI, but taking the lock would just make the window for +	 * sending the IPI a few instructions smaller for the cost of taking +	 * the lock in the exit from idle path. +	 */ +	base->is_idle = false; +} + +static int collect_expired_timers(struct timer_base *base, +				  struct hlist_head *heads) +{ +	/* +	 * NOHZ optimization. After a long idle sleep we need to forward the +	 * base to current jiffies. Avoid a loop by searching the bitfield for +	 * the next expiring timer. +	 */ +	if ((long)(jiffies - base->clk) > 2) { +		unsigned long next = __next_timer_interrupt(base); + +		/* +		 * If the next timer is ahead of time forward to current +		 * jiffies, otherwise forward to the next expiry time: +		 */ +		if (time_after(next, jiffies)) { +			/* The call site will increment clock! */ +			base->clk = jiffies - 1; +			return 0; +		} +		base->clk = next; +	} +	return __collect_expired_timers(base, heads); +} +#else +static inline int collect_expired_timers(struct timer_base *base, +					 struct hlist_head *heads) +{ +	return __collect_expired_timers(base, heads); +}  #endif  /* @@ -1411,15 +1601,42 @@ void update_process_times(int user_tick)  	run_posix_cpu_timers(p);  } +/** + * __run_timers - run all expired timers (if any) on this CPU. + * @base: the timer vector to be processed. 
+ */ +static inline void __run_timers(struct timer_base *base) +{ +	struct hlist_head heads[LVL_DEPTH]; +	int levels; + +	if (!time_after_eq(jiffies, base->clk)) +		return; + +	spin_lock_irq(&base->lock); + +	while (time_after_eq(jiffies, base->clk)) { + +		levels = collect_expired_timers(base, heads); +		base->clk++; + +		while (levels--) +			expire_timers(base, heads + levels); +	} +	base->running_timer = NULL; +	spin_unlock_irq(&base->lock); +} +  /*   * This function runs timers and the timer-tq in bottom half context.   */  static void run_timer_softirq(struct softirq_action *h)  { -	struct tvec_base *base = this_cpu_ptr(&tvec_bases); +	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); -	if (time_after_eq(jiffies, base->timer_jiffies)) -		__run_timers(base); +	__run_timers(base); +	if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) +		__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));  }  /* @@ -1427,7 +1644,18 @@ static void run_timer_softirq(struct softirq_action *h)   */  void run_local_timers(void)  { +	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); +  	hrtimer_run_queues(); +	/* Raise the softirq only if required. */ +	if (time_before(jiffies, base->clk)) { +		if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) +			return; +		/* CPU is awake, so check the deferrable base. */ +		base++; +		if (time_before(jiffies, base->clk)) +			return; +	}  	raise_softirq(TIMER_SOFTIRQ);  } @@ -1512,7 +1740,7 @@ signed long __sched schedule_timeout(signed long timeout)  	expire = timeout + jiffies;  	setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); -	__mod_timer(&timer, expire, false, TIMER_NOT_PINNED); +	__mod_timer(&timer, expire, false);  	schedule();  	del_singleshot_timer_sync(&timer); @@ -1563,14 +1791,13 @@ signed long __sched schedule_timeout_idle(signed long timeout)  EXPORT_SYMBOL(schedule_timeout_idle);  #ifdef CONFIG_HOTPLUG_CPU -static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) +static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head)  {  	struct timer_list *timer;  	int cpu = new_base->cpu;  	while (!hlist_empty(head)) {  		timer = hlist_entry(head->first, struct timer_list, entry); -		/* We ignore the accounting on the dying cpu */  		detach_timer(timer, false);  		timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;  		internal_add_timer(new_base, timer); @@ -1579,37 +1806,31 @@ static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *he  static void migrate_timers(int cpu)  { -	struct tvec_base *old_base; -	struct tvec_base *new_base; -	int i; +	struct timer_base *old_base; +	struct timer_base *new_base; +	int b, i;  	BUG_ON(cpu_online(cpu)); -	old_base = per_cpu_ptr(&tvec_bases, cpu); -	new_base = get_cpu_ptr(&tvec_bases); -	/* -	 * The caller is globally serialized and nobody else -	 * takes two locks at once, deadlock is not possible. 
-	 */ -	spin_lock_irq(&new_base->lock); -	spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); - -	BUG_ON(old_base->running_timer); - -	for (i = 0; i < TVR_SIZE; i++) -		migrate_timer_list(new_base, old_base->tv1.vec + i); -	for (i = 0; i < TVN_SIZE; i++) { -		migrate_timer_list(new_base, old_base->tv2.vec + i); -		migrate_timer_list(new_base, old_base->tv3.vec + i); -		migrate_timer_list(new_base, old_base->tv4.vec + i); -		migrate_timer_list(new_base, old_base->tv5.vec + i); -	} -	old_base->active_timers = 0; -	old_base->all_timers = 0; +	for (b = 0; b < NR_BASES; b++) { +		old_base = per_cpu_ptr(&timer_bases[b], cpu); +		new_base = get_cpu_ptr(&timer_bases[b]); +		/* +		 * The caller is globally serialized and nobody else +		 * takes two locks at once, deadlock is not possible. +		 */ +		spin_lock_irq(&new_base->lock); +		spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + +		BUG_ON(old_base->running_timer); + +		for (i = 0; i < WHEEL_SIZE; i++) +			migrate_timer_list(new_base, old_base->vectors + i); -	spin_unlock(&old_base->lock); -	spin_unlock_irq(&new_base->lock); -	put_cpu_ptr(&tvec_bases); +		spin_unlock(&old_base->lock); +		spin_unlock_irq(&new_base->lock); +		put_cpu_ptr(&timer_bases); +	}  }  static int timer_cpu_notify(struct notifier_block *self, @@ -1637,13 +1858,15 @@ static inline void timer_register_cpu_notifier(void) { }  static void __init init_timer_cpu(int cpu)  { -	struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu); - -	base->cpu = cpu; -	spin_lock_init(&base->lock); +	struct timer_base *base; +	int i; -	base->timer_jiffies = jiffies; -	base->next_timer = base->timer_jiffies; +	for (i = 0; i < NR_BASES; i++) { +		base = per_cpu_ptr(&timer_bases[i], cpu); +		base->cpu = cpu; +		spin_lock_init(&base->lock); +		base->clk = jiffies; +	}  }  static void __init init_timer_cpus(void) diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 780bcbe1d4de..26f603da7e26 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -198,7 +198,7 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)  	if (unlikely(index >= array->map.max_entries))  		return -E2BIG; -	file = (struct file *)array->ptrs[index]; +	file = READ_ONCE(array->ptrs[index]);  	if (unlikely(!file))  		return -ENOENT; @@ -209,6 +209,10 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)  	    event->pmu->count)  		return -EINVAL; +	if (unlikely(event->attr.type != PERF_TYPE_HARDWARE && +		     event->attr.type != PERF_TYPE_RAW)) +		return -EINVAL; +  	/*  	 * we don't know if the function is run successfully by the  	 * return value. 
It can be judged in other places, such as @@ -247,7 +251,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 flags, u64 r4, u64 size)  	if (unlikely(index >= array->map.max_entries))  		return -E2BIG; -	file = (struct file *)array->ptrs[index]; +	file = READ_ONCE(array->ptrs[index]);  	if (unlikely(!file))  		return -ENOENT; @@ -349,7 +353,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func  }  /* bpf+kprobe programs can access fields of 'struct pt_regs' */ -static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type) +static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type, +					enum bpf_reg_type *reg_type)  {  	/* check bounds */  	if (off < 0 || off >= sizeof(struct pt_regs)) @@ -427,7 +432,8 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)  	}  } -static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type) +static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, +				    enum bpf_reg_type *reg_type)  {  	if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)  		return false; diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index f96f0383f6c6..ad1d6164e946 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -36,6 +36,10 @@ struct trace_bprintk_fmt {  static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)  {  	struct trace_bprintk_fmt *pos; + +	if (!fmt) +		return ERR_PTR(-EINVAL); +  	list_for_each_entry(pos, &trace_bprintk_fmt_list, list) {  		if (!strcmp(pos->fmt, fmt))  			return pos; @@ -57,7 +61,8 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)  	for (iter = start; iter < end; iter++) {  		struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter);  		if (tb_fmt) { -			*iter = tb_fmt->fmt; +			if (!IS_ERR(tb_fmt)) +				*iter = tb_fmt->fmt;  			continue;  		} | 
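[Editorial illustration, not part of the merge] The bucket placement performed by calc_wheel_index() and calc_index() in the new kernel/time/timer.c can be reproduced in user space from the LVL_* constants shown above. This stand-alone sketch assumes HZ=1000 (so LVL_DEPTH=9) and a base clock of 0; the pick_level() helper and the sample deltas are assumptions made for the demo, not code from the patch.

#include <stdio.h>

/* Constants copied from the new kernel/time/timer.c (HZ > 100 case). */
#define LVL_CLK_SHIFT	3
#define LVL_SHIFT(n)	((n) * LVL_CLK_SHIFT)
#define LVL_GRAN(n)	(1UL << LVL_SHIFT(n))
#define LVL_BITS	6
#define LVL_SIZE	(1UL << LVL_BITS)
#define LVL_MASK	(LVL_SIZE - 1)
#define LVL_OFFS(n)	((n) * LVL_SIZE)
#define LVL_START(n)	((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
#define LVL_DEPTH	9

/* Same arithmetic as calc_index() in the patch. */
static unsigned calc_index(unsigned long expires, unsigned lvl)
{
	expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl);
	return LVL_OFFS(lvl) + (expires & LVL_MASK);
}

/* Level selection mirroring the if/else ladder in calc_wheel_index(). */
static unsigned pick_level(unsigned long delta)
{
	unsigned lvl = 0;

	while (lvl < LVL_DEPTH - 1 && delta >= LVL_START(lvl + 1))
		lvl++;
	return lvl;
}

int main(void)
{
	const unsigned long clk = 0;	/* pretend base->clk == 0 */
	const unsigned long deltas[] = { 1, 62, 63, 500, 512, 4000, 4005, 40000 };

	for (unsigned i = 0; i < sizeof(deltas) / sizeof(deltas[0]); i++) {
		unsigned long expires = clk + deltas[i];
		unsigned lvl = pick_level(deltas[i]);

		printf("delta %6lu jiffies -> level %u, bucket %3u (granularity %lu)\n",
		       deltas[i], lvl, calc_index(expires, lvl), LVL_GRAN(lvl));
	}
	return 0;
}

The deltas 4000 and 4005 land in the same level 2 bucket, which is exactly the case the new fast path in __mod_timer() catches: a pending timer re-armed within the same bucket only gets its ->expires updated instead of being dequeued and requeued.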

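[Editorial illustration, not part of the merge] __collect_expired_timers() above only advances to the next wheel level when the low bits of the clock are zero, so with LVL_CLK_SHIFT=3 level n is inspected only once every 8^n jiffies. The sketch below reproduces that early-exit rule; the helper name and the sample clock values are assumptions for the demo.

#include <stdio.h>

#define LVL_CLK_SHIFT	3
#define LVL_CLK_MASK	((1UL << LVL_CLK_SHIFT) - 1)
#define LVL_DEPTH	9

/* Same early-exit rule as the for-loop in __collect_expired_timers(). */
static int levels_scanned(unsigned long clk)
{
	int levels = 1;		/* level 0 is always looked at */

	while (!(clk & LVL_CLK_MASK) && levels < LVL_DEPTH) {
		clk >>= LVL_CLK_SHIFT;	/* shift clock to the next level */
		levels++;
	}
	return levels;
}

int main(void)
{
	const unsigned long clks[] = { 1, 7, 8, 16, 64, 512, 4096, 32768 };

	for (unsigned i = 0; i < sizeof(clks) / sizeof(clks[0]); i++)
		printf("clk %6lu -> %d level(s) scanned\n",
		       clks[i], levels_scanned(clks[i]));
	return 0;
}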