Diffstat (limited to 'kernel/futex.c')
-rw-r--r--	kernel/futex.c	139
1 files changed, 113 insertions, 26 deletions
diff --git a/kernel/futex.c b/kernel/futex.c
index 5d6ce6413ef1..a5d2e74c89e0 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -124,16 +124,16 @@
  *   futex_wait(futex, val);
  *
  *   waiters++; (a)
- *   mb(); (A) <-- paired with -.
- *                              |
- *   lock(hash_bucket(futex));  |
- *                              |
- *   uval = *futex;             |
- *                              |        *futex = newval;
- *                              |        sys_futex(WAKE, futex);
- *                              |          futex_wake(futex);
- *                              |
- *                              `------->  mb(); (B)
+ *   smp_mb(); (A) <-- paired with -.
+ *                                  |
+ *   lock(hash_bucket(futex));      |
+ *                                  |
+ *   uval = *futex;                 |
+ *                                  |        *futex = newval;
+ *                                  |        sys_futex(WAKE, futex);
+ *                                  |          futex_wake(futex);
+ *                                  |
+ *                                  `--------> smp_mb(); (B)
  *   if (uval == val)
  *     queue();
  *     unlock(hash_bucket(futex));
@@ -334,7 +334,7 @@ static inline void futex_get_mm(union futex_key *key)
 	/*
 	 * Ensure futex_get_mm() implies a full barrier such that
 	 * get_futex_key() implies a full barrier. This is relied upon
-	 * as full barrier (B), see the ordering comment above.
+	 * as smp_mb(); (B), see the ordering comment above.
 	 */
 	smp_mb__after_atomic();
 }
@@ -407,10 +407,10 @@ static void get_futex_key_refs(union futex_key *key)
 
 	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
 	case FUT_OFF_INODE:
-		ihold(key->shared.inode); /* implies MB (B) */
+		ihold(key->shared.inode); /* implies smp_mb(); (B) */
 		break;
 	case FUT_OFF_MMSHARED:
-		futex_get_mm(key); /* implies MB (B) */
+		futex_get_mm(key); /* implies smp_mb(); (B) */
 		break;
 	default:
 		/*
@@ -418,7 +418,7 @@ static void get_futex_key_refs(union futex_key *key)
 		 * mm, therefore the only purpose of calling get_futex_key_refs
 		 * is because we need the barrier for the lockless waiter check.
 		 */
-		smp_mb(); /* explicit MB (B) */
+		smp_mb(); /* explicit smp_mb(); (B) */
 	}
 }
 
@@ -497,7 +497,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
 	if (!fshared) {
 		key->private.mm = mm;
 		key->private.address = address;
-		get_futex_key_refs(key);  /* implies MB (B) */
+		get_futex_key_refs(key);  /* implies smp_mb(); (B) */
 		return 0;
 	}
 
@@ -520,7 +520,20 @@ again:
 	else
 		err = 0;
 
-	lock_page(page);
+	/*
+	 * The treatment of mapping from this point on is critical. The page
+	 * lock protects many things but in this context the page lock
+	 * stabilizes mapping, prevents inode freeing in the shared
+	 * file-backed region case and guards against movement to swap cache.
+	 *
+	 * Strictly speaking the page lock is not needed in all cases being
+	 * considered here and page lock forces unnecessarily serialization
+	 * From this point on, mapping will be re-verified if necessary and
+	 * page lock will be acquired only if it is unavoidable
+	 */
+	page = compound_head(page);
+	mapping = READ_ONCE(page->mapping);
+
 	/*
 	 * If page->mapping is NULL, then it cannot be a PageAnon
 	 * page; but it might be the ZERO_PAGE or in the gate area or
@@ -536,19 +549,31 @@ again:
 	 * shmem_writepage move it from filecache to swapcache beneath us:
 	 * an unlikely race, but we do need to retry for page->mapping.
 	 */
-	mapping = compound_head(page)->mapping;
-	if (!mapping) {
-		int shmem_swizzled = PageSwapCache(page);
+	if (unlikely(!mapping)) {
+		int shmem_swizzled;
+
+		/*
+		 * Page lock is required to identify which special case above
+		 * applies. If this is really a shmem page then the page lock
+		 * will prevent unexpected transitions.
+		 */
+		lock_page(page);
+		shmem_swizzled = PageSwapCache(page) || page->mapping;
 		unlock_page(page);
 		put_page(page);
+
 		if (shmem_swizzled)
 			goto again;
+
 		return -EFAULT;
 	}
 
 	/*
 	 * Private mappings are handled in a simple way.
 	 *
+	 * If the futex key is stored on an anonymous page, then the associated
+	 * object is the mm which is implicitly pinned by the calling process.
+	 *
 	 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
 	 * it's a read-only handle, it's expected that futexes attach to
	 * the object not the particular process.
@@ -566,16 +591,74 @@ again:
 		key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
 		key->private.mm = mm;
 		key->private.address = address;
+
+		get_futex_key_refs(key); /* implies smp_mb(); (B) */
+
 	} else {
+		struct inode *inode;
+
+		/*
+		 * The associated futex object in this case is the inode and
+		 * the page->mapping must be traversed. Ordinarily this should
+		 * be stabilised under page lock but it's not strictly
+		 * necessary in this case as we just want to pin the inode, not
+		 * update the radix tree or anything like that.
+		 *
+		 * The RCU read lock is taken as the inode is finally freed
+		 * under RCU. If the mapping still matches expectations then the
+		 * mapping->host can be safely accessed as being a valid inode.
+		 */
+		rcu_read_lock();
+
+		if (READ_ONCE(page->mapping) != mapping) {
+			rcu_read_unlock();
+			put_page(page);
+
+			goto again;
+		}
+
+		inode = READ_ONCE(mapping->host);
+		if (!inode) {
+			rcu_read_unlock();
+			put_page(page);
+
+			goto again;
+		}
+
+		/*
+		 * Take a reference unless it is about to be freed. Previously
+		 * this reference was taken by ihold under the page lock
+		 * pinning the inode in place so i_lock was unnecessary. The
+		 * only way for this check to fail is if the inode was
+		 * truncated in parallel so warn for now if this happens.
+		 *
+		 * We are not calling into get_futex_key_refs() in file-backed
+		 * cases, therefore a successful atomic_inc return below will
+		 * guarantee that get_futex_key() will still imply smp_mb(); (B).
+		 */
+		if (WARN_ON_ONCE(!atomic_inc_not_zero(&inode->i_count))) {
+			rcu_read_unlock();
+			put_page(page);
+
+			goto again;
+		}
+
+		/* Should be impossible but lets be paranoid for now */
+		if (WARN_ON_ONCE(inode->i_mapping != mapping)) {
+			err = -EFAULT;
+			rcu_read_unlock();
+			iput(inode);
+
+			goto out;
+		}
+
 		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
-		key->shared.inode = mapping->host;
+		key->shared.inode = inode;
 		key->shared.pgoff = basepage_index(page);
+		rcu_read_unlock();
 	}
 
-	get_futex_key_refs(key); /* implies MB (B) */
-
 out:
-	unlock_page(page);
 	put_page(page);
 	return err;
 }
@@ -1864,7 +1947,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
 
 	q->lock_ptr = &hb->lock;
 
-	spin_lock(&hb->lock); /* implies MB (A) */
+	spin_lock(&hb->lock); /* implies smp_mb(); (A) */
 	return hb;
 }
 
@@ -1927,8 +2010,12 @@ static int unqueue_me(struct futex_q *q)
 
 	/* In the common case we don't take the spinlock, which is nice. */
 retry:
-	lock_ptr = q->lock_ptr;
-	barrier();
+	/*
+	 * q->lock_ptr can change between this read and the following spin_lock.
+	 * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
+	 * optimizing lock_ptr out of the logic below.
+	 */
+	lock_ptr = READ_ONCE(q->lock_ptr);
 	if (lock_ptr != NULL) {
 		spin_lock(lock_ptr);
 		/*
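The barriers renamed to smp_mb(); (A) and (B) in the header comment document a classic store-buffering shape: the waiter publishes its presence before reading the futex word, and the waker writes the futex word before checking for waiters, so a wakeup cannot be lost. Below is a minimal userspace sketch of that ordering, not kernel code: C11 seq_cst fences stand in for smp_mb(), and all names (waiter, waker, futex_word, waiters) are illustrative only. With both fences in place, at least one side must observe the other's store.

/* Userspace model of the (A)/(B) barrier pairing; illustrative names only. */
#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

static atomic_int waiters;	/* stands in for the hash-bucket waiter count */
static atomic_int futex_word;	/* stands in for the user-space futex value  */
static atomic_int saw_new_val;	/* waiter noticed the new value, won't sleep */
static atomic_int must_wake;	/* waker saw a waiter, would issue a wakeup   */

static void *waiter(void *arg)
{
	(void)arg;
	atomic_fetch_add_explicit(&waiters, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);		/* (A) */
	if (atomic_load_explicit(&futex_word, memory_order_relaxed) != 0)
		atomic_store(&saw_new_val, 1);
	return NULL;
}

static void *waker(void *arg)
{
	(void)arg;
	atomic_store_explicit(&futex_word, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);		/* (B) */
	if (atomic_load_explicit(&waiters, memory_order_relaxed) != 0)
		atomic_store(&must_wake, 1);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, waiter, NULL);
	pthread_create(&b, NULL, waker, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);

	/* With fences (A) and (B), at least one of these is always set. */
	printf("saw_new_val=%d must_wake=%d\n",
	       atomic_load(&saw_new_val), atomic_load(&must_wake));
	return 0;
}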
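In the new file-backed path the inode is pinned with atomic_inc_not_zero() under rcu_read_lock() instead of ihold() under the page lock. The point of the inc-not-zero primitive is that a reference is only taken while the count is still non-zero, so an inode already headed for freeing is never resurrected; the patch retries get_futex_key() in that case. A small userspace model of the primitive, with hypothetical names (struct object, try_pin_object) used purely for illustration, could look like this:

/* Sketch of an atomic_inc_not_zero()-style "pin unless dying" helper. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct object {
	atomic_int refcount;
};

/* Increment the refcount only if it is still non-zero. */
static bool try_pin_object(struct object *obj)
{
	int old = atomic_load(&obj->refcount);

	while (old != 0) {
		/* On failure the CAS refreshes 'old' and we re-check it. */
		if (atomic_compare_exchange_weak(&obj->refcount, &old, old + 1))
			return true;
	}
	return false;	/* already on its way to being freed: caller retries */
}

int main(void)
{
	struct object live = { .refcount = 1 };
	struct object dying = { .refcount = 0 };

	printf("live:  %s\n", try_pin_object(&live) ? "pinned" : "skipped");
	printf("dying: %s\n", try_pin_object(&dying) ? "pinned" : "skipped");
	return 0;
}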
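The final hunk replaces the open-coded load plus barrier() in unqueue_me() with READ_ONCE(), so the compiler cannot re-fetch q->lock_ptr after the NULL check and race with a concurrent requeue. In portable C11 the closest analogue is a relaxed atomic load into a local that later comparisons reuse. The sketch below only models that "load once, then re-check" shape with hypothetical names; it is not the kernel's locking logic.

/* Model of the READ_ONCE()-style single load used by unqueue_me(). */
#include <stdatomic.h>
#include <stdio.h>

struct waiter {
	_Atomic(int *) lock_ptr;	/* may be rewritten by another thread */
};

int main(void)
{
	int lock_a = 0, lock_b = 0;
	struct waiter w = { .lock_ptr = &lock_a };

	/* Load exactly once; all later logic uses this local copy. */
	int *lock_ptr = atomic_load_explicit(&w.lock_ptr, memory_order_relaxed);

	/* Simulate a concurrent requeue changing the pointer underneath us. */
	atomic_store_explicit(&w.lock_ptr, &lock_b, memory_order_relaxed);

	/* Re-check against the shared value, as unqueue_me() does after locking. */
	if (lock_ptr != atomic_load_explicit(&w.lock_ptr, memory_order_relaxed))
		printf("lock_ptr changed: retry with the new lock\n");
	return 0;
}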
