summaryrefslogtreecommitdiff
path: root/nptl/pthread_barrier_wait.c
diff options
context:
space:
mode:
Diffstat (limited to 'nptl/pthread_barrier_wait.c')
-rw-r--r--nptl/pthread_barrier_wait.c234
1 files changed, 188 insertions, 46 deletions
diff --git a/nptl/pthread_barrier_wait.c b/nptl/pthread_barrier_wait.c
index 2b34e3097b..eabb5099d3 100644
--- a/nptl/pthread_barrier_wait.c
+++ b/nptl/pthread_barrier_wait.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2003-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2003-2016 Free Software Foundation, Inc.
This file is part of the GNU C Library.
Contributed by Martin Schwidefsky <schwidefsky@de.ibm.com>, 2003.
@@ -18,64 +18,206 @@
#include <errno.h>
#include <sysdep.h>
-#include <lowlevellock.h>
#include <futex-internal.h>
#include <pthreadP.h>
-/* Wait on barrier. */
+/* Wait on the barrier.
+
+ In each round, we wait for a fixed number of threads to enter the barrier
+ (COUNT). Once that has happened, exactly these threads are allowed to
+ leave the barrier. Note that POSIX does not require that only COUNT
+ threads can attempt to block using the barrier concurrently.
+
+ We count the number of threads that have entered (IN). Each thread
+ increments IN when entering, thus getting a position in the sequence of
+ threads that are or have been waiting (starting with 1, so the position
+ is the number of threads that have entered so far including the current
+ thread).
+ CURRENT_ROUND designates the most recent thread whose round has been
+ detected as complete. When a thread detects that enough threads have
+ entered to make a round complete, it finishes this round by effectively
+ adding COUNT to CURRENT_ROUND atomically. Threads that believe that their
+ round is not complete yet wait until CURRENT_ROUND is not smaller than
+ their position anymore.
+
+ A barrier can be destroyed as soon as no threads are blocked on the
+ barrier. This is already the case if just one thread from the last round
+ has stopped waiting and returned to the caller; the assumption is that
+ all threads from the round are unblocked atomically, even though they may
+ return at different times from the respective calls to
+ pthread_barrier_wait). Thus, a valid call to pthread_barrier_destroy can
+ be concurrent with other threads still figuring out that their round has
+ been completed. Therefore, threads need to confirm that they have left
+ the barrier by incrementing OUT, and pthread_barrier_destroy needs to wait
+ until OUT equals IN.
+
+ To avoid an ABA issue for futex_wait on CURRENT_ROUND and for archs with
+ 32b-only atomics, we additionally reset the barrier when IN reaches
+ a threshold to avoid overflow. We assume that the total number of threads
+ is less than UINT_MAX/2, and set the threshold accordingly so that we can
+ use a simple atomic_fetch_add on IN instead of a CAS when entering. The
+ threshold is always set to the end of a round, so all threads that have
+ entered are either pre-reset threads or post-reset threads (i.e., have a
+ position larger than the threshold).
+ Pre-reset threads just run the algorithm explained above. Post-reset
+ threads wait until IN is reset to a pre-threshold value.
+ When the last pre-reset thread leaves the barrier (i.e., OUT equals the
+ threshold), it resets the barrier to its initial state. Other (post-reset)
+ threads wait for the reset to have finished by waiting until IN is less
+ than the threshold and then restart by trying to enter the barrier again.
+
+ We reuse the reset mechanism in pthread_barrier_destroy to get notified
+ when all threads have left the barrier: We trigger an artificial reset and
+ wait for the last pre-reset thread to finish reset, thus notifying the
+ thread that is about to destroy the barrier.
+
+ Blocking using futexes is straightforward: pre-reset threads wait for
+ completion of their round using CURRENT_ROUND as futex word, and post-reset
+ threads and pthread_barrier_destroy use IN as futex word.
+
+ Further notes:
+ * It is not simple to let some of the post-reset threads help with the
+ reset because of the ABA issues that arise; therefore, we simply make
+ the last thread to leave responsible for the reset.
+ * POSIX leaves it unspecified whether a signal handler running in a thread
+ that has been unblocked (because its round is complete) can stall all
+ other threads and prevent them from returning from the barrier. In this
+ implementation, other threads will return. However,
+ pthread_barrier_destroy will of course wait for the signal handler thread
+ to confirm that it left the barrier.
+
+ TODO We should add spinning with back-off. Once we do that, we could also
+ try to avoid the futex_wake syscall when a round is detected as finished.
+ If we do not spin, it is quite likely that at least some other threads will
+ have called futex_wait already. */
int
-__pthread_barrier_wait (barrier)
- pthread_barrier_t *barrier;
+__pthread_barrier_wait (pthread_barrier_t *barrier)
{
- struct pthread_barrier *ibarrier = (struct pthread_barrier *) barrier;
- int result = 0;
- int lll_private = ibarrier->private ^ FUTEX_PRIVATE_FLAG;
- int futex_private = (lll_private == LLL_PRIVATE
- ? FUTEX_PRIVATE : FUTEX_SHARED);
-
- /* Make sure we are alone. */
- lll_lock (ibarrier->lock, lll_private);
-
- /* One more arrival. */
- --ibarrier->left;
-
- /* Are these all? */
- if (ibarrier->left == 0)
+ struct pthread_barrier *bar = (struct pthread_barrier *) barrier;
+
+ /* How many threads entered so far, including ourself. */
+ unsigned int i;
+
+ reset_restart:
+ /* Try to enter the barrier. We need acquire MO to (1) ensure that if we
+ observe that our round can be completed (see below for our attempt to do
+ so), all pre-barrier-entry effects of all threads in our round happen
+ before us completing the round, and (2) to make our use of the barrier
+ happen after a potential reset. We need release MO to make sure that our
+ pre-barrier-entry effects happen before threads in this round leaving the
+ barrier. */
+ i = atomic_fetch_add_acq_rel (&bar->in, 1) + 1;
+ /* These loads are after the fetch_add so that we're less likely to first
+ pull in the cache line as shared. */
+ unsigned int count = bar->count;
+ /* This is the number of threads that can enter before we need to reset.
+ Always at the end of a round. */
+ unsigned int max_in_before_reset = BARRIER_IN_THRESHOLD
+ - BARRIER_IN_THRESHOLD % count;
+
+ if (i > max_in_before_reset)
{
- /* Yes. Increment the event counter to avoid invalid wake-ups and
- tell the current waiters that it is their turn. */
- ++ibarrier->curr_event;
-
- /* Wake up everybody. */
- futex_wake (&ibarrier->curr_event, INT_MAX, futex_private);
-
- /* This is the thread which finished the serialization. */
- result = PTHREAD_BARRIER_SERIAL_THREAD;
+ /* We're in a reset round. Just wait for a reset to finish; do not
+ help finishing previous rounds because this could happen
+ concurrently with a reset. */
+ while (i > max_in_before_reset)
+ {
+ futex_wait_simple (&bar->in, i, bar->shared);
+ /* Relaxed MO is fine here because we just need an indication for
+ when we should retry to enter (which will use acquire MO, see
+ above). */
+ i = atomic_load_relaxed (&bar->in);
+ }
+ goto reset_restart;
}
- else
- {
- /* The number of the event we are waiting for. The barrier's event
- number must be bumped before we continue. */
- unsigned int event = ibarrier->curr_event;
- /* Before suspending, make the barrier available to others. */
- lll_unlock (ibarrier->lock, lll_private);
+ /* Look at the current round. At this point, we are just interested in
+ whether we can complete rounds, based on the information we obtained
+ through our acquire-MO load of IN. Nonetheless, if we notice that
+ our round has been completed using this load, we use the acquire-MO
+ fence below to make sure that all pre-barrier-entry effects of all
+ threads in our round happen before us leaving the barrier. Therefore,
+ relaxed MO is sufficient. */
+ unsigned cr = atomic_load_relaxed (&bar->current_round);
+
+ /* Try to finish previous rounds and/or the current round. We simply
+ consider just our position here and do not try to do the work of threads
+ that entered more recently. */
+ while (cr + count <= i)
+ {
+ /* Calculate the new current round based on how many threads entered.
+ NEWCR must be larger than CR because CR+COUNT ends a round. */
+ unsigned int newcr = i - i % count;
+ /* Try to complete previous and/or the current round. We need release
+ MO to propagate the happens-before that we observed through reading
+ with acquire MO from IN to other threads. If the CAS fails, it
+ is like the relaxed-MO load of CURRENT_ROUND above. */
+ if (atomic_compare_exchange_weak_release (&bar->current_round, &cr,
+ newcr))
+ {
+ /* Update CR with the modification we just did. */
+ cr = newcr;
+ /* Wake threads belonging to the rounds we just finished. We may
+ wake more threads than necessary if more than COUNT threads try
+ to block concurrently on the barrier, but this is not a typical
+ use of barriers.
+ Note that we can still access SHARED because we haven't yet
+ confirmed to have left the barrier. */
+ futex_wake (&bar->current_round, INT_MAX, bar->shared);
+ /* We did as much as we could based on our position. If we advanced
+ the current round to a round sufficient for us, do not wait for
+ that to happen and skip the acquire fence (we already
+ synchronize-with all other threads in our round through the
+ initial acquire MO fetch_add of IN. */
+ if (i <= cr)
+ goto ready_to_leave;
+ else
+ break;
+ }
+ }
- /* Wait for the event counter of the barrier to change. */
- do
- futex_wait_simple (&ibarrier->curr_event, event, futex_private);
- while (event == ibarrier->curr_event);
+ /* Wait until the current round is more recent than the round we are in. */
+ while (i > cr)
+ {
+ /* Wait for the current round to finish. */
+ futex_wait_simple (&bar->current_round, cr, bar->shared);
+ /* See the fence below. */
+ cr = atomic_load_relaxed (&bar->current_round);
}
- /* Make sure the init_count is stored locally or in a register. */
- unsigned int init_count = ibarrier->init_count;
+ /* Our round finished. Use the acquire MO fence to synchronize-with the
+ thread that finished the round, either through the initial load of
+ CURRENT_ROUND above or a failed CAS in the loop above. */
+ atomic_thread_fence_acquire ();
+
+ /* Now signal that we left. */
+ unsigned int o;
+ ready_to_leave:
+ /* We need release MO here so that our use of the barrier happens before
+ reset or memory reuse after pthread_barrier_destroy. */
+ o = atomic_fetch_add_release (&bar->out, 1) + 1;
+ if (o == max_in_before_reset)
+ {
+ /* Perform a reset if we are the last pre-reset thread leaving. All
+ other threads accessing the barrier are post-reset threads and are
+ incrementing or spinning on IN. Thus, resetting IN as the last step
+ of reset ensures that the reset is not concurrent with actual use of
+ the barrier. We need the acquire MO fence so that the reset happens
+ after use of the barrier by all earlier pre-reset threads. */
+ atomic_thread_fence_acquire ();
+ atomic_store_relaxed (&bar->current_round, 0);
+ atomic_store_relaxed (&bar->out, 0);
+ /* When destroying the barrier, we wait for a reset to happen. Thus,
+ we must load SHARED now so that this happens before the barrier is
+ destroyed. */
+ int shared = bar->shared;
+ atomic_store_release (&bar->in, 0);
+ futex_wake (&bar->in, INT_MAX, shared);
- /* If this was the last woken thread, unlock. */
- if (atomic_increment_val (&ibarrier->left) == init_count)
- /* We are done. */
- lll_unlock (ibarrier->lock, lll_private);
+ }
- return result;
+ /* Return a special value for exactly one thread per round. */
+ return i % count == 0 ? PTHREAD_BARRIER_SERIAL_THREAD : 0;
}
weak_alias (__pthread_barrier_wait, pthread_barrier_wait)