/*
 * Copyright (c) 2017 Richard Braun.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 *
 * This implementation is based on the paper "Algorithms for Scalable
 * Synchronization on Shared-Memory Multiprocessors" by John M. Mellor-Crummey
 * and Michael L. Scott, which describes MCS locks, among other algorithms.
 *
 * In order to avoid the need to allocate a qnode for every spin lock
 * currently held, and also to keep the size of locks to a single 32-bit
 * word, this module actually uses a variant of the MCS locks. The
 * differences are presented below.
 *
 * First, the lock owner is never part of the lock queue. This makes it
 * possible to use a qnode only during the lock operation, not after.
 * As a result, a single qnode per execution context is required even when
 * holding multiple spin locks simultaneously. In order to achieve that,
 * a spin lock not only refers to the last waiter, but also to the first,
 * which is the owner's qnode->next in the original algorithm.
 *
 * Next, instead of two pointers, the lock is a single word storing
 * compressed references to both the first and last waiters. Those
 * references are integers called QIDs, for qnode IDs. They can be
 * broken down into 3 parts:
 *  - the lock bit
 *  - the execution context
 *  - the target CPU ID
 *
 * The layout of a QID is carefully crafted to match the lock values
 * expected by the fast paths. Without contention, a QID value must
 * be SPINLOCK_UNLOCKED when a lock isn't held, and SPINLOCK_LOCKED
 * when it is. Besides, without contention, the reference to the first
 * waiter is logically NULL so that the whole lock value matches one
 * of SPINLOCK_UNLOCKED or SPINLOCK_LOCKED. This means that the values
 * of the execution context and the CPU ID must be 0 in the absence of
 * contention. In the presence of contention, the execution context and
 * CPU ID are used to uniquely identify a statically allocated qnode,
 * and fast path operations fail. The lock operation must make sure
 * that the lock value is restored to SPINLOCK_LOCKED if there is no
 * more contention, an operation called downgrading.
 *
 * In order to enforce correct visibility of the critical section,
 * acquire and release atomic operations are used when accessing a
 * qnode lock. Similarly, acquire and release semantics are also used
 * when accessing the lock word which stores the first and last QIDs,
 * so that the memory operations on the qnodes referenced by those QIDs
 * are correctly enforced. Accessing the next QID in a qnode, however,
 * is performed with no memory order constraint, because the qnodes
 * referenced by those QIDs are never accessed unless they are the first
 * or the last, cases which do enforce ordering.
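 *
 * As an illustration of the layout defined below, the lower 16 bits of
 * the lock word store the last waiter QID and the upper 16 bits store
 * the first waiter QID. Within a QID, bit 0 is the lock bit, bits 2:1
 * hold the execution context and bits 15:3 hold the CPU ID. The QID of
 * CPU 3 spinning in thread context is therefore
 * (3 << 3) | (1 << 1) | 1 == 0x1b, and once that single waiter has
 * published itself as the first waiter behind a fast-path owner, the
 * whole lock word reads (0x1b << 16) | 0x1b.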
 */

#include <assert.h>
#include <stdalign.h>
#include <stdbool.h>
#include <stddef.h>

#include <kern/atomic.h>
#include <kern/error.h>
#include <kern/init.h>
#include <kern/macros.h>
#include <kern/percpu.h>
#include <kern/spinlock.h>
#include <kern/spinlock_i.h>
#include <kern/thread.h>
#include <machine/cpu.h>

#define SPINLOCK_QID_LOCK_BIT   1

#define SPINLOCK_QID_CTX_BITS   2
#define SPINLOCK_QID_CTX_SHIFT  SPINLOCK_QID_LOCK_BIT
#define SPINLOCK_QID_CTX_MAX    (1 << SPINLOCK_QID_CTX_BITS)
#define SPINLOCK_QID_CTX_MASK   (SPINLOCK_QID_CTX_MAX - 1)

#define SPINLOCK_QID_CPU_BITS   13
#define SPINLOCK_QID_CPU_SHIFT  (SPINLOCK_QID_CTX_BITS \
                                 + SPINLOCK_QID_CTX_SHIFT)
#define SPINLOCK_QID_CPU_MAX    (1 << SPINLOCK_QID_CPU_BITS)
#define SPINLOCK_QID_CPU_MASK   (SPINLOCK_QID_CPU_MAX - 1)

#define SPINLOCK_QID_BITS       (SPINLOCK_QID_CPU_BITS \
                                 + SPINLOCK_QID_CTX_BITS \
                                 + SPINLOCK_QID_LOCK_BIT)
#define SPINLOCK_QID_MAX        (1 << SPINLOCK_QID_BITS)
#define SPINLOCK_QID_MASK       (SPINLOCK_QID_MAX - 1)

#define SPINLOCK_QID_MAX_BITS   16

#define SPINLOCK_QID_NULL       SPINLOCK_UNLOCKED
#define SPINLOCK_QID_LOCKED     SPINLOCK_LOCKED

#if SPINLOCK_QID_BITS > SPINLOCK_QID_MAX_BITS
#error "spinlock qid too large"
#endif

#if CONFIG_MAX_CPUS > (1 << SPINLOCK_QID_CPU_BITS)
#error "maximum number of supported processors too large"
#endif

struct spinlock_qnode {
    unsigned int next_qid;
    bool locked;
};

#define SPINLOCK_CTX_INVALID    0
#define SPINLOCK_CTX_THREAD     1
#define SPINLOCK_CTX_INTR       2
#define SPINLOCK_CTX_NMI        3
#define SPINLOCK_NR_CTXS        4

#if SPINLOCK_CTX_INVALID != 0
#error "the invalid context value must be 0"
#endif

#if SPINLOCK_NR_CTXS > SPINLOCK_QID_CTX_MAX
#error "maximum number of contexts too large"
#endif

struct spinlock_cpu_data {
    alignas(CPU_L1_SIZE) struct spinlock_qnode qnodes[SPINLOCK_NR_CTXS - 1];
};

static struct spinlock_cpu_data spinlock_cpu_data __percpu;

void
spinlock_init(struct spinlock *lock)
{
    lock->value = SPINLOCK_UNLOCKED;

#ifdef SPINLOCK_TRACK_OWNER
    lock->owner = NULL;
#endif /* SPINLOCK_TRACK_OWNER */
}

static unsigned int
spinlock_qid2ctx(unsigned int qid)
{
    return (qid >> SPINLOCK_QID_CTX_SHIFT) & SPINLOCK_QID_CTX_MASK;
}

static unsigned int
spinlock_qid2cpu(unsigned int qid)
{
    return (qid >> SPINLOCK_QID_CPU_SHIFT) & SPINLOCK_QID_CPU_MASK;
}

static unsigned int
spinlock_build_qid(unsigned int cpu, unsigned int ctx)
{
    return (cpu << SPINLOCK_QID_CPU_SHIFT)
           | (ctx << SPINLOCK_QID_CTX_SHIFT)
           | SPINLOCK_QID_LOCK_BIT;
}

static void
spinlock_get_local_qnode(struct spinlock_qnode **qnodep, unsigned int *qidp)
{
    struct spinlock_cpu_data *cpu_data;
    unsigned int ctx;

    cpu_data = cpu_local_ptr(spinlock_cpu_data);

    /* TODO NMI support */
    ctx = thread_interrupted() ? SPINLOCK_CTX_INTR : SPINLOCK_CTX_THREAD;
    *qnodep = &cpu_data->qnodes[ctx - 1];
    *qidp = spinlock_build_qid(cpu_id(), ctx);
}

static struct spinlock_qnode *
spinlock_get_remote_qnode(unsigned int qid)
{
    struct spinlock_cpu_data *cpu_data;
    unsigned int ctx, cpu;

    ctx = spinlock_qid2ctx(qid);

    if (ctx == SPINLOCK_CTX_INVALID) {
        return NULL;
    }

    ctx--;
    assert(ctx < ARRAY_SIZE(cpu_data->qnodes));

    cpu = spinlock_qid2cpu(qid);
    cpu_data = percpu_ptr(spinlock_cpu_data, cpu);
    return &cpu_data->qnodes[ctx];
}

static void
spinlock_store_first_qid(struct spinlock *lock, unsigned int newqid)
{
    unsigned int oldval, newval, prev;

    assert(newqid < SPINLOCK_QID_MAX);

    newqid <<= SPINLOCK_QID_MAX_BITS;

    do {
        oldval = atomic_load(&lock->value, ATOMIC_RELAXED);
        newval = newqid | (oldval & SPINLOCK_QID_MASK);
        prev = atomic_cas_release(&lock->value, oldval, newval);
    } while (prev != oldval);
}

static unsigned int
spinlock_load_first_qid(const struct spinlock *lock)
{
    unsigned int value;

    value = atomic_load_acquire(&lock->value);
    return (value >> SPINLOCK_QID_MAX_BITS) & SPINLOCK_QID_MASK;
}

static unsigned int
spinlock_swap_last_qid(struct spinlock *lock, unsigned int newqid)
{
    unsigned int oldval, newval, prev;

    assert(newqid < SPINLOCK_QID_MAX);

    do {
        oldval = atomic_load(&lock->value, ATOMIC_RELAXED);
        newval = (oldval & (SPINLOCK_QID_MASK << SPINLOCK_QID_MAX_BITS))
                 | newqid;
        prev = atomic_cas_acq_rel(&lock->value, oldval, newval);
    } while (prev != oldval);

    return prev & SPINLOCK_QID_MASK;
}

static unsigned int
spinlock_try_downgrade(struct spinlock *lock, unsigned int oldqid)
{
    unsigned int prev;

    prev = atomic_cas(&lock->value, oldqid, SPINLOCK_QID_LOCKED,
                      ATOMIC_RELAXED);

    assert((prev >> SPINLOCK_QID_MAX_BITS) == 0);
    assert(prev != SPINLOCK_QID_NULL);

    if (prev != oldqid) {
        return ERROR_BUSY;
    }

    return 0;
}
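/*
 * Illustrative trace of the slow paths below, with two processors
 * contending in thread context (the fast paths are assumed to be the
 * ones declared in kern/spinlock.h, operating on SPINLOCK_UNLOCKED and
 * SPINLOCK_LOCKED):
 *
 * 1. CPU0 acquires the lock through the fast path; the lock value is
 *    now SPINLOCK_LOCKED, a QID with the lock bit set and an invalid
 *    (zero) context.
 * 2. CPU1 fails the fast path and calls spinlock_lock_slow(). It swaps
 *    its own QID into the last QID field and gets SPINLOCK_QID_LOCKED
 *    back. Since that previous QID carries an invalid context, CPU1
 *    publishes its QID as the first waiter and spins on its qnode.
 * 3. CPU0 fails the unlock fast path and calls spinlock_unlock_slow(),
 *    which reads the first QID, resolves CPU1's qnode and clears its
 *    locked flag.
 * 4. CPU1 stops spinning, clears the first QID and tries to downgrade
 *    the lock value to SPINLOCK_LOCKED. If no other waiter queued in
 *    the meantime, the downgrade succeeds and CPU1 holds the lock with
 *    a plain SPINLOCK_LOCKED value; otherwise it waits for its
 *    successor's QID and publishes it as the new first waiter.
 */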
void
spinlock_lock_slow(struct spinlock *lock)
{
    struct spinlock_qnode *qnode, *prev_qnode;
    unsigned int qid, prev_qid, next_qid, ctx;
    bool locked;
    int error;

    spinlock_get_local_qnode(&qnode, &qid);
    qnode->next_qid = SPINLOCK_QID_NULL;

    prev_qid = spinlock_swap_last_qid(lock, qid);

    if (prev_qid != SPINLOCK_QID_NULL) {
        qnode->locked = true;
        ctx = spinlock_qid2ctx(prev_qid);

        if (ctx == SPINLOCK_CTX_INVALID) {
            spinlock_store_first_qid(lock, qid);
        } else {
            prev_qnode = spinlock_get_remote_qnode(prev_qid);
            atomic_store(&prev_qnode->next_qid, qid, ATOMIC_RELAXED);
        }

        for (;;) {
            locked = atomic_load_acquire(&qnode->locked);

            if (!locked) {
                break;
            }

            cpu_pause();
        }

        spinlock_store_first_qid(lock, SPINLOCK_QID_NULL);
    }

    spinlock_own(lock);
    error = spinlock_try_downgrade(lock, qid);

    if (!error) {
        return;
    }

    for (;;) {
        next_qid = atomic_load(&qnode->next_qid, ATOMIC_RELAXED);

        if (next_qid != SPINLOCK_QID_NULL) {
            break;
        }

        cpu_pause();
    }

    spinlock_store_first_qid(lock, next_qid);
}

void
spinlock_unlock_slow(struct spinlock *lock)
{
    struct spinlock_qnode *next_qnode;
    unsigned int next_qid;

    for (;;) {
        next_qid = spinlock_load_first_qid(lock);

        if (next_qid != SPINLOCK_QID_NULL) {
            break;
        }

        cpu_pause();
    }

    next_qnode = spinlock_get_remote_qnode(next_qid);
    atomic_store_release(&next_qnode->locked, false);
}

static int __init
spinlock_setup(void)
{
    return 0;
}

INIT_OP_DEFINE(spinlock_setup,
               INIT_OP_DEP(thread_setup_booter, true));
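/*
 * Usage sketch, assuming the spinlock_lock() and spinlock_unlock() fast
 * paths declared in kern/spinlock.h:
 *
 *  static struct spinlock example_lock;
 *  static unsigned long example_counter;
 *
 *  void
 *  example_init(void)
 *  {
 *      spinlock_init(&example_lock);
 *  }
 *
 *  void
 *  example_increment(void)
 *  {
 *      spinlock_lock(&example_lock);
 *      example_counter++;
 *      spinlock_unlock(&example_lock);
 *  }
 *
 * The example_* names are placeholders for caller code; only the
 * contended cases reach spinlock_lock_slow() and spinlock_unlock_slow().
 */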