pthread.c - public/m5threads - Git at Google

 /*
     m5threads, a pthread library for the M5 simulator
     Copyright (C) 2009, Stanford University

     This library is free software; you can redistribute it and/or
     modify it under the terms of the GNU Lesser General Public
     License as published by the Free Software Foundation; either
     version 2.1 of the License, or (at your option) any later version.

     This library is distributed in the hope that it will be useful,
     but WITHOUT ANY WARRANTY; without even the implied warranty of
     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     Lesser General Public License for more details.

     You should have received a copy of the GNU Lesser General Public
     License along with this library; if not, write to the Free Software
     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA

     Author: Daniel Sanchez
 */

 #include <unistd.h>
 #include <assert.h>
 #include <pthread.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <signal.h>
 #include <sys/errno.h>
 #include <sched.h>
 #include <sys/mman.h>
 #include <string.h>
 #include <malloc.h>
 #include <sys/syscall.h>

 //Spinlock assembly
 #if defined(__x86) || defined(__x86_64)
   #include "spinlock_x86.h"
 #elif defined(__alpha)
   #include "spinlock_alpha.h"
 #elif defined(__sparc)
   #include "spinlock_sparc.h"
 #elif defined (__arm__)
   #include "spinlock_arm.h"
 #else
   #error "spinlock routines not available for your arch!\n"
 #endif

 #include "pthread_defs.h"
 #include "tls_defs.h"
 #include "profiling_hooks.h"

 #define restrict

 //64KB stack, change to your taste...
 #define CHILD_STACK_BITS 16
 #define CHILD_STACK_SIZE (1 << CHILD_STACK_BITS)

 //Debug macro
 #ifdef __DEBUG
   #define DEBUG(args...) printf(args)
 #else
   #define DEBUG(args...)
 #endif

 //Size and alignment requirements of "real" (NPTL/LinuxThreads) thread control block
 #define NPTL_TCB_SIZE 1184 // sizeof (struct pthread)
 #define NPTL_TCB_ALIGN sizeof(double)
 #define NPTL_TCBHEAD_T_SIZE (sizeof(tcbhead_t))

 //Thread control structure
 typedef struct {
   pthread_t tid;                   // thread ID; pthread_create stores the block address here
   unsigned int is_detached;        // 0 = joinable, 1 = detached
   volatile int child_finished;     // set to 1 by the child in pthread_exit, polled by pthread_join
   void *result;                    // exit value, written by the child on exit
   void *(*start_routine)(void *);  // thread entry point
   void *arg;                       // argument handed to start_routine
   // Limits of the thread block
   void *tls_start_addr;            // address passed to setup_thread_tls (just past this struct)
   void *stack_start_addr;          // initial stack pointer passed to clone()
 } pthread_tcb_t;


 //Information about the thread block (TLS, sizes)
 static struct {
   size_t tls_memsz;         // in-memory size of the TLS segment (p_memsz)
   size_t tls_filesz;        // bytes of initialized TLS data to copy (p_filesz)
   void  *tls_initimage;     // address of the TLS initialization image (p_vaddr)
   size_t tls_align;         // TLS segment alignment (p_align); 0 when there is no TLS segment
   size_t total_size;        // bytes mmap'ed for each thread block
   size_t stack_guard_size;  // bytes left above the initial stack pointer
 } thread_block_info;


 /* Thread-local data */

 //Pointer to our TCB (NULL for main thread)
 //Assigned in __pthread_trampoline once the child's TLS has been set up.
 __thread pthread_tcb_t* __tcb;

 // Used for TSD (getspecific, setspecific, etc.)
 // Per-thread array of key values indexed by pthread_key_t; allocated by pthread_setspecific.
 __thread void** pthread_specifics = NULL; //dynamically allocated, since this is rarely used
 // Exclusive upper bound on the keys pthread_getspecific will look up; 0 until the array exists.
 __thread uint32_t pthread_specifics_size = 0;


 /* Initialization, create/exit/join functions */

 // Search ELF segments, pull out TLS block info, campute thread block sizes
 static void populate_thread_block_info() {
   ElfW(Phdr) *phdr;

   //If there is no TLS segment...
   thread_block_info.tls_memsz = 0;
   thread_block_info.tls_filesz = 0;
   thread_block_info.tls_initimage = NULL;
   thread_block_info.tls_align = 0;

   /* Look through the TLS segment if there is any.  */
   if (_dl_phdr != NULL) {
     for (phdr = _dl_phdr; phdr < &_dl_phdr[_dl_phnum]; ++phdr) {
       if (phdr->p_type == PT_TLS) {
           /* Gather the values we need.  */
           thread_block_info.tls_memsz = phdr->p_memsz;
           thread_block_info.tls_filesz = phdr->p_filesz;
           thread_block_info.tls_initimage = (void *) phdr->p_vaddr;
           thread_block_info.tls_align = phdr->p_align;
           break;
       }
     }
   }

   //Set a stack guard size
   //In SPARC, this is actually needed to avoid out-of-range accesses on register saves...
   //Largest I have seen is 2048 (sparc64)
   //You could avoid this in theory by compiling with -mnostack-bias
   thread_block_info.stack_guard_size = 2048;

   //Total thread block size -- this is what we'll request to mmap
   #if TLS_TCB_AT_TP
   size_t sz = sizeof(pthread_tcb_t) + thread_block_info.tls_memsz + NPTL_TCBHEAD_T_SIZE + thread_block_info.stack_guard_size + CHILD_STACK_SIZE;
   #elif TLS_DTV_AT_TP
   size_t sz = sizeof(pthread_tcb_t) + thread_block_info.tls_memsz + NPTL_TCB_SIZE + NPTL_TCBHEAD_T_SIZE + thread_block_info.stack_guard_size + CHILD_STACK_SIZE;
   #else
   #error "TLS_TCB_AT_TP xor TLS_DTV_AT_TP must be defined"
   #endif
   //Note that TCB_SIZE is the "real" TCB size, not ours, which we leave zeroed (but some variables, notably errno, are somewhere inside there)

   //Align to multiple of CHILD_STACK_SIZE
   sz += CHILD_STACK_SIZE - 1;
   thread_block_info.total_size = (sz>>CHILD_STACK_BITS)<<CHILD_STACK_BITS;
 }

 //Set up TLS block in current thread: copy the TLS init image into place and
 //install the thread pointer.
 // @param th_block_addr:  beginning of entire thread memory space
 //                        (callers pass the address just past our pthread_tcb_t)
 static void setup_thread_tls(void* th_block_addr) {
   size_t tcb_offset = 0;
   void *tlsblock = NULL;
   char *tls_start_ptr = NULL;
   char *base = (char *) th_block_addr; //char* so the arithmetic below is standard C

   //tls_align is 0 when the executable has no PT_TLS segment. Treat that as 1,
   //otherwise the alignment mask below is 0 and the block address becomes NULL.
   const size_t align = thread_block_info.tls_align ? thread_block_info.tls_align : 1;

   #if TLS_DTV_AT_TP
   base += NPTL_TCB_SIZE;
   #endif

   /* Compute the (real) TCB offset */
   #if TLS_DTV_AT_TP
   tcb_offset = roundup(NPTL_TCBHEAD_T_SIZE, NPTL_TCB_ALIGN);
   #elif TLS_TCB_AT_TP
   tcb_offset = roundup(thread_block_info.tls_memsz, NPTL_TCB_ALIGN);
   #else
   #error "TLS_TCB_AT_TP xor TLS_DTV_AT_TP must be defined"
   #endif

   /* Align the TLS block.  */
   tlsblock = (void *) (((uintptr_t) base + align - 1) & ~((uintptr_t) align - 1));

   /* Initialize the TLS block.  */
   #if TLS_DTV_AT_TP
   tls_start_ptr = ((char *) tlsblock + tcb_offset);
   #elif TLS_TCB_AT_TP
   tls_start_ptr = ((char *) tlsblock + tcb_offset
                        - roundup (thread_block_info.tls_memsz, align));
   #else
   #error "TLS_TCB_AT_TP xor TLS_DTV_AT_TP must be defined"
   #endif

   //Copy the initialized part of the TLS image. Skipped when there is nothing
   //to copy, since memcpy must not be handed a NULL source even for 0 bytes.
   if (thread_block_info.tls_filesz != 0) {
     memcpy (tls_start_ptr, thread_block_info.tls_initimage, thread_block_info.tls_filesz);
   }

   //Rest of tls vars are already cleared (mmap returns zeroed memory)

   //Note: We don't care about DTV pointers for x86/SPARC -- they're never used in static mode
   /* Initialize the thread pointer.  */
   #if TLS_DTV_AT_TP
   TLS_INIT_TP (tlsblock, 0);
   #elif TLS_TCB_AT_TP
   TLS_INIT_TP ((char *) tlsblock + tcb_offset, 0);
   #else
   #error "TLS_TCB_AT_TP xor TLS_DTV_AT_TP must be defined"
   #endif
 }

 //Some NPTL definitions
 //NOTE(review): presumably these provide symbols glibc expects from its thread library -- confirm
 int __libc_multiple_threads; //set to one on initialization
 int __nptl_nthreads = 32; //TODO: we don't really know...

 //Called at initialization. Sets up TLS for the main thread and populates thread_block_info, used in subsequent calls
 //Works with LinuxThreads and NPTL
 void __pthread_initialize_minimal() {
   __libc_multiple_threads = 1; //tell libc we're multithreaded (NPTL-specific)
   populate_thread_block_info();
   void* ptr = mmap(0, thread_block_info.total_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
   if (ptr == MAP_FAILED) {
     //Without a TLS block the process cannot run; stop here instead of
     //handing MAP_FAILED to the TLS setup and crashing somewhere obscure.
     _exit(1);
   }
   //TLS area starts right after the space a pthread_tcb_t would occupy,
   //mirroring the layout pthread_create uses for child threads
   setup_thread_tls((char*) ptr + sizeof(pthread_tcb_t));
 }


 //Used by pthread_create to spawn child
 static int __pthread_trampoline(void* thr_ctrl) {
   //Set TLS up
   pthread_tcb_t* tcb = (pthread_tcb_t*) thr_ctrl;
   setup_thread_tls(tcb->tls_start_addr);
   __tcb = tcb;
   DEBUG("Child in trampoline, TID=%llx\n", tcb->tid);

   void* result = tcb->start_routine(tcb->arg);
   pthread_exit(result);
   assert(0); //should never be reached
 }

 // Create a thread that runs start_routine(arg).
 // @param thread:        receives the new thread's ID (the address of its thread block)
 // @param attr:          ignored
 // @param start_routine: entry point of the new thread
 // @param arg:           argument passed to start_routine
 // @return 0 on success, EAGAIN if the thread block cannot be mapped or clone() fails
 int pthread_create (pthread_t* thread,
                     const pthread_attr_t* attr,
                     void *(*start_routine)(void*),
                     void* arg) {
   DEBUG("pthread_create: start\n");

   //Allocate the child thread block (TCB+TLS+stack area)
   //We use mmap so that the child can munmap it at exit without using a stack (it's a system call)
   void* thread_block;
   size_t thread_block_size = thread_block_info.total_size;
   thread_block = mmap(0, thread_block_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
   if (thread_block == MAP_FAILED) {
     return EAGAIN; //out of resources: report it instead of writing through MAP_FAILED
   }
   DEBUG("pthread_create: mmapped child thread block 0x%llx -- 0x%llx\n", thread_block, ((char*)thread_block) + thread_block_size) ;

   //Populate the thread control block
   pthread_tcb_t* tcb = (pthread_tcb_t*) thread_block;
   tcb->tid = (pthread_t) thread_block; //thread ID is tcb address itself
   tcb->is_detached = 0; //joinable
   tcb->child_finished = 0;
   tcb->start_routine = start_routine;
   tcb->arg = arg;
   tcb->tls_start_addr = (void*)(((char*)thread_block) + sizeof(pthread_tcb_t)); //right after m5's tcb
   tcb->stack_start_addr = (void*) (((char*) thread_block) + thread_block_size - thread_block_info.stack_guard_size); //end of thread_block

   *thread=(pthread_t) thread_block;

   //Call clone()
   DEBUG("pthread_create: prior to clone()\n");
   int child = clone(__pthread_trampoline, tcb->stack_start_addr, CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD, tcb);
   if (child == -1) {
     //No child was started, so nobody else references the block: release it
     munmap(thread_block, thread_block_size);
     return EAGAIN;
   }
   DEBUG("pthread_create: after clone()\n");
   return 0;
 }

 pthread_t pthread_self() {
     if (__tcb == NULL) return 0; //main thread
     return __tcb->tid;
 }

 // Busy-wait until the given thread has finished, then fetch its exit value.
 // @param thread: ID returned by pthread_create (the address of the child's TCB)
 // @param status: if non-NULL, receives the child's result
 // @return always 0
 int pthread_join (pthread_t thread, void** status) {
     DEBUG("pthread_join: started\n");
     pthread_tcb_t* target = (pthread_tcb_t*) thread;
     assert(target->tid == thread); // checks that this is really a tcb
     assert(!target->is_detached); // thread should be joinable

     // child_finished is volatile, so every iteration re-reads it
     while (target->child_finished == 0) {
         // spin until child done
     }
     DEBUG("pthread_join: child joined\n");

     //Get result
     if (status != NULL) {
         *status = target->result;
     }

     //The child block is not deallocated here (the munmap call is disabled):
     //munmap(child_tcb, thread_block_info.total_size);

     return 0;
 }


 // Terminate the calling thread, publishing status for pthread_join.
 // The main thread (no TCB) ends the whole process with _exit(0).
 void pthread_exit (void* status) {
     // TODO: The good way to solve this is to have the child, not its parent, free
     // its own stack (and TLS segment). This enables detached threads. But to do this
     // you need an extra stack. A way to do this is to have a global, lock-protected
     // manager stack, or have the M5 exit system call do it... Anyhow, I'm deferring
     // this problem until we have TLS.

     //From point (XXX)  on, the thread **does not exist**,
     //as its parent may have already freed the stack.
     //Ideally sys_exit would be called without using the stack (i.e. from asm);
     //for now the plain syscall() wrapper is used.

     // NOTE: You may be tempted to call exit(0) or _exit(0) here, but these call exit_group,
     // killing the whole process and not just the current thread

     //Release the keys array if it was allocated (free(NULL) is a no-op)
     free(pthread_specifics);

     //Main thread
     if (__tcb == NULL) _exit(0);

     DEBUG("Child TID=0x%llx in pthread_exit...\n", pthread_self() );
     __tcb->result = status;
     //Full barrier: the result must be visible to the joiner before it can
     //observe child_finished == 1 (matters on weakly ordered CPUs)
     __sync_synchronize();
     __tcb->child_finished = 1;
     //XXX
     syscall(__NR_exit,0);
     assert(0); //should never be reached
 }


 // mutex functions

 // Reset the mutex's lock word to 0. attr is ignored. Always returns 0.
 int pthread_mutex_init (pthread_mutex_t* mutex, const pthread_mutexattr_t* attr) {
     DEBUG("%s: start\n", __FUNCTION__);
     (void) attr;
     mutex->PTHREAD_MUTEX_T_COUNT = 0;
     return 0;
 }

 // Acquire the mutex by spinning on its lock word; the acquisition is
 // bracketed by the lock profiling hooks. Always returns 0.
 int pthread_mutex_lock (pthread_mutex_t* lock) {
     DEBUG("%s: start\n", __FUNCTION__);
     int* word = (int*)&lock->PTHREAD_MUTEX_T_COUNT;
     PROFILE_LOCK_START(lock);
     spin_lock(word);
     PROFILE_LOCK_END(lock);
     return 0;
 }

 // Release the mutex's spinlock; bracketed by the unlock profiling hooks.
 // Always returns 0.
 int pthread_mutex_unlock (pthread_mutex_t* lock) {
     DEBUG("%s: start\n", __FUNCTION__);
     int* word = (int*)&lock->PTHREAD_MUTEX_T_COUNT;
     PROFILE_UNLOCK_START(lock);
     spin_unlock(word);
     PROFILE_UNLOCK_END(lock);
     return 0;
 }

 // Nothing to release for a spinlock-based mutex. Always returns 0.
 int pthread_mutex_destroy (pthread_mutex_t* mutex) {
     DEBUG("%s: start\n", __FUNCTION__);
     (void) mutex;
     return 0;
 }

 // Try to take the mutex without spinning.
 // @return 0 if trylock() reported 1 (acquired), EBUSY otherwise
 int pthread_mutex_trylock (pthread_mutex_t* mutex) {
     DEBUG("%s: start\n", __FUNCTION__);
     if (trylock((int*)&mutex->PTHREAD_MUTEX_T_COUNT) != 1) {
         return EBUSY;
     }
     //Profiling not really accurate here...
     PROFILE_LOCK_START(mutex);
     PROFILE_LOCK_END(mutex);
     return 0;
 }

 // rwlock functions

 // Put the rwlock in its idle state: no readers, no writer. attr is ignored.
 // Always returns 0.
 int pthread_rwlock_init (pthread_rwlock_t* lock, const pthread_rwlockattr_t* attr) {
     DEBUG("%s: start\n", __FUNCTION__);
     (void) attr;
     PTHREAD_RWLOCK_T_WRITER(lock) = -1; // -1 means no one owns the write lock
     PTHREAD_RWLOCK_T_READERS(lock) = 0;
     PTHREAD_RWLOCK_T_LOCK(lock) = 0; // used only with spin_lock, so we know to initialize to zero
     return 0;
 }

 // Nothing to release. Always returns 0.
 int pthread_rwlock_destroy (pthread_rwlock_t* lock) {
     DEBUG("%s: start\n", __FUNCTION__);
     (void) lock;
     return 0;
 }

 // Acquire the lock for reading; spins until no writer is recorded, then
 // bumps the reader count under the internal spinlock. Always returns 0.
 int pthread_rwlock_rdlock (pthread_rwlock_t* lock) {
     DEBUG("%s: start\n", __FUNCTION__);
     PROFILE_LOCK_START(lock);
     for (;;) {
         // Wait outside the spinlock first: this reduces contention (and a
         // possible live-lock) on the internal lock
         pthread_t writer;
         do {
             writer = PTHREAD_RWLOCK_T_WRITER(lock);
         } while (writer != -1);

         // Re-check under the spinlock, since a writer may have slipped in
         spin_lock((int*)&(PTHREAD_RWLOCK_T_LOCK(lock)));
         int granted = ((pthread_t)PTHREAD_RWLOCK_T_WRITER(lock) == -1);
         if (granted) {
             PTHREAD_RWLOCK_T_READERS(lock)++;
         }
         spin_unlock((int*)&(PTHREAD_RWLOCK_T_LOCK(lock)));

         if (granted) {
             PROFILE_LOCK_END(lock);
             return 0;
         }
     }
 }

 // Acquire the lock for writing; spins until no writer is recorded and no
 // readers remain, then records the caller as the writer. Always returns 0.
 int pthread_rwlock_wrlock (pthread_rwlock_t* lock) {
     DEBUG("%s: start\n", __FUNCTION__);
     PROFILE_LOCK_START(lock);
     for (;;) {
         // Wait outside the spinlock until the lock looks completely free.
         // Both conditions must hold before taking the spinlock is worthwhile;
         // leaving the wait when only one of them holds would mean hammering
         // the spinlock for as long as the lock stays busy.
         for (;;) {
             pthread_t writer = PTHREAD_RWLOCK_T_WRITER(lock);
             int num_readers = PTHREAD_RWLOCK_T_READERS(lock);
             if (writer == -1 && num_readers == 0) {
                 break;
             }
         }

         // The authoritative check happens under the spinlock
         spin_lock((int*)&(PTHREAD_RWLOCK_T_LOCK(lock)));
         if ((pthread_t)PTHREAD_RWLOCK_T_WRITER(lock) == -1 && PTHREAD_RWLOCK_T_READERS(lock) == 0) {
             PTHREAD_RWLOCK_T_WRITER(lock) = pthread_self();
             spin_unlock((int*)&(PTHREAD_RWLOCK_T_LOCK(lock)));
             PROFILE_LOCK_END(lock);
             return 0;
         }
         spin_unlock((int*)&(PTHREAD_RWLOCK_T_LOCK(lock)));
     }
 }

 // Release the rwlock: clears the writer if the caller is the recorded
 // writer, otherwise drops one reader. Always returns 0.
 int pthread_rwlock_unlock (pthread_rwlock_t* lock) {
     DEBUG("%s: start\n", __FUNCTION__);
     PROFILE_UNLOCK_START(lock);
     spin_lock((int*)&(PTHREAD_RWLOCK_T_LOCK(lock)));
     if (pthread_self() != PTHREAD_RWLOCK_T_WRITER(lock)) {
         // caller is not the writer: one of the read locks is released
         PTHREAD_RWLOCK_T_READERS(lock) = PTHREAD_RWLOCK_T_READERS(lock) - 1;
     } else {
         // caller owns the write lock: release it
         PTHREAD_RWLOCK_T_WRITER(lock) = -1;
     }
     spin_unlock((int*)&(PTHREAD_RWLOCK_T_LOCK(lock)));
     PROFILE_UNLOCK_END(lock);
     return 0;
 }


 // key functions
 #ifndef PTHREAD_KEYS_MAX
 #define PTHREAD_KEYS_MAX 1024
 #endif

 typedef struct {
   int in_use;              // nonzero while the key is allocated
   void (*destr)(void *);   // destructor registered in pthread_key_create (may be NULL)
 } pthread_key_struct;

 //Key table shared by all threads: slot i describes key i
 static pthread_key_struct pthread_keys[PTHREAD_KEYS_MAX];
 //Held by pthread_key_create / pthread_key_delete while they touch the table
 static pthread_mutex_t pthread_keys_mutex = PTHREAD_MUTEX_INITIALIZER;

 // Allocate the lowest free key.
 // @param key:        receives the new key on success
 // @param destructor: stored in the key's table slot
 // @return 0 on success, EAGAIN when all PTHREAD_KEYS_MAX keys are in use
 int pthread_key_create (pthread_key_t* key, void (*destructor)(void*)) {
   int slot = 0;
   DEBUG("%s: start\n", __FUNCTION__);

   pthread_mutex_lock(&pthread_keys_mutex);
   /* Skip over keys that are already taken */
   while (slot < PTHREAD_KEYS_MAX && pthread_keys[slot].in_use) {
     slot++;
   }
   if (slot == PTHREAD_KEYS_MAX) {
     pthread_mutex_unlock(&pthread_keys_mutex);
     return EAGAIN;
   }

   /* Mark key in use */
   pthread_keys[slot].in_use = 1;
   pthread_keys[slot].destr = destructor;
   pthread_mutex_unlock(&pthread_keys_mutex);
   *key = slot;
   return 0;
 }

 // Free a key so pthread_key_create can hand it out again.
 // @return 0 on success, EINVAL if key is out of range or not in use
 int pthread_key_delete (pthread_key_t key)
 {
   int rc = EINVAL;
   DEBUG("%s: start\n", __FUNCTION__);

   pthread_mutex_lock(&pthread_keys_mutex);
   if (key < PTHREAD_KEYS_MAX && pthread_keys[key].in_use) {
     pthread_keys[key].in_use = 0;
     pthread_keys[key].destr = NULL;
     rc = 0;

     /* NOTE: The LinuxThreads implementation actually zeroes deleted keys on
        spawned threads. I don't care, the spec says that if you access a
        key after it has been deleted, you're on your own. */
   }
   pthread_mutex_unlock(&pthread_keys_mutex);
   return rc;
 }

 // Store value under key for the calling thread.
 // The per-thread value array is allocated lazily on the first call.
 // @return 0 on success, EINVAL if key is out of range, ENOMEM if the array
 //         cannot be allocated
 int pthread_setspecific (pthread_key_t key, const void* value) {
   DEBUG("%s: start\n", __FUNCTION__);
   if (key >= PTHREAD_KEYS_MAX) return EINVAL;
   if (pthread_specifics == NULL) {
      pthread_specifics = (void**) calloc(PTHREAD_KEYS_MAX + 1, sizeof(void*));
      if (pthread_specifics == NULL) return ENOMEM;
      DEBUG("pthread_setspecific: malloc of size %d bytes, got 0x%llx\n", (int) ((PTHREAD_KEYS_MAX + 1) * sizeof(void*)), pthread_specifics);
      //The array has room for every key, so expose all of them to
      //pthread_getspecific. Recording only the first key + 1 here would make
      //values stored later under larger keys unreadable.
      pthread_specifics_size = PTHREAD_KEYS_MAX;
   }
   pthread_specifics[key] = (void*) value;
   return 0;
 }

 // Look up the calling thread's value for key.
 // @return the stored value, or NULL when key lies outside
 //         [0, pthread_specifics_size)
 void* pthread_getspecific (pthread_key_t key) {
   if (!(key >= 0 && key < pthread_specifics_size)) {
     return NULL;
   }
   DEBUG("pthread_getspecific: key=%d pthread_specifics_size=%d\n", key, pthread_specifics_size);
   return pthread_specifics[key];
 }

 // condition variable functions

 // Reset the condition variable: flag cleared, no waiters, count lock at 0.
 // attr is ignored. Always returns 0.
 int pthread_cond_init (pthread_cond_t* cond, const pthread_condattr_t* attr) {
     DEBUG("%s: start\n", __FUNCTION__);
     (void) attr;
     PTHREAD_COND_T_COUNT_LOCK(cond) = 0;
     PTHREAD_COND_T_THREAD_COUNT(cond) = 0;
     PTHREAD_COND_T_FLAG(cond) = 0;
     return 0;
 }

 // Nothing to release. Always returns 0.
 int pthread_cond_destroy (pthread_cond_t* cond) {
     DEBUG("%s: start\n", __FUNCTION__);
     (void) cond;
     return 0;
 }

 // Raise the condition's flag, which is what pthread_cond_wait spins on.
 // Always returns 0.
 int pthread_cond_broadcast (pthread_cond_t* cond) {
     DEBUG("%s: start\n", __FUNCTION__);
     PTHREAD_COND_T_FLAG(cond) = 1; // waiters leave their spin loop when they see 1
     return 0;
 }

 // Wait on cond: register as a waiter, release lock, spin until the flag is
 // raised by a broadcast, then re-acquire lock. The last waiter to leave
 // clears the flag. Always returns 0.
 int pthread_cond_wait (pthread_cond_t* cond, pthread_mutex_t* lock) {
     DEBUG("%s: start\n", __FUNCTION__);
     PROFILE_COND_WAIT_START(cond);
     volatile int* thread_count  = &(PTHREAD_COND_T_THREAD_COUNT(cond));
     volatile int* flag = &(PTHREAD_COND_T_FLAG(cond));
     volatile int* count_lock    = &(PTHREAD_COND_T_COUNT_LOCK(cond));

     // The waiter count is decremented under count_lock by threads that no
     // longer hold the mutex, so the increment must take count_lock as well;
     // the caller's mutex alone does not exclude those decrements.
     // dsm: ++/-- have higher precedence than *, so *thread_count++
     // increments *the pointer*, then dereferences it (!)
     spin_lock(count_lock);
     (*thread_count)++;
     spin_unlock(count_lock);

     pthread_mutex_unlock(lock);
     while (1) {
         volatile int f = *flag;
         if (f == 1) {
             break;
         }
     }

     spin_lock(count_lock);

     (*thread_count)--;

     // last waiter out re-arms the condition
     if (*thread_count == 0) {
         *flag = 0;
     }
     spin_unlock(count_lock);
     pthread_mutex_lock(lock);
     PROFILE_COND_WAIT_END(cond);
     return 0;
 }

 // Wake waiters on cond. Implemented as a broadcast; returns its result.
 int pthread_cond_signal (pthread_cond_t* cond) {
     DEBUG("%s: start\n", __FUNCTION__);
     //Could also signal only one thread, but this is compliant too
     //TODO: Just wake one thread up
     int rc = pthread_cond_broadcast(cond);
     return rc;
 }


 //barrier functions

 //These funny tree barriers will only work with consecutive TIDs starting from 0, e.g. a barrier initialized for 8 thread will need to be taken by TIDs 0-7
 //TODO: Adapt to work with arbitrary TIDs
 /*int pthread_barrier_init (pthread_barrier_t *restrict barrier,
                           const pthread_barrierattr_t *restrict attr, unsigned count)
 {
     assert(barrier != NULL);
     //assert(0 < count && count <= MAX_NUM_CPUS);

     PTHREAD_BARRIER_T_NUM_THREADS(barrier) = count;

     // add one to avoid false sharing
     tree_barrier_t* ptr
         = ((tree_barrier_t*)malloc((count + 1) * sizeof(tree_barrier_t))) + 1;
     for (unsigned i = 0; i < count; ++i) {
       ptr[i].value = 0;
     }

     PTHREAD_BARRIER_T_BARRIER_PTR(barrier) = ptr;

     return 0;
 }

 int pthread_barrier_destroy (pthread_barrier_t *barrier)
 {
     free(PTHREAD_BARRIER_T_BARRIER_PTR(barrier) - 1);
     return 0;
 }

 int pthread_barrier_wait (pthread_barrier_t* barrier)
 {
     int const num_threads = PTHREAD_BARRIER_T_NUM_THREADS(barrier);
     int const self = pthread_self();
     tree_barrier_t * const barrier_ptr = PTHREAD_BARRIER_T_BARRIER_PTR(barrier);

     int const goal = 1 - barrier_ptr[self].value;

     int round_mask = 3;
     while ((self & round_mask) == 0 && round_mask < (num_threads << 2)) {
       int const spacing = (round_mask + 1) >> 2;
       for (int i = 1; i <= 3 && self + i*spacing < num_threads; ++i) {
         while (barrier_ptr[self + i*spacing].value != goal) {
           // spin
         }
       }
       round_mask = (round_mask << 2) + 3;
     }

     barrier_ptr[self].value = goal;
     while (barrier_ptr[0].value != goal) {
       // spin
     }

     return 0;
 }*/

 // Initialize a counting barrier for count threads: counter at 0, direction
 // "up", spinlock at 0. attr is ignored. Always returns 0.
 int pthread_barrier_init (pthread_barrier_t *restrict barrier,
                           const pthread_barrierattr_t *restrict attr, unsigned count)
 {
     assert(barrier != NULL);
     DEBUG("%s: start\n", __FUNCTION__);
     (void) attr;

     PTHREAD_BARRIER_T_DIRECTION(barrier) = 0; //up
     PTHREAD_BARRIER_T_COUNTER(barrier) = 0;
     PTHREAD_BARRIER_T_SPINLOCK(barrier) = 0;
     PTHREAD_BARRIER_T_NUM_THREADS(barrier) =  count;

     return 0;
 }

 // Nothing to do: the barrier owns no resources. Always returns 0.
 int pthread_barrier_destroy (pthread_barrier_t *barrier)
 {
     DEBUG("%s: start\n", __FUNCTION__);
     (void) barrier;
     return 0;
 }

 // Block until every participating thread has arrived.
 // The barrier alternates direction: counting up to NUM_THREADS flips it to
 // "down", counting back to 0 flips it to "up". A thread is released once
 // the direction differs from the one it saw on entry. Always returns 0.
 int pthread_barrier_wait (pthread_barrier_t* barrier)
 {
     DEBUG("%s: start\n", __FUNCTION__);
     PROFILE_BARRIER_WAIT_START(barrier);
     int const entry_direction = PTHREAD_BARRIER_T_DIRECTION(barrier); //0 == up, 1 == down

     spin_lock(&(PTHREAD_BARRIER_T_SPINLOCK(barrier)));
     if (entry_direction == 0) {
         PTHREAD_BARRIER_T_COUNTER(barrier)++;
         if (PTHREAD_BARRIER_T_COUNTER(barrier) == PTHREAD_BARRIER_T_NUM_THREADS(barrier)) {
             //last arrival: reverse direction, now down
             PTHREAD_BARRIER_T_DIRECTION(barrier) = 1;
         }
     } else {
         PTHREAD_BARRIER_T_COUNTER(barrier)--;
         if (PTHREAD_BARRIER_T_COUNTER(barrier) == 0) {
             //last arrival: reverse direction, now up
             PTHREAD_BARRIER_T_DIRECTION(barrier) = 0;
         }
     }
     spin_unlock(&(PTHREAD_BARRIER_T_SPINLOCK(barrier)));

     //spin until the direction flips
     volatile int seen;
     do {
         seen = PTHREAD_BARRIER_T_DIRECTION(barrier);
     } while (seen == entry_direction);

     PROFILE_BARRIER_WAIT_END(barrier);
     return 0;
 }

 //misc functions

 // One mutex serializes all pthread_once initializers.
 static pthread_mutex_t __once_mutex = PTHREAD_MUTEX_INITIALIZER;

 // Run init() at most once per once-control object.
 // @param once: control object, initially PTHREAD_ONCE_INIT
 // @param init: routine to run the first time
 // @return always 0
 // NOTE: init() runs with __once_mutex held, so an init routine that itself
 // calls pthread_once will spin forever on that mutex.
 int pthread_once (pthread_once_t* once,
                   void (*init)(void))
 {
   DEBUG("%s: start\n", __FUNCTION__);
   //fast path
   if (*once != PTHREAD_ONCE_INIT) return 0;
   pthread_mutex_lock(&__once_mutex);
   if (*once == PTHREAD_ONCE_INIT) {
     init();
     //Mark as done only after init() has completed (with a barrier in
     //between). Otherwise another thread could take the fast path above and
     //use state that init() has not finished setting up.
     __sync_synchronize();
     *once = PTHREAD_ONCE_INIT+1;
   }
   pthread_mutex_unlock(&__once_mutex);
   return 0;
 }

 #ifndef __USE_EXTERN_INLINES
 // Compare two thread IDs: nonzero if they are the same, 0 otherwise.
 int pthread_equal (pthread_t t1, pthread_t t2)
 {
     if (t1 == t2) {
         return 1;
     }
     return 0;
 }
 #endif

 // Functions that we want defined, but we don't use them
 // All other functions are not defined so that they will cause a compile time
 // error and we can decide if we need to do something with them

 // functions really don't need to do anything

 // No-op: there is nothing else to yield to. Always returns 0.
 int pthread_yield() {
     DEBUG("%s: start\n", __FUNCTION__);
     return 0;
 }

 // No-op: attr is left untouched. Always returns 0.
 int pthread_attr_init (pthread_attr_t* attr) {
     DEBUG("%s: start\n", __FUNCTION__);
     (void) attr;
     return 0;
 }

 // No-op: the requested scope is ignored. Always returns 0.
 int pthread_attr_setscope (pthread_attr_t* attr, int scope) {
     DEBUG("%s: start\n", __FUNCTION__);
     (void) attr;
     (void) scope;
     return 0;
 }

 // No-op: attr is left untouched. Always returns 0.
 int pthread_rwlockattr_init (pthread_rwlockattr_t* attr) {
     DEBUG("%s: start\n", __FUNCTION__);
     (void) attr;
     return 0;
 }

 // No-op: the requested stack size is ignored. Always returns 0.
 int pthread_attr_setstacksize (pthread_attr_t* attr, size_t stacksize) {
     DEBUG("%s: start\n", __FUNCTION__);
     (void) attr;
     (void) stacksize;
     return 0;
 }

 // No-op: the requested scheduling policy is ignored. Always returns 0.
 int pthread_attr_setschedpolicy (pthread_attr_t* attr, int policy) {
     DEBUG("%s: start\n", __FUNCTION__);
     (void) attr;
     (void) policy;
     return 0;
 }

 // some functions that we don't really support

 // Not really supported: the level is ignored. Always returns 0.
 int pthread_setconcurrency (int new_level) {
     DEBUG("%s: start\n", __FUNCTION__);
     (void) new_level;
     return 0;
 }

 // Not really supported (NPTL uses this): both arguments are ignored, and
 // nothing is written through p1. Always returns 0.
 int pthread_setcancelstate (int p0, int* p1)
 {
     DEBUG("%s: start\n", __FUNCTION__);
     (void) p0;
     (void) p1;
     return 0;
 }

 //and some affinity functions (used by libgomp, openmp)
 // Report an empty affinity set: zeroes the first size bytes of set.
 // thread is ignored. Always returns 0.
 int pthread_getaffinity_np(pthread_t thread, size_t size, cpu_set_t *set) {
     DEBUG("%s: start\n", __FUNCTION__);
     (void) thread;
     char *bytes = (char*)set;
     for (size_t i = 0; i < size; i++) {
         bytes[i] = 0;
     }
     return 0;
 }

 // No-op: the requested affinity is ignored. Always returns 0.
 int pthread_setaffinity_np(pthread_t thread, size_t size, cpu_set_t *set) {
     DEBUG("%s: start\n", __FUNCTION__);
     (void) thread;
     (void) size;
     (void) set;
     return 0;
 }

 // No-op: all arguments are ignored. Always returns 0.
 // NOTE(review): attr is taken by value here -- confirm against the prototype callers compile with.
 int pthread_attr_setaffinity_np(pthread_attr_t attr, size_t cpusetsize, const cpu_set_t *cpuset) {
     DEBUG("%s: start\n", __FUNCTION__);
     (void) attr;
     (void) cpusetsize;
     (void) cpuset;
     return 0;
 }

 // No-op: cpuset is not written. Always returns 0.
 // NOTE(review): attr is taken by value here -- confirm against the prototype callers compile with.
 int pthread_attr_getaffinity_np(pthread_attr_t attr, size_t cpusetsize, cpu_set_t *cpuset) {
     DEBUG("%s: start\n", __FUNCTION__);
     (void) attr;
     (void) cpusetsize;
     (void) cpuset;
     return 0;
 }


 // ... including any dealing with thread-level signal handling
 // (maybe we should throw an error message instead?)

 // No-op: the signal mask is neither changed nor reported through oset.
 // Always returns 0.
 int pthread_sigmask (int how, const sigset_t* set, sigset_t* oset) {
     DEBUG("%s: start\n", __FUNCTION__);
     (void) how;
     (void) set;
     (void) oset;
     return 0;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_kill (pthread_t thread, int sig)  {
     assert(0);
     return ENOSYS;
 }

 // unimplemented pthread functions

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_atfork (void (*f0)(void),
                     void (*f1)(void),
                     void (*f2)(void))
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_attr_destroy (pthread_attr_t* attr)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_attr_getdetachstate (const pthread_attr_t* attr,
                                  int* b)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_attr_getguardsize (const pthread_attr_t* restrict a,
                                size_t *restrict b)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_attr_getinheritsched (const pthread_attr_t *restrict a,
                                   int *restrict b)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_attr_getschedparam (const pthread_attr_t *restrict a,
                                 struct sched_param *restrict b)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_attr_getschedpolicy (const pthread_attr_t *restrict a,
                                  int *restrict b)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_attr_getscope (const pthread_attr_t *restrict a,
                            int *restrict b)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_attr_getstack (const pthread_attr_t *restrict a,
                            void* *restrict b,
                            size_t *restrict c)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_attr_getstackaddr (const pthread_attr_t *restrict a,
                                void* *restrict b)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_attr_getstacksize (const pthread_attr_t *restrict a,
                                size_t *restrict b)
 {
     assert(0);
     return ENOSYS;
 }

 // Accepts and ignores the requested detach state. Always returns 0.
 int pthread_attr_setdetachstate (pthread_attr_t* a,
                                  int b)
 {
     (void) a;
     (void) b;
     return 0; //FIXME
 }
 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_attr_setguardsize (pthread_attr_t* a,
                                size_t b)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_attr_setinheritsched (pthread_attr_t* a,
                                   int b)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_attr_setschedparam (pthread_attr_t *restrict a,
                                 const struct sched_param *restrict b)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_attr_setstack (pthread_attr_t* a,
                            void* b,
                            size_t c)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_attr_setstackaddr (pthread_attr_t* a,
                                void* b)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_cancel (pthread_t a)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: cleanup handlers are not supported, so any call
 // trips assert(0). With asserts compiled out it does nothing.
 void _pthread_cleanup_push (struct _pthread_cleanup_buffer *__buffer,
                             void (*__routine) (void *),
                             void *__arg)
 {
     assert(0);
 }

 // Unimplemented stub: cleanup handlers are not supported, so any call
 // trips assert(0). With asserts compiled out it does nothing.
 void _pthread_cleanup_pop (struct _pthread_cleanup_buffer *__buffer,
                            int __execute)
 {
     assert(0);
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_cond_timedwait (pthread_cond_t *restrict a,
                             pthread_mutex_t *restrict b,
                             const struct timespec *restrict c)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_condattr_destroy (pthread_condattr_t* a)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_condattr_getpshared (const pthread_condattr_t *restrict a,
                                  int *restrict b)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_condattr_init (pthread_condattr_t* a)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_condattr_setpshared (pthread_condattr_t* a,
                                  int b)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub (detached threads are not supported): asserts.
 // Returns ENOSYS if asserts are compiled out, rather than falling off the
 // end of a non-void function.
 int pthread_detach (pthread_t a)
 {
     assert(0);
     return ENOSYS;
 }


 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_getconcurrency ()
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_getschedparam(pthread_t a,
                           int *restrict b,
                           struct sched_param *restrict c)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_mutex_getprioceiling (const pthread_mutex_t *restrict a,
                                   int *restrict b)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_mutex_setprioceiling (pthread_mutex_t *restrict a,
                                   int b,
                                   int *restrict c)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_mutex_timedlock (pthread_mutex_t* a,
                              const struct timespec* b)
 {
     assert(0);
     return ENOSYS;
 }

 // No-op kept because libc calls it; a is ignored. Always returns 0.
 int pthread_mutexattr_destroy (pthread_mutexattr_t* a)
 {
     //used by libc, so this must not assert
     (void) a;
     return 0;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_mutexattr_getprioceiling (const pthread_mutexattr_t *restrict a,
                                       int *restrict b)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_mutexattr_getprotocol (const pthread_mutexattr_t *restrict a,
                                    int *restrict b)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_mutexattr_getpshared (const pthread_mutexattr_t *restrict a,
                                   int *restrict b)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_mutexattr_gettype (const pthread_mutexattr_t *restrict a,
                                int *restrict b)
 {
     assert(0);
     return ENOSYS;
 }

 // No-op kept because libc calls it; a is left untouched. Always returns 0.
 int pthread_mutexattr_init (pthread_mutexattr_t* a)
 {
     //used by libc, so this must not assert
     (void) a;
     return 0;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_mutexattr_setprioceiling (pthread_mutexattr_t* a,
                                       int b)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_mutexattr_setprotocol (pthread_mutexattr_t* a,
                                    int b)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_mutexattr_setpshared (pthread_mutexattr_t* a,
                                   int b)
 {
     assert(0);
     return ENOSYS;
 }

 // Accepts and ignores the requested mutex type. Always returns 0.
 int pthread_mutexattr_settype (pthread_mutexattr_t* a,
                                int b)
 {
     //Used by libc, so this must not assert.
     //libc asks for a recursive lock here; the request is deliberately
     //not rejected:
     //if (b == PTHREAD_MUTEX_RECURSIVE_NP) assert(0);
     (void) a;
     (void) b;
     return 0;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_rwlock_timedrdlock (pthread_rwlock_t *restrict a,
                                 const struct timespec *restrict b)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_rwlock_timedwrlock (pthread_rwlock_t *restrict a,
                                 const struct timespec *restrict b)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_rwlock_tryrdlock (pthread_rwlock_t* a)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_rwlock_trywrlock (pthread_rwlock_t* a)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_rwlockattr_destroy (pthread_rwlockattr_t* a)
 {
     assert(0);
     return ENOSYS;
 }

 // Unimplemented stub: asserts. Returns ENOSYS if asserts are compiled out,
 // rather than falling off the end of a non-void function.
 int pthread_rwlockattr_getpshared (const pthread_rwlockattr_t *restrict a,
                                    int *restrict b)
 {
     assert(0);
     return ENOSYS;
 }

 //Not implemented: setting the process-shared setting in a rwlock attribute
 //object.
 //  a: rwlock attribute object (ignored)
 //  b: requested process-shared setting (ignored)
 //Debug builds abort on the assert. If NDEBUG compiles the assert out, ENOSYS
 //is returned so the function never falls off its end without a value.
 int pthread_rwlockattr_setpshared(pthread_rwlockattr_t* a,
                                   int b)
 {
     assert(0);
     return ENOSYS;
 }

 //Not implemented: changing the calling thread's cancellation type.
 //  a: requested cancellation type (ignored)
 //  b: would receive the previous type (never written)
 //Debug builds abort on the assert. If NDEBUG compiles the assert out, ENOSYS
 //is returned so the function never falls off its end without a value.
 int pthread_setcanceltype (int a,
                            int* b)
 {
     assert(0);
     return ENOSYS;
 }

 //Not implemented: setting a thread's scheduling policy and parameters.
 //  a: target thread (ignored)
 //  b: requested policy (ignored)
 //  c: requested scheduling parameters (ignored)
 //Debug builds abort on the assert. If NDEBUG compiles the assert out, ENOSYS
 //is returned so the function never falls off its end without a value.
 int pthread_setschedparam (pthread_t a,
                            int b,
                            const struct sched_param* c)
 {
     assert(0);
     return ENOSYS;
 }

 //Not implemented: setting a thread's scheduling priority.
 //  a: target thread (ignored)
 //  b: requested priority (ignored)
 //Debug builds abort on the assert. If NDEBUG compiles the assert out, ENOSYS
 //is returned so the function never falls off its end without a value.
 int pthread_setschedprio (pthread_t a,
                           int b)
 {
     assert(0);
     return ENOSYS;
 }

 //Not implemented: explicit cancellation test.
 //Debug builds abort on the assert; if NDEBUG compiles the assert out, the
 //call returns without doing anything.
 //Declared with a (void) prototype instead of an old-style empty parameter list.
 void pthread_testcancel (void)
 {
     assert(0);
 }


 /* Stuff to properly glue with glibc */

 // glibc keys

 //For NPTL, or LinuxThreads with TLS defined and used
 //Each declaration below is a per-thread (__thread) void* slot; the names are
 //the glibc-internal key names.
 __thread void* __libc_tsd_MALLOC;
 __thread void* __libc_tsd_DL_ERROR;
 __thread void* __libc_tsd_RPC_VARS;
 //__thread void* __libc_tsd_LOCALE; seems to be defined in my libc already, but your glibc might not define it...
 //Defined in libgomp (OpenMP)
 //__thread void* __libc_tsd_CTYPE_B;
 //__thread void* __libc_tsd_CTYPE_TOLOWER;
 //__thread void* __libc_tsd_CTYPE_TOUPPER;

 //If glibc was not compiled with __thread, it uses
 //__pthread_internal_tsd_get/set/address for its internal keys.
 //These are from linuxthreads-0.7.1/specific.c

 //FIXME: When enabled, SPARC/M5 crashes: for some reason libc calls tsd_get on
 //an uninitialized key at initialization and uses its result. Are we supposed
 //to initialize these values?
 //libc can live without these, so it's not critical; the whole section is
 //compiled out by the #if 0 below.
 #if 0
 //Indices into p_libc_specific, one per libc-internal key;
 //_LIBC_TSD_KEY_N is the number of keys and sizes the array.
 enum __libc_tsd_key_t { _LIBC_TSD_KEY_MALLOC = 0,
                         _LIBC_TSD_KEY_DL_ERROR,
                         _LIBC_TSD_KEY_RPC_VARS,
                         _LIBC_TSD_KEY_LOCALE,
                         _LIBC_TSD_KEY_CTYPE_B,
                         _LIBC_TSD_KEY_CTYPE_TOLOWER,
                         _LIBC_TSD_KEY_CTYPE_TOUPPER,
                         _LIBC_TSD_KEY_N };
 __thread void* p_libc_specific[_LIBC_TSD_KEY_N]; /* thread-specific data for libc */

 //Stores pointer in the calling thread's slot for key. The key is used as an
 //array index without a range check. Always returns 0.
 int
 __pthread_internal_tsd_set (int key, const void * pointer)
 {
   p_libc_specific[key] = (void*) pointer;
   return 0;
 }

 //Returns the value held in the calling thread's slot for key
 //(no range check on key).
 void *
 __pthread_internal_tsd_get (int key)
 {
   return  p_libc_specific[key];
 }

 //Returns the address of the calling thread's slot for key
 //(no range check on key).
 void ** __attribute__ ((__const__))
 __pthread_internal_tsd_address (int key)
 {
   return &p_libc_specific[key];
 }
 #endif //0


 //Aliases for glibc
 //Each declaration makes a double-underscore __pthread_* name a weak alias of
 //the public pthread_* function named in its alias("...") attribute.

 //Mutex operations
 int __pthread_mutex_init (pthread_mutex_t* mutex, const pthread_mutexattr_t* attr)  __attribute__ ((weak, alias ("pthread_mutex_init")));
 int __pthread_mutex_lock (pthread_mutex_t* lock) __attribute__ ((weak, alias ("pthread_mutex_lock")));
 int __pthread_mutex_trylock (pthread_mutex_t* lock) __attribute__ ((weak, alias ("pthread_mutex_trylock")));
 int __pthread_mutex_unlock (pthread_mutex_t* lock) __attribute__ ((weak, alias ("pthread_mutex_unlock")));

 //Mutex attribute operations
 int __pthread_mutexattr_destroy (pthread_mutexattr_t* a) __attribute__ ((weak, alias ("pthread_mutexattr_destroy")));
 int __pthread_mutexattr_init (pthread_mutexattr_t* a) __attribute__ ((weak, alias ("pthread_mutexattr_init")));
 int __pthread_mutexattr_settype (pthread_mutexattr_t* a, int b) __attribute__ ((weak, alias ("pthread_mutexattr_settype")));

 //Read-write lock operations
 int __pthread_rwlock_init (pthread_rwlock_t* lock, const pthread_rwlockattr_t* attr) __attribute__ ((weak, alias ("pthread_rwlock_init")));
 int __pthread_rwlock_rdlock (pthread_rwlock_t* lock) __attribute__ ((weak, alias ("pthread_rwlock_rdlock")));
 int __pthread_rwlock_wrlock (pthread_rwlock_t* lock) __attribute__ ((weak, alias ("pthread_rwlock_wrlock")));
 int __pthread_rwlock_unlock (pthread_rwlock_t* lock) __attribute__ ((weak, alias ("pthread_rwlock_unlock")));
 int __pthread_rwlock_destroy (pthread_rwlock_t* lock) __attribute__ ((weak, alias ("pthread_rwlock_destroy")));
 //The thread-specific-data key aliases below are currently commented out.
 /*
 int   __pthread_key_create(pthread_key_t *, void (*)(void *)) __attribute__ ((weak, alias ("pthread_key_create")));
 int   __pthread_key_delete(pthread_key_t) __attribute__ ((weak, alias ("pthread_key_delete")));
 void* __pthread_getspecific(pthread_key_t) __attribute__ ((weak, alias ("pthread_getspecific")));
 int   __pthread_setspecific(pthread_key_t, const void *) __attribute__ ((weak, alias ("pthread_setspecific")));
 */
 //One-time initialization
 int __pthread_once (pthread_once_t* once, void (*init)(void))  __attribute__ ((weak, alias ("pthread_once")));


 //NPTL-specific hook, provided here as an empty function: it has no effect.
 //TODO: check whether doing nothing here may cause leaks.
 //Declared with a (void) prototype instead of an old-style empty parameter list.
 void __nptl_deallocate_tsd(void) {}