Merge Sandia changes with upstream
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..3eef998
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,9 @@
+CC = gcc
+CFLAGS = -O3 -static
+
+.PHONY: all clean
+
+all:  libpthread.a(pthread.o)
+
+clean:
+	$(RM) *.o libpthread.a
diff --git a/pthread.c b/pthread.c
index 2d42e07..d9316da 100644
--- a/pthread.c
+++ b/pthread.c
@@ -334,11 +334,13 @@
 // mutex functions
 
 int pthread_mutex_init (pthread_mutex_t* mutex, const pthread_mutexattr_t* attr) {
+  DEBUG("%s: start\n", __FUNCTION__);
     mutex->PTHREAD_MUTEX_T_COUNT = 0;
     return 0;
 }
 
 int pthread_mutex_lock (pthread_mutex_t* lock) {
+  DEBUG("%s: start\n", __FUNCTION__);
     PROFILE_LOCK_START(lock); 
     spin_lock((int*)&lock->PTHREAD_MUTEX_T_COUNT);
     PROFILE_LOCK_END(lock);
@@ -346,6 +348,7 @@
 }
 
 int pthread_mutex_unlock (pthread_mutex_t* lock) {
+  DEBUG("%s: start\n", __FUNCTION__);
     PROFILE_UNLOCK_START(lock);
     spin_unlock((int*)&lock->PTHREAD_MUTEX_T_COUNT);
     PROFILE_UNLOCK_END(lock);
@@ -353,10 +356,12 @@
 }
 
 int pthread_mutex_destroy (pthread_mutex_t* mutex) {
+  DEBUG("%s: start\n", __FUNCTION__);
     return 0;
 }
 
 int pthread_mutex_trylock (pthread_mutex_t* mutex) {
+  DEBUG("%s: start\n", __FUNCTION__);
     int acquired = trylock((int*)&mutex->PTHREAD_MUTEX_T_COUNT);
     if (acquired == 1) {
 	//Profiling not really accurate here...
@@ -370,6 +375,7 @@
 // rwlock functions
 
 int pthread_rwlock_init (pthread_rwlock_t* lock, const pthread_rwlockattr_t* attr) {
+  DEBUG("%s: start\n", __FUNCTION__);
     PTHREAD_RWLOCK_T_LOCK(lock) = 0; // used only with spin_lock, so we know to initilize to zero
     PTHREAD_RWLOCK_T_READERS(lock) = 0;
     PTHREAD_RWLOCK_T_WRITER(lock) = -1; // -1 means no one owns the write lock
@@ -378,10 +384,12 @@
 }
 
 int pthread_rwlock_destroy (pthread_rwlock_t* lock) {
+  DEBUG("%s: start\n", __FUNCTION__);
     return 0;
 }
 
 int pthread_rwlock_rdlock (pthread_rwlock_t* lock) {
+  DEBUG("%s: start\n", __FUNCTION__);
     PROFILE_LOCK_START(lock);
     do {
         // this is to reduce the contention and a possible live-lock to lock->access_lock
@@ -406,6 +414,7 @@
 }
 
 int pthread_rwlock_wrlock (pthread_rwlock_t* lock) {
+  DEBUG("%s: start\n", __FUNCTION__);
     PROFILE_LOCK_START(lock);
     do {
         while (1) {
@@ -433,6 +442,7 @@
 }
 
 int pthread_rwlock_unlock (pthread_rwlock_t* lock) {
+  DEBUG("%s: start\n", __FUNCTION__);
     PROFILE_UNLOCK_START(lock);
     spin_lock((int*)&(PTHREAD_RWLOCK_T_LOCK(lock)));
     if (pthread_self() == PTHREAD_RWLOCK_T_WRITER(lock)) {
@@ -463,6 +473,7 @@
 
 int pthread_key_create (pthread_key_t* key, void (*destructor)(void*)) {
   int i;
+  DEBUG("%s: start\n", __FUNCTION__);
 
   pthread_mutex_lock(&pthread_keys_mutex);
   for (i = 0; i < PTHREAD_KEYS_MAX; i++) {
@@ -481,6 +492,7 @@
 
 int pthread_key_delete (pthread_key_t key)
 {
+  DEBUG("%s: start\n", __FUNCTION__);
   pthread_mutex_lock(&pthread_keys_mutex);
   if (key >= PTHREAD_KEYS_MAX || !pthread_keys[key].in_use) {
     pthread_mutex_unlock(&pthread_keys_mutex);
@@ -499,17 +511,12 @@
 
 int pthread_setspecific (pthread_key_t key, const void* value) {
   int m_size;
+  DEBUG("%s: start\n", __FUNCTION__);
   if (key < 0 || key >= PTHREAD_KEYS_MAX) return EINVAL; 
-  if (key >= pthread_specifics_size) {
-    m_size = (key+1)*sizeof(void*);
-    if (pthread_specifics_size == 0) {
-       pthread_specifics = (void**) malloc(m_size);
-       DEBUG("pthread_setspecific: malloc of size %d bytes, got 0x%llx\n", m_size, pthread_specifics);
-    } else {
-       pthread_specifics = (void**) realloc(pthread_specifics, m_size);
-       DEBUG("pthread_setspecific: realloc of size %d bytes, got 0x%llx\n", m_size, pthread_specifics);
-    }
-    pthread_specifics_size = key+1;
+  if (pthread_specifics_size == 0) {
+     pthread_specifics = (void**) calloc(PTHREAD_KEYS_MAX + 1, sizeof(void*));
+     if (pthread_specifics == NULL) return ENOMEM; // allocation failed: leave size at 0 so a later call retries
+     pthread_specifics_size = PTHREAD_KEYS_MAX; // table is allocated once for every valid key, so record the full key range
   }
   pthread_specifics[key] = (void*) value;
   return 0;
@@ -524,6 +531,7 @@
 // condition variable functions
 
 int pthread_cond_init (pthread_cond_t* cond, const pthread_condattr_t* attr) {
+  DEBUG("%s: start\n", __FUNCTION__);
     PTHREAD_COND_T_FLAG(cond) = 0;
     PTHREAD_COND_T_THREAD_COUNT(cond) = 0;
     PTHREAD_COND_T_COUNT_LOCK(cond) = 0;
@@ -531,15 +539,18 @@
 }
 
 int pthread_cond_destroy (pthread_cond_t* cond) {
+  DEBUG("%s: start\n", __FUNCTION__);
     return 0;
 }
 
 int pthread_cond_broadcast (pthread_cond_t* cond) {
+  DEBUG("%s: start\n", __FUNCTION__);
     PTHREAD_COND_T_FLAG(cond) = 1;
     return 0;
 }
 
 int pthread_cond_wait (pthread_cond_t* cond, pthread_mutex_t* lock) {
+  DEBUG("%s: start\n", __FUNCTION__);
     PROFILE_COND_WAIT_START(cond);
     volatile int* thread_count  = &(PTHREAD_COND_T_THREAD_COUNT(cond));
     volatile int* flag = &(PTHREAD_COND_T_FLAG(cond));
@@ -571,6 +582,7 @@
 }
 
 int pthread_cond_signal (pthread_cond_t* cond) {
+  DEBUG("%s: start\n", __FUNCTION__);
     //Could also signal only one thread, but this is compliant too
     //TODO: Just wake one thread up
     return pthread_cond_broadcast(cond);
@@ -638,6 +650,7 @@
                           const pthread_barrierattr_t *restrict attr, unsigned count)
 {
     assert(barrier != NULL);
+  DEBUG("%s: start\n", __FUNCTION__);
 
     PTHREAD_BARRIER_T_NUM_THREADS(barrier) =  count;
     PTHREAD_BARRIER_T_SPINLOCK(barrier) = 0;
@@ -649,12 +662,14 @@
 
 int pthread_barrier_destroy (pthread_barrier_t *barrier)
 {
+  DEBUG("%s: start\n", __FUNCTION__);
     //Nothing to do
     return 0;
 }
 
 int pthread_barrier_wait (pthread_barrier_t* barrier)
 {
+  DEBUG("%s: start\n", __FUNCTION__);
     PROFILE_BARRIER_WAIT_START(barrier);
     int const initial_direction = PTHREAD_BARRIER_T_DIRECTION(barrier); //0 == up, 1 == down
 
@@ -691,6 +706,7 @@
 int pthread_once (pthread_once_t* once,
                   void (*init)(void))
 {
+  DEBUG("%s: start\n", __FUNCTION__);
   //fast path
   if (*once != PTHREAD_ONCE_INIT) return 0;
   pthread_mutex_lock(&__once_mutex);
@@ -718,56 +734,70 @@
 // functions really don't need to do anything
 
 int pthread_yield() {
+  DEBUG("%s: start\n", __FUNCTION__);
     // nothing else to yield to
     return 0;
 }
 
 int pthread_attr_init (pthread_attr_t* attr) {
+  DEBUG("%s: start\n", __FUNCTION__);
     return 0;
 }
 
 int pthread_attr_setscope (pthread_attr_t* attr, int scope) {
+  DEBUG("%s: start\n", __FUNCTION__);
     return 0;
 }
 
 int pthread_rwlockattr_init (pthread_rwlockattr_t* attr) {
+  DEBUG("%s: start\n", __FUNCTION__);
     return 0;
 }
 
 int pthread_attr_setstacksize (pthread_attr_t* attr, size_t stacksize) {
+  DEBUG("%s: start\n", __FUNCTION__);
     return 0;
 }
 
 int pthread_attr_setschedpolicy (pthread_attr_t* attr, int policy) {
+  DEBUG("%s: start\n", __FUNCTION__);
     return 0;
 }
 
 // some functions that we don't really support
 
 int pthread_setconcurrency (int new_level) {
+  DEBUG("%s: start\n", __FUNCTION__);
     return 0;
 }
 
 int pthread_setcancelstate (int p0, int* p1)
 {
+  DEBUG("%s: start\n", __FUNCTION__);
     //NPTL uses this
     return 0;
 }
 
 //and some affinity functions (used by libgomp, openmp)
 int pthread_getaffinity_np(pthread_t thread, size_t size, cpu_set_t *set) {
+  DEBUG("%s: start\n", __FUNCTION__);
+    char *p = (char*)set;
+    while ( size-- ) *p++ = 0;
   return 0;
 }
 
 int pthread_setaffinity_np(pthread_t thread, size_t size, cpu_set_t *set) {
+  DEBUG("%s: start\n", __FUNCTION__);
   return 0;
 }
 
 int pthread_attr_setaffinity_np(pthread_attr_t attr, size_t cpusetsize, const cpu_set_t *cpuset) {
+  DEBUG("%s: start\n", __FUNCTION__);
   return 0;
 }
 
 int pthread_attr_getaffinity_np(pthread_attr_t attr, size_t cpusetsize, cpu_set_t *cpuset) {
+  DEBUG("%s: start\n", __FUNCTION__);
   return 0;
 }
 
@@ -776,6 +806,7 @@
 // (maybe we should throw an error message instead?)
 
 int pthread_sigmask (int how, const sigset_t* set, sigset_t* oset) {
+  DEBUG("%s: start\n", __FUNCTION__);
     return 0;
 }
 
diff --git a/spinlock_x86.h b/spinlock_x86.h
index 0da99d8..b816269 100644
--- a/spinlock_x86.h
+++ b/spinlock_x86.h
@@ -29,10 +29,10 @@
         (
          "\n1:\t" \
          "cmpb $0,%1\n\t" \
-         "jne 1b\n\t" \
+         "ja 1b\n\t" \
          "xchgb %b0, %1\n\t" \
          "cmpb $0,%0\n" \
-         "jne 1b\n\t"
+         "ja 1b\n\t"
          :"=q"(oldval), "=m"(*lock)
          : "0"(1)
          : "memory");