Merge Sandia changes with upstream
diff --git a/pthread.c b/pthread.c
index efd4b44..d9316da 100644
--- a/pthread.c
+++ b/pthread.c
@@ -39,6 +39,8 @@
   #include "spinlock_alpha.h"
 #elif defined(__sparc)
   #include "spinlock_sparc.h"
+#elif defined (__arm__)
+  #include "spinlock_arm.h"
 #else
   #error "spinlock routines not available for your arch!\n"
 #endif
@@ -61,9 +63,9 @@
 #endif
 
 //Size and alignment requirements of "real" (NPTL/LinuxThreads) thread control block
-#define TCB_SIZE 512
-#define TCB_ALIGN sizeof(double)
-//TODO: Figure out real (NPTL/LinuxThreads) TCB space. 512 bytes should be enough.
+#define NPTL_TCB_SIZE 1184 // sizeof (struct pthread)
+#define NPTL_TCB_ALIGN sizeof(double)
+#define NPTL_TCBHEAD_T_SIZE (sizeof(tcbhead_t))
 
 //Thread control structure
 typedef struct {
@@ -133,26 +135,52 @@
   thread_block_info.stack_guard_size = 2048;
 
   //Total thread block size -- this is what we'll request to mmap
-  size_t sz = sizeof(pthread_tcb_t) + thread_block_info.tls_memsz + TCB_SIZE + thread_block_info.stack_guard_size + CHILD_STACK_SIZE;
+  #if TLS_TCB_AT_TP
+  size_t sz = sizeof(pthread_tcb_t) + thread_block_info.tls_memsz + NPTL_TCBHEAD_T_SIZE + thread_block_info.stack_guard_size + CHILD_STACK_SIZE;
+  #elif TLS_DTV_AT_TP
+  size_t sz = sizeof(pthread_tcb_t) + thread_block_info.tls_memsz + NPTL_TCB_SIZE + NPTL_TCBHEAD_T_SIZE + thread_block_info.stack_guard_size + CHILD_STACK_SIZE;
+  #else
+  #error "TLS_TCB_AT_TP xor TLS_DTV_AT_TP must be defined"
+  #endif
   //Note that TCB_SIZE is the "real" TCB size, not ours, which we leave zeroed (but some variables, notably errno, are somewhere inside there)
 
   //Align to multiple of CHILD_STACK_SIZE
   sz += CHILD_STACK_SIZE - 1;  
   thread_block_info.total_size = (sz>>CHILD_STACK_BITS)<<CHILD_STACK_BITS;
-
 }
 
-
 //Set up TLS block in current thread
+// @param th_block_addr:  start of the thread's TLS area, i.e. just past m5's pthread_tcb_t
 static void setup_thread_tls(void* th_block_addr) {
+  size_t tcb_offset = 0;
+  void *tlsblock = NULL;
+  char *tls_start_ptr = NULL;
+
+  #if TLS_DTV_AT_TP
+  th_block_addr += NPTL_TCB_SIZE;
+  #endif
+
   /* Compute the (real) TCB offset */
-  size_t tcb_offset = roundup(thread_block_info.tls_memsz, TCB_ALIGN);
+  #if TLS_DTV_AT_TP
+  tcb_offset = roundup(NPTL_TCBHEAD_T_SIZE, NPTL_TCB_ALIGN);
+  #elif TLS_TCB_AT_TP
+  tcb_offset = roundup(thread_block_info.tls_memsz, NPTL_TCB_ALIGN);
+  #else
+  #error "TLS_TCB_AT_TP xor TLS_DTV_AT_TP must be defined"
+  #endif
+
   /* Align the TLS block.  */
-  void* tlsblock = (void *) (((uintptr_t) th_block_addr + thread_block_info.tls_align - 1)
+  tlsblock = (void *) (((uintptr_t) th_block_addr + thread_block_info.tls_align - 1)
                        & ~(thread_block_info.tls_align - 1));
   /* Initialize the TLS block.  */
-  char* tls_start_ptr = ((char *) tlsblock + tcb_offset
-                           - roundup (thread_block_info.tls_memsz, thread_block_info.tls_align ?: 1));
+  #if TLS_DTV_AT_TP
+  tls_start_ptr = ((char *) tlsblock + tcb_offset);
+  #elif TLS_TCB_AT_TP
+  tls_start_ptr = ((char *) tlsblock + tcb_offset
+                       - roundup (thread_block_info.tls_memsz, thread_block_info.tls_align ?: 1));
+  #else
+  #error "TLS_TCB_AT_TP xor TLS_DTV_AT_TP must be defined"
+  #endif
 
   //DEBUG("Init TLS: Copying %d bytes from 0x%llx to 0x%llx\n", filesz, (uint64_t) initimage, (uint64_t) tls_start_ptr);
   memcpy (tls_start_ptr, thread_block_info.tls_initimage, thread_block_info.tls_filesz);
@@ -161,7 +189,13 @@
 
   //Note: We don't care about DTV pointers for x86/SPARC -- they're never used in static mode
   /* Initialize the thread pointer.  */
+  #if TLS_DTV_AT_TP
+  TLS_INIT_TP (tlsblock, 0);
+  #elif TLS_TCB_AT_TP
   TLS_INIT_TP ((char *) tlsblock + tcb_offset, 0);
+  #else
+  #error "TLS_TCB_AT_TP xor TLS_DTV_AT_TP must be defined"
+  #endif
 }
 
 //Some NPTL definitions
@@ -174,7 +208,7 @@
   __libc_multiple_threads = 1; //tell libc we're multithreaded (NPTL-specific)
   populate_thread_block_info();
   void* ptr = mmap(0, thread_block_info.total_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
-  setup_thread_tls(ptr);
+  setup_thread_tls(ptr + sizeof(pthread_tcb_t));
 }
 
 
@@ -211,7 +245,7 @@
   tcb->child_finished = 0;
   tcb->start_routine = start_routine;
   tcb->arg = arg;
-  tcb->tls_start_addr = (void*)(((char*)thread_block) + sizeof(pthread_tcb_t)); //right after tcb
+  tcb->tls_start_addr = (void*)(((char*)thread_block) + sizeof(pthread_tcb_t)); //right after m5's tcb
   tcb->stack_start_addr = (void*) (((char*) thread_block) + thread_block_size - thread_block_info.stack_guard_size); //end of thread_block
   
   *thread=(pthread_t) thread_block;
diff --git a/spinlock_arm.h b/spinlock_arm.h
new file mode 100644
index 0000000..6f6803e
--- /dev/null
+++ b/spinlock_arm.h
@@ -0,0 +1,70 @@
+/*
+    m5threads, a pthread library for the M5 simulator
+    Copyright (C) 2009, Stanford University
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
+*/
+
+
+#ifndef __SPINLOCK_ARM_H__
+#define __SPINLOCK_ARM_H__
+/* Acquire: spin until *lock is atomically changed from 0 to 1. */
+static __inline__ void spin_lock (volatile int* lock) {
+    unsigned long tmp;  /* loaded lock value, then the STREX status */
+    /* "memory" clobber: keep the compiler from moving accesses across the acquire. */
+    __asm__ __volatile__(
+"1:     ldrex   %0, [%1]\n"          /* tmp = *lock (exclusive load) */
+"       cmp     %0, #0\n"            /* is the lock free? */
+"       strexeq %0, %2, [%1]\n"      /* if free: try *lock = 1, tmp = store status */
+"       cmpeq   %0, #0\n"            /* if free: did the exclusive store succeed? */
+"       bne     1b\n"                /* held, or store failed: retry */
+"       dmb\n"                       /* barrier before entering the critical section */
+        : "=&r" (tmp)
+        : "r" (lock), "r" (1)
+        : "cc", "memory");
+
+}
+/* Release: issue a barrier (dmb), then store 0 to *lock. */
+static __inline__ void spin_unlock (volatile int* lock) {
+    /* "memory" clobber keeps the compiler from sinking critical-section
+       accesses below the releasing store (same clobbers as trylock). */
+    __asm__ __volatile__(
+"       dmb\n"                      /* order earlier accesses before the store */
+"       str     %1, [%0]\n"         /* *lock = 0 */
+        :
+        : "r" (lock), "r" (0)
+        : "cc", "memory");
+}
+
+/* Single lock attempt, no spinning. Returns 1 if *lock went 0 -> 1, else 0. */
+static __inline__ int trylock (volatile int* lock) {
+        unsigned long tmp;  /* loaded value / STREX status, inverted before return */
+        /* NOTE(review): the result assumes *lock only ever holds 0 or 1 -- confirm. */
+        __asm__ __volatile__(
+"       ldrex   %0, [%1]\n"          /* tmp = *lock (exclusive load) */
+"       cmp     %0, #0\n"            /* flags: Z set if the lock was free */
+"       strexeq %0, %2, [%1]\n"      /* if free: try *lock = 1, tmp = 0 on success */
+"       eor     %0, %0, #1\n"        /* invert tmp; the cmp flags are left as they were */
+"       bne     2f\n"                /* lock was held: skip the barrier */
+"       dmb\n"
+"2:     nop\n"                       /* numeric local label: safe when inlined repeatedly */
+        : "=&r" (tmp)
+        : "r" (lock), "r" (1)
+        : "cc", "memory");
+
+        return tmp;
+}
+
+#endif  // __SPINLOCK_ARM_H__
diff --git a/tests/Makefile b/tests/Makefile
index 59224b7..4fc059b 100755
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -4,20 +4,27 @@
 
 # 64-bit compiles
 #Uncomment to use sparc/alpha cross-compilers
-CC := sparc64-unknown-linux-gnu-gcc
-CPP := sparc64-unknown-linux-gnu-g++
+#CC := sparc64-unknown-linux-gnu-gcc
+#CPP := sparc64-unknown-linux-gnu-g++
 #CC := alpha-unknown-linux-gnu-gcc
 #CPP := alpha-unknown-linux-gnu-g++
-
+CC := arm-linux-gnueabi-gcc
+CPP := arm-linux-gnueabi-g++
 #CC := gcc
 #CPP := g++
 
+# Needed for support of v7 assembly instructions on ARM architecture
+ARM_FLAGS := -march=armv7-a -marm
+
 #CFLAGS := -ggdb3 -O3 -D__DEBUG
-CFLAGS := -g -O3 -DM5_PROFILING
+#CFLAGS := -g -O3 -DM5_PROFILING
+CFLAGS := -g -O3 $(ARM_FLAGS)
 
 CPPFLAGS := $(CFLAGS)
 
+# ARM support for OpenMP not tested (test_omp.o)
 TEST_OBJS := test_stackgrow.o test_pthreadbasic.o test_pthread.o test_atomic.o test_barrier.o test_lock.o test_malloc.o test_sieve.o  test___thread.o test_omp.o
+#TEST_OBJS := test_stackgrow.o test_pthreadbasic.o test_pthread.o test_atomic.o test_barrier.o test_lock.o test_malloc.o test_sieve.o  test___thread.o
 
 TEST_PROGS := $(TEST_OBJS:.o=)
 
diff --git a/tests/test___thread.cpp b/tests/test___thread.cpp
index d597155..1a4617f 100644
--- a/tests/test___thread.cpp
+++ b/tests/test___thread.cpp
@@ -37,7 +37,7 @@
 {
     long long int id = (long long int)arg;
     int i;
-    printf("&local[%d]=%p\n", id, &local);
+    printf("&local[%lld]=%p\n", id, &local);
     local += id;
     for (i = 0; i < count; i++) {
         local++;
@@ -79,13 +79,13 @@
     }
 
     long long int local = (long long int)run((void*)0);
-    printf("local[0] = %d\n", local);
+    printf("local[0] = %lld\n", local);
 
     for (i = 1 ; i < thread_count; i++) {
         int joinResult = pthread_join(threads[i], 
                                       (void**)&local);
         assert(joinResult == 0);
-        printf("local[%d] = %d\n", i, local);
+        printf("local[%d] = %lld\n", i, local);
     }
     
     /*struct timeval endTime;
diff --git a/tests/test_atomic.cpp b/tests/test_atomic.cpp
index 5ead4cb..0deaa3a 100644
--- a/tests/test_atomic.cpp
+++ b/tests/test_atomic.cpp
@@ -48,7 +48,7 @@
     pthread_mutex_lock(&lock);
 
     int current = next;
-    printf("[Iteration %d, Thread %d] Got lock\n", iteration, id);
+    printf("[Iteration %d, Thread %lld] Got lock\n", iteration, id);
     intArray[current]++;
 
     //Uncomment this snip for longer-running critical section
@@ -60,7 +60,7 @@
 
     next = id;
 
-    printf("[Iteration %d, Thread %d] Critical section done, previously next=%d, now next=%d\n", iteration, id, current, next);
+    printf("[Iteration %d, Thread %lld] Critical section done, previously next=%d, now next=%d\n", iteration, id, current, next);
     pthread_mutex_unlock(&lock);
 
     pthread_barrier_wait(&barrier);
diff --git a/tests/test_barrier.cpp b/tests/test_barrier.cpp
index 68683e5..0e7819b 100644
--- a/tests/test_barrier.cpp
+++ b/tests/test_barrier.cpp
@@ -29,9 +29,9 @@
 void* run (void* arg) {
     long long int my_id = (long long int) arg;
     //A[my_id][0]++;
-    printf("%i BEFORE\n", my_id);
+    printf("%lli BEFORE\n", my_id);
     pthread_barrier_wait(&barrier);
-    printf("%i AFTER\n", my_id);
+    printf("%lli AFTER\n", my_id);
     //A[my_id][0]++;
     return NULL;
 }
diff --git a/tests/test_lock.cpp b/tests/test_lock.cpp
index f8670f6..da39a67 100644
--- a/tests/test_lock.cpp
+++ b/tests/test_lock.cpp
@@ -27,7 +27,7 @@
 void* run1(void* arglist)
 {
     pthread_t id = pthread_self();
-    printf("[run1] TID=%d\n", id);
+    printf("[run1] TID=%d\n", (int)id);
 
     printf("[run1] started\n");
 
diff --git a/tests/test_malloc.cpp b/tests/test_malloc.cpp
index 5d6ca88..3c6c591 100644
--- a/tests/test_malloc.cpp
+++ b/tests/test_malloc.cpp
@@ -50,12 +50,12 @@
     int bytes = iteration*(id +1);
     void* ptr = malloc(bytes);
     ptr_matrix[iteration][id] = ptr;
-    printf("[ALLOC %d, Thread %d] Allocated %d bytes, from %x to %x\n", iteration, id, bytes, (uint32)ptr, (uint32)(((char*)ptr) + bytes - 1));
+    printf("[ALLOC %d, Thread %lld] Allocated %d bytes, from %p to %p\n", iteration, id, bytes, ptr, ((char*)ptr) + bytes - 1);
 
     pthread_barrier_wait(&barrier);
     int target = (id + iteration) % nthreads;
     free(ptr_matrix[iteration][target]);
-    printf("[ALLOC %d, Thread %d] Freed %d's allocation, %x\n", iteration, id, target, (uint32)ptr_matrix[iteration][target]);
+    printf("[ALLOC %d, Thread %lld] Freed %d's allocation, %p\n", iteration, id, target, ptr_matrix[iteration][target]);
     //free(ptr_matrix[iteration][target]);
     return NULL;
 }
diff --git a/tests/test_pthreadbasic.cpp b/tests/test_pthreadbasic.cpp
index 85af155..001123c 100644
--- a/tests/test_pthreadbasic.cpp
+++ b/tests/test_pthreadbasic.cpp
@@ -35,7 +35,7 @@
     pthread_t pth;
     pthread_attr_t attr;
 
-    printf("Main thread initialized. TID=%d\n", pthread_self());
+    printf("Main thread initialized. TID=%d\n", (int)pthread_self());
     int result = pthread_attr_init(&attr);
     assert(result == 0);
     printf("Main thread called pthread_attr_init\n");
@@ -52,11 +52,11 @@
     printf("Main thread creating 2nd thread...\n");
     result = pthread_create(&pth2, &attr, run, NULL);
 
-    printf("Main thread calling join w/ 1st thread (id=%llx)... (self=%llx)\n", pth, pthread_self());
+    printf("Main thread calling join w/ 1st thread (id=%lx)... (self=%lx)\n", pth, pthread_self());
     pthread_join(pth, NULL);
-    printf("Main thread calling join w/ 2nd thread (id=%llx)... (self=%llx)\n", pth2, pthread_self());
+    printf("Main thread calling join w/ 2nd thread (id=%lx)... (self=%lx)\n", pth2, pthread_self());
     pthread_join(pth2, NULL);
-    printf("Main thread has self=%d\n", pthread_self());
+    printf("Main thread has self=%d\n", (int)pthread_self());
 
     printf("Main thread done.\n");
 }
diff --git a/tests/test_stackgrow.cpp b/tests/test_stackgrow.cpp
index 5386a32..5f49620 100644
--- a/tests/test_stackgrow.cpp
+++ b/tests/test_stackgrow.cpp
@@ -31,7 +31,7 @@
 
 void func (int* f1) {
   int f2;
-  printf("Addr frame 1 = %llx, Addr frame 2 = %llx\n", f1, &f2);
+  printf("Addr frame 1 = %p, Addr frame 2 = %p\n", f1, &f2);
   if (&f2 > f1) {
     printf("Stack grows up (and this threading library needs to be fixed for your arch...)\n");
   } else {
diff --git a/tls_defs.h b/tls_defs.h
index 0b51a06..154275b 100644
--- a/tls_defs.h
+++ b/tls_defs.h
@@ -28,8 +28,19 @@
 //These are mostly taken verbatim from glibc 2.3.6
 
 //32 for ELF32 binaries, 64 for ELF64
-//TODO: Macro it
+#if defined(__LP64__)
 #define __ELF_NATIVE_CLASS 64
+#else
+#define __ELF_NATIVE_CLASS 32
+#endif
+
+//Seems like all non-ARM M5 targets use TLS_TCB_AT_TP (defined in
+//  platform-specific 'tls.h')
+#if defined(__arm__)
+#define TLS_DTV_AT_TP 1
+#else
+#define TLS_TCB_AT_TP 1
+#endif
 
 /* Standard ELF types.  */
 
@@ -164,6 +175,47 @@
 # define TLS_INIT_TP(descr, secondcall) \
   (__thread_self = (__typeof (__thread_self)) (descr), NULL)
 
+#elif defined (__arm__)
+/* ARM TCB head: two pointer slots. Presumably mirrors glibc's ARM tcbhead_t -- TODO confirm. */
+typedef struct
+{
+  void *dtv;
+  void *private;
+} tcbhead_t;
+/* Raw "swi #0" syscall returning a1 as int. r7 is hard-wired to 0xf0005 (name only appears in the asm comment), so r7 is listed as clobbered. */
+#define INTERNAL_SYSCALL_RAW(name, err, nr, args...)        \
+  ({ unsigned int _sys_result;                  \
+     {                              \
+       register int _a1 asm ("a1");             \
+       LOAD_ARGS_##nr (args)                    \
+           asm volatile ("mov r7, #0xf0000\n"    \
+                     "add r7, r7, #0x0005\n"  \
+         "swi   #0  @ syscall " #name       \
+             : "=r" (_a1)               \
+             : "i" (name) ASM_ARGS_##nr         \
+             : "r7", "memory");         \
+       _sys_result = _a1;                   \
+     }                              \
+     (int) _sys_result; })
+/* Prefixes name with __ARM_NR_ and forwards everything to INTERNAL_SYSCALL_RAW. */
+#undef INTERNAL_SYSCALL_ARM
+#define INTERNAL_SYSCALL_ARM(name, err, nr, args...)        \
+    INTERNAL_SYSCALL_RAW(__ARM_NR_##name, err, nr, args)
+/* Argument helpers selected by INTERNAL_SYSCALL_RAW through LOAD_ARGS_##nr / ASM_ARGS_##nr. */
+#define LOAD_ARGS_0()
+/* Zero-argument call: no extra asm input operands. */
+#define ASM_ARGS_0
+/* Evaluate a1 into a temporary first, then assign it to _a1 (declared by the calling macro). */
+#define LOAD_ARGS_1(a1)             \
+  int _a1tmp = (int) (a1);          \
+  LOAD_ARGS_0 ()                \
+  _a1 = _a1tmp;
+/* One-argument call: append _a1 as a register input operand. */
+#define ASM_ARGS_1  ASM_ARGS_0, "r" (_a1)
+/* ARM: pass descr to the set_tls syscall via INTERNAL_SYSCALL_ARM; secondcall is not used. */
+# define TLS_INIT_TP(descr, secondcall) \
+    INTERNAL_SYSCALL_ARM(set_tls, 0, 1, (descr))
+
 #else
   #error "No TLS defs for your architecture"
 #endif