Profiling hooks

- Added profiling hooks to measure synchronization usage and overhead in M5
- Updated README file
diff --git a/README b/README
index 4600ee7..cd4f485 100644
--- a/README
+++ b/README
@@ -5,6 +5,13 @@
 
 Changelog
 ---------
+
+14-Feb-09
+- Added support for OpenMP in SPARC.
+- Fixed stack guard to work in SPARC64 (stack bias was insufficient).
+- Added optional profiling hooks to measure synchronization use. Compile with -DM5_PROFILING to use M5 profiling syscalls (currently implemented for SPARC only).
+- The Makefile now builds test programs linked with both m5threads (test_XXX) and the standard pthread library (test_XXX_p). This is done for debugging purposes, but note that **the _p binaries won't work in M5**.
+
 27-Jan-09
 - Added support for TLS in SPARC and x86-64 in static binaries. Alpha no longer works due to having unimplemented TLS support.
 - Fixed a race condition in rwlocks and condition variables.
diff --git a/profiling_hooks.h b/profiling_hooks.h
new file mode 100644
index 0000000..98e57a1
--- /dev/null
+++ b/profiling_hooks.h
@@ -0,0 +1,89 @@
+/*
+    m5threads, a pthread library for the M5 simulator
+    Copyright (C) 2009, Stanford University
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
+
+    Author: Daniel Sanchez
+*/
+
+/*
+ * Profiling hooks used by m5threads to measure synchronization usage.
+ *
+ * Each PROFILE_*(addr) hook is used as a statement and receives the address of
+ * the synchronization object being operated on. Hooks come in START/END pairs
+ * that bracket the operation.
+ *
+ * With M5_PROFILING defined, a hook issues an M5 profiling syscall (numbers
+ * 1040-1047) carrying that address; otherwise it expands to a no-op.
+ */
+
+//TODO: Profiling hooks for non-M5 mode
+
+#ifndef M5THREADS_PROFILING_HOOKS_H
+#define M5THREADS_PROFILING_HOOKS_H
+
+#if defined(M5_PROFILING)
+
+  /*
+   * M5 profiling syscall asm: moves syscall_num into %g1 and arg into %o0,
+   * then executes "ta 0x6d". syscall_num must be written as a literal number
+   * because it is stringified into the instruction text.
+   *
+   * The expansion is wrapped in do { } while (0) and has no trailing ';', so
+   * a hook followed by ';' is exactly one statement (safe in unbraced if/else).
+   */
+  #if defined (__sparc)
+    #define m5_prof_syscall(syscall_num, arg) do { \
+      __asm__ __volatile__ ( \
+         "mov " #syscall_num ", %%g1\n\t" \
+         "mov %0, %%o0\n\t" \
+         "ta 0x6d\n\t" \
+         :: "r"(arg) : "g1", "o0" \
+      ); \
+    } while (0)
+  #else
+    #error "M5 profiling hooks not implemented for your architecture, write them"
+  #endif
+
+  #define PROFILE_LOCK_START(addr) m5_prof_syscall(1040, addr)
+  #define PROFILE_LOCK_END(addr) m5_prof_syscall(1041, addr)
+
+  #define PROFILE_UNLOCK_START(addr) m5_prof_syscall(1042, addr)
+  #define PROFILE_UNLOCK_END(addr) m5_prof_syscall(1043, addr)
+
+  #define PROFILE_BARRIER_WAIT_START(addr) m5_prof_syscall(1044, addr)
+  #define PROFILE_BARRIER_WAIT_END(addr) m5_prof_syscall(1045, addr)
+
+  #define PROFILE_COND_WAIT_START(addr) m5_prof_syscall(1046, addr)
+  #define PROFILE_COND_WAIT_END(addr) m5_prof_syscall(1047, addr)
+
+#else
+  /* Empty hooks: a no-op expression, so the call site still needs its ';'. */
+  /* addr is not evaluated in this mode. */
+  #define PROFILE_LOCK_START(addr) ((void)0)
+  #define PROFILE_LOCK_END(addr) ((void)0)
+
+  #define PROFILE_UNLOCK_START(addr) ((void)0)
+  #define PROFILE_UNLOCK_END(addr) ((void)0)
+
+  #define PROFILE_BARRIER_WAIT_START(addr) ((void)0)
+  #define PROFILE_BARRIER_WAIT_END(addr) ((void)0)
+
+  #define PROFILE_COND_WAIT_START(addr) ((void)0)
+  #define PROFILE_COND_WAIT_END(addr) ((void)0)
+#endif
+
+#endif /* M5THREADS_PROFILING_HOOKS_H */
diff --git a/pthread.c b/pthread.c
index 972e47a..b9b85d9 100644
--- a/pthread.c
+++ b/pthread.c
@@ -45,6 +45,7 @@
 
 #include "pthread_defs.h"
 #include "tls_defs.h"
+#include "profiling_hooks.h"
 
 #define restrict 
 
@@ -126,8 +127,9 @@
   }
 
   //Set a stack guard size
-  //In SPARC/M5, this is needed to avoid out-of-range accesses on register saves...
-  //See src/arch/sparc/process.hh -- sets stackBias to 2047
+  //In SPARC, this is actually needed to avoid out-of-range accesses on register saves...
+  //Largest I have seen is 2048 (sparc64)
+  //You could avoid this in theory by compiling with -mno-stack-bias
   thread_block_info.stack_guard_size = 2048;
 
   //Total thread block size -- this is what we'll request to mmap
@@ -303,12 +305,16 @@
 }
 
 int pthread_mutex_lock (pthread_mutex_t* lock) {
+    PROFILE_LOCK_START(lock); 
     spin_lock((int*)&lock->PTHREAD_MUTEX_T_COUNT);
+    PROFILE_LOCK_END(lock);
     return 0;
 }
 
 int pthread_mutex_unlock (pthread_mutex_t* lock) {
+    PROFILE_UNLOCK_START(lock);
     spin_unlock((int*)&lock->PTHREAD_MUTEX_T_COUNT);
+    PROFILE_UNLOCK_END(lock);
     return 0;
 }
 
@@ -319,6 +325,9 @@
 int pthread_mutex_trylock (pthread_mutex_t* mutex) {
     int acquired = trylock((int*)&mutex->PTHREAD_MUTEX_T_COUNT);
     if (acquired == 1) {
+	//Profiling not really accurate here...
+	PROFILE_LOCK_START(mutex);
+	PROFILE_LOCK_END(mutex);
         return 0;
     }
     return EBUSY;
@@ -339,6 +348,7 @@
 }
 
 int pthread_rwlock_rdlock (pthread_rwlock_t* lock) {
+    PROFILE_LOCK_START(lock);
     do {
         // this is to reduce the contention and a possible live-lock to lock->access_lock
         while (1) {
@@ -352,14 +362,17 @@
         if ((pthread_t)PTHREAD_RWLOCK_T_WRITER(lock) == -1) {
             PTHREAD_RWLOCK_T_READERS(lock)++;
             spin_unlock((int*)&(PTHREAD_RWLOCK_T_LOCK(lock)));
+	    PROFILE_LOCK_END(lock);
             return 0;
         }
         spin_unlock((int*)&(PTHREAD_RWLOCK_T_LOCK(lock)));
     } while (1);
+    PROFILE_LOCK_END(lock);
     return 0;
 }
 
 int pthread_rwlock_wrlock (pthread_rwlock_t* lock) {
+    PROFILE_LOCK_START(lock);
     do {
         while (1) {
             pthread_t writer = PTHREAD_RWLOCK_T_WRITER(lock);
@@ -376,14 +389,17 @@
         if ((pthread_t)PTHREAD_RWLOCK_T_WRITER(lock) == -1 && PTHREAD_RWLOCK_T_READERS(lock) == 0) {
             PTHREAD_RWLOCK_T_WRITER(lock) = pthread_self();
             spin_unlock((int*)&(PTHREAD_RWLOCK_T_LOCK(lock)));
+	    PROFILE_LOCK_END(lock);
             return 0;
         }
         spin_unlock((int*)&(PTHREAD_RWLOCK_T_LOCK(lock)));
     } while (1);
+    PROFILE_LOCK_END(lock);
     return 0;
 }
 
 int pthread_rwlock_unlock (pthread_rwlock_t* lock) {
+    PROFILE_UNLOCK_START(lock);
     spin_lock((int*)&(PTHREAD_RWLOCK_T_LOCK(lock)));
     if (pthread_self() == PTHREAD_RWLOCK_T_WRITER(lock)) {
         // the write lock will be released
@@ -393,6 +409,7 @@
         PTHREAD_RWLOCK_T_READERS(lock) = PTHREAD_RWLOCK_T_READERS(lock) - 1;
     }
     spin_unlock((int*)&(PTHREAD_RWLOCK_T_LOCK(lock)));
+    PROFILE_UNLOCK_END(lock);
     return 0;
 }
 
@@ -489,6 +506,7 @@
 }
 
 int pthread_cond_wait (pthread_cond_t* cond, pthread_mutex_t* lock) {
+    PROFILE_COND_WAIT_START(cond);
     volatile int* thread_count  = &(PTHREAD_COND_T_THREAD_COUNT(cond));
     volatile int* flag = &(PTHREAD_COND_T_FLAG(cond));
     volatile int* count_lock    = &(PTHREAD_COND_T_COUNT_LOCK(cond));
@@ -514,6 +532,7 @@
     }
     spin_unlock(count_lock);
     pthread_mutex_lock(lock);
+    PROFILE_COND_WAIT_END(cond);
     return 0;
 }
 
@@ -602,6 +621,7 @@
 
 int pthread_barrier_wait (pthread_barrier_t* barrier)
 {
+    PROFILE_BARRIER_WAIT_START(barrier);
     int const initial_direction = PTHREAD_BARRIER_T_DIRECTION(barrier); //0 == up, 1 == down
 
     if (initial_direction == 0) {
@@ -627,7 +647,7 @@
       //spin
       direction = PTHREAD_BARRIER_T_DIRECTION(barrier);
    }
-
+   PROFILE_BARRIER_WAIT_END(barrier);
    return 0;
 }
 
diff --git a/tests/Makefile b/tests/Makefile
index ddd295f..59224b7 100755
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -13,7 +13,7 @@
 #CPP := g++
 
 #CFLAGS := -ggdb3 -O3 -D__DEBUG
-CFLAGS := -g -O3
+CFLAGS := -g -O3 -DM5_PROFILING
 
 CPPFLAGS := $(CFLAGS)