resources: Add heterosync benchmark

This commit adds heterosync, a benchmark suite used to test the
performance of various synchronization primitives on tightly-coupled
GPU systems.

Change-Id: I2b4f39ee566514a5c8d11e01769a0fc861e4556e
Reviewed-on: https://gem5-review.googlesource.com/c/public/gem5-resources/+/38695
Reviewed-by: Matt Sinclair <mattdsinclair@gmail.com>
Maintainer: Matt Sinclair <mattdsinclair@gmail.com>
Tested-by: Jason Lowe-Power <power.jg@gmail.com>
diff --git a/README.md b/README.md
index 8c8491e..9cd8bbe 100644
--- a/README.md
+++ b/README.md
@@ -463,6 +463,29 @@
 
 <http://dist.gem5.org/dist/v20-1/test-progs/hip-samples/unroll>
 
+# Resource: Heterosync
+
+[Heterosync](https://github.com/mattsinc/heterosync) is a benchmark suite used
+to test the performance of various types of fine-grained synchronization on
+tightly-coupled GPUs. The version in gem5-resources contains only the HIP code.
+
+The README in the `src/heterosync` folder details the various synchronization
+primitives and the command-line arguments for use with heterosync.
+
+## Compilation
+```
+cd src/heterosync
+docker run --rm -v ${PWD}:${PWD} -w ${PWD} -u $UID:$GID gcr.io/gem5-test/gcn-gpu make release-gfx8-apu
+```
+
+The release-gfx8-apu target builds for gfx801, a GCN3-based APU. There are other
+targets (release-gfx8, release) that build for GPU types that are currently unsupported
+in gem5.
+
+## Pre-built binary
+
+<http://dist.gem5.org/dist/develop/test-progs/heterosync/gcn3/allSyncPrims-1kernel>
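+
+## Running the benchmark in gem5
+
+The following is an illustrative example of running the binary on gem5's
+GCN3-based APU model (the core count, binary path, and benchmark arguments
+are examples only; `apu_se.py` accepts many additional options, and the same
+gcn-gpu docker image used above can be used to run gem5):
+```
+build/GCN3_X86/gem5.opt configs/example/apu_se.py -n 3 \
+    -c bin/allSyncPrims-1kernel --options="sleepMutex 10 16 4"
+```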
+
 # Resource: SPEC 2006
 
 The [Standard Performance Evaluation Corporation](
@@ -597,6 +620,7 @@
 (A University of Maryland copyright).
 * **hip-samples**: Consult individual copyright notices of the source file in
 'src/hip-samples/src'
+* **heterosync**: Consult `src/heterosync/LICENSE.txt`
 * **spec 2006**: SPEC CPU 2006 requires purchase of benchmark suite from
 [SPEC](https://www.spec.org/cpu2006/) thus, it cannot be freely distributed.
 Consult individual copyright notices of source files in `src/spec-2006`.
diff --git a/src/heterosync/.gitignore b/src/heterosync/.gitignore
new file mode 100644
index 0000000..ba077a4
--- /dev/null
+++ b/src/heterosync/.gitignore
@@ -0,0 +1 @@
+bin
diff --git a/src/heterosync/LICENSE.txt b/src/heterosync/LICENSE.txt
new file mode 100644
index 0000000..2f3da1a
--- /dev/null
+++ b/src/heterosync/LICENSE.txt
@@ -0,0 +1,46 @@
+University of Illinois/NCSA Open Source License for HeteroSync
+
+Copyright (c) 2018 The Board of Trustees of the University of Illinois
+
+All rights reserved.
+
+Developed by:       Matthew D. Sinclair, Johnathan Alsop, and Professor Sarita V. Adve
+
+                    University of Illinois at Urbana-Champaign
+
+Some of the benchmarks in HeteroSync build upon work originally done by:
+
+					Jeff Stuart and Professor John Owens
+
+					University of California, Davis
+
+                    http://rsim.cs.illinois.edu/heterosync
+
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+with the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+Redistributions of source code must retain the above copyright notice, this
+list of conditions and the following disclaimers.
+
+Redistributions in binary form must reproduce the above copyright notice, this
+list of conditions and the following disclaimers in the documentation and/or
+other materials provided with the distribution.
+
+Neither the names of Matthew Sinclair, Johnathan Alsop, Professor Sarita Adve,
+the University of Illinois at Urbana-Champaign, Jeff Stuart, Professor John Owens,
+the University of California, Davis, nor the names of its contributors may be used
+to endorse or promote products derived from this Software without specific prior
+written permission.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH
+THE SOFTWARE.
diff --git a/src/heterosync/Makefile b/src/heterosync/Makefile
new file mode 100644
index 0000000..a1aaad4
--- /dev/null
+++ b/src/heterosync/Makefile
@@ -0,0 +1,28 @@
+EXECUTABLE := allSyncPrims-1kernel
+HIP_PATH ?= /opt/rocm/hip
+
+SRC_DIR := src
+SRC := $(wildcard $(SRC_DIR)/*.hip.cpp)
+
+BIN_DIR := bin
+
+all: release
+
+release: $(SRC) | $(BIN_DIR)
+	$(HIP_PATH)/bin/hipcc -DGFX9 --amdgpu-target=gfx900 $(SRC) -o $(BIN_DIR)/$(EXECUTABLE)
+
+# gfx8 has a different number of bits it uses for sleeps, so compile accordingly
+release-gfx8: $(SRC) | $(BIN_DIR)
+	$(HIP_PATH)/bin/hipcc --amdgpu-target=gfx803 $(SRC) -o $(BIN_DIR)/$(EXECUTABLE)
+
+release-gfx8-apu: $(SRC) | $(BIN_DIR)
+	$(HIP_PATH)/bin/hipcc --amdgpu-target=gfx801 $(SRC) -o $(BIN_DIR)/$(EXECUTABLE)
+
+debug: $(SRC) | $(BIN_DIR)
+	$(HIP_PATH)/bin/hipcc -DDEBUG -g -O0 $(SRC) -o $(BIN_DIR)/$(EXECUTABLE).debug
+
+$(BIN_DIR):
+	mkdir -p $@
+
+clean:
+	rm -rf $(BIN_DIR)
diff --git a/src/heterosync/README.txt b/src/heterosync/README.txt
new file mode 100644
index 0000000..b6c15e3
--- /dev/null
+++ b/src/heterosync/README.txt
@@ -0,0 +1,135 @@
+These files are provided AS IS, and can be improved in many aspects. While we performed some performance optimization, there is more to be done. We do not claim that this is the most optimal implementation. The code is presented as a representative case of CUDA and HIP implementations of these workloads only.  It is NOT meant to be interpreted as a definitive answer to how well this application can perform on GPUs, CUDA, or HIP.  If you are interested in improving the performance of these benchmarks, please let us know or submit a pull request on GitHub.
+
+BACKGROUND INFORMATION
+----------------------
+
+Structure: All of the HeteroSync microbenchmarks are run from a single main function.  Each of the microbenchmarks has a separate .cu (CUDA) file that contains the code for its lock and unlock functions.  In the HIP version, these files are header files, because of HIP's requirements for compilation.
+
+Contents: The following Synchronization Primitives (SyncPrims) microbenchmarks are included in HeteroSync:
+
+- Centralized Mutexes:
+	1.  Spin Mutex Lock: A fairly standard spin-lock implementation.  It repeatedly tries to obtain the lock.  This version has high contention and a lot of atomic accesses since all TBs are spinning on the same lock variable.
+	2.  Spin Mutex Lock with Backoff: Standard backoff version of a spin lock where they “sleep” for a short period of time between each unsuccessful acquire.  They use a linear backoff instead of exponential backoff.  On the first failed acquire they will “sleep” for I_min; every subsequent failed read will increase the “sleep” time (up to I_max).
+	3.  Fetch-and-Add (FA) Mutex Lock (similar to Ticket/Queue-style Locks): To make their spin lock fair and have a deterministic number of atomic accesses per operation they also implement this queue-style spin lock.  Every TB uses an atomic to get a "ticket" for when they'll get the lock.  The TBs poll the “current ticket” location until their turn arrives (when it does they acquire the lock).  FAMutex uses backoff in the polling section of this lock to reduce contention.
+	4.  Ring Buffer-based Sleeping Mutex Lock: Each TB places itself on the end of the buffer and repeatedly checks if it is now at the front of the buffer.  To unlock, they increment the head pointer.  In the original paper they found that performance is bad for this one because it requires more reads and writes, and accesses to the head pointer are serialized.
+- Centralized Semaphores:
+	1.  Spin Lock Semaphore: To approximate the "perform OP if > 0" feature of semaphores (on CPUs) they use atomicExch's to block the TB until the exchange returns true.  Requires more reads and writes on a GPU than a mutex.  Each TB sets the semaphore to the appropriate new values in the post and wait phases depending on the current capacity of the semaphore.
+	2.  Spin Lock Semaphore with Backoff: As with the mutexes, they add a linear backoff to decrease contention.  The backoff is only in the wait() phase because usually more TBs are waiting, not posting.
+- Barriers:
+	1.  Atomic Barrier: a two-stage atomic counter barrier.  There are two versions of this barrier: a tree barrier, and a second version that exchanges data locally on a CU before joining the global tree barrier.
+	2.  Lock-Free Barrier: a decentralized, sleep-based approach that doesn't require atomics.  Each TB sets a flag in a distinct memory location.  Once all TBs have set their flags, each TB does an intra-block barrier between its warps.  Like the atomic barrier, there are two versions.
+
+All microbenchmarks access shared data that requires synchronization.
+
+A subsequent commit will add the Relaxed Atomics microbenchmarks discussed in our paper.
+
+USAGE
+-----
+
+Compilation:
+
+Since all of the microbenchmarks run from a single main function, users only need to compile the suite once in order to use any of them.  For the CUDA version, you will need to set CUDA_DIR in the Makefile in order to properly compile the code.  For the HIP version (the only version included here), you will need to set HIP_PATH for compilation to work correctly.
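+
+For example, with the Makefile provided in this directory (which defaults HIP_PATH to /opt/rocm/hip), the gfx801 APU target used for gem5 can be built with the following command; adjust HIP_PATH if your ROCm installation lives elsewhere:
+
+	HIP_PATH=/opt/rocm/hip make release-gfx8-apu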
+
+Running:
+
+The usage of the microbenchmarks is as follows:
+
+./allSyncPrims-1kernel <syncPrim> <numLdSt> <numTBs> <numCSIters>
+
+<syncPrim> is a string that differs for each synchronization primitive to be run:
+	// Barriers use hybrid local-global synchronization
+	- atomicTreeBarrUniq - atomic tree barrier
+	- atomicTreeBarrUniqLocalExch - atomic tree barrier with local exchange
+	- lfTreeBarrUniq - lock-free tree barrier
+	- lfTreeBarrUniqLocalExch - lock-free tree barrier with local exchange
+	// global synchronization versions
+	- spinMutex - spin lock mutex
+	- spinMutexEBO - spin lock mutex with exponential backoff
+	- sleepMutex - decentralized ticket lock
+	- faMutex - centralized ticket lock (aka, fetch-and-add mutex)
+	- spinSem1 - spin lock semaphore, semaphore size 1
+	- spinSem2 - spin lock semaphore, semaphore size 2
+	- spinSem10 - spin lock semaphore, semaphore size 10
+	- spinSem120 - spin lock semaphore, semaphore size 120
+	- spinSemEBO1 - spin lock semaphore with exponential backoff, semaphore size 1
+	- spinSemEBO2 - spin lock semaphore with exponential backoff, semaphore size 2
+	- spinSemEBO10 - spin lock semaphore with exponential backoff, semaphore size 10
+	- spinSemEBO120 - spin lock semaphore with exponential backoff, semaphore size 120
+	// local synchronization versions
+	- spinMutexUniq - local spin lock mutex
+	- spinMutexEBOUniq - local spin lock mutex with exponential backoff
+	- sleepMutexUniq - local decentralized ticket lock
+	- faMutexUniq - local centralized ticket lock
+	- spinSemUniq1 - local spin lock semaphore, semaphore size 1
+	- spinSemUniq2 - local spin lock semaphore, semaphore size 2
+	- spinSemUniq10 - local spin lock semaphore, semaphore size 10
+	- spinSemUniq120 - local spin lock semaphore, semaphore size 120
+	- spinSemEBOUniq1 - local spin lock semaphore with exponential backoff, semaphore size 1
+	- spinSemEBOUniq2 - local spin lock semaphore with exponential backoff, semaphore size 2
+	- spinSemEBOUniq10 - local spin lock semaphore with exponential backoff, semaphore size 10
+	- spinSemEBOUniq120 - local spin lock semaphore with exponential backoff, semaphore size 120
+
+<numLdSt> is a positive integer representing how many loads and stores each thread will perform.  For the mutexes and semaphores, these accesses are all performed in the critical section.  For the barriers, these accesses use barriers to ensure that multiple threads are not accessing the same data.
+
+<numTBs> is a positive integer representing the number of thread blocks (TBs) to execute.  For many of the microbenchmarks (especially the barriers), this number needs to be divisible by the number of SMs on the GPU.
+
+<numCSIters> is a positive integer representing the number of iterations of the critical section.
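+
+For example (the argument values here are arbitrary and purely illustrative), the following command runs the spin lock mutex with exponential backoff using 10 loads and stores per thread, 16 TBs, and 4 iterations of the critical section:
+
+./allSyncPrims-1kernel spinMutexEBO 10 16 4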
+
+IISWC '17 VERSION
+-----------------
+
+The version used in our IISWC '17 paper assumes a unified address space between the CPU and GPU.  Thus, it does not require any copies.  Moreover, this version is based on CUDA SDK 3.1 and HIP version 1.6, as these are the last versions fully supported by GPGPU-Sim and gem5, respectively, as of this release.  Later versions of CUDA and HIP allow additional C++ features, which may simplify the code or allow other optimizations.  Finally, this version is designed to run in the DeNovo ecosystem, which simulates a unified address space with multiple CPU cores and GPU CUs using a combination of Simics, GEMS, Garnet, and GPGPU-Sim.  In this ecosystem, we assume an SC-for-DRF style memory consistency model.  SC-for-DRF's ordering requirements are enforced by the epilogues and atomic operations.  We assume that the epilogues will self-invalidate all valid data in the local (L1) caches and flush per-CU/core store buffers to write through or obtain ownership for dirty data.
+
+Similarly, to enforce the appropriate ordering requirements, we assume that the CUDA and HIP atomic operations have specific semantics:
+ 
+Atomic      | Reprogrammed? | Load Acquire | Store Release |  Unpaired  |
+atomicAdd   |               |              |               | X (LD, ST) |
+atomicSub   |               |              |               | X (LD, ST) |
+atomicExch  |      X        |              |      X (ST)   |            |
+atomicMin   |               |              |               | X (LD, ST) |
+atomicMax   |               |              |               | X (LD, ST) |
+atomicInc   |               |              |      X (ST)   |   X (LD)   |
+atomicDec   |               |              |      X (ST)   |   X (LD)   |
+atomicCAS   |               |    X (LD)    |               |   X (ST)   |
+atomicAnd   |      X        |              |               | X (LD, ST) |
+atomicOr    |      X        |              |               | X (LD, ST) |
+atomicXor   |      X        |    X (LD)    |               |            |
+
+If your ecosystem does not make the same assumptions, then you will need to add the appropriate fences (e.g., CUDA's __threadfence() and __threadfence_block()) to ensure the proper ordering of requests in the memory system.  In the case of the HIP implementation, you may be able to use some OpenCL atomics with the desired orderings, but we left it as is to ensure portability and correctness with future versions of HIP that may not support this feature.
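+
+As an illustrative sketch (this is the pattern the lock implementations in this suite already follow, not a prescription for other systems), a store release and a load acquire built from unpaired atomics and fences look like:
+
+  // store release: fence *before* the atomic write
+  __threadfence();
+  atomicExch(&flag, 1);
+
+  // load acquire: fence *after* the atomic read
+  unsigned int val = atomicAdd(&flag, 0);
+  __threadfence();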
+
+Reprogrammed Atomics:
+
+In addition to the above assumptions about semantics for a given atomic, we have also reprogrammed some of the CUDA atomics to provide certain functionality we needed that CUDA doesn't provide:
+
+- atomicAnd() was reprogrammed to have the same functionality as an atomicInc() but without store release semantics (i.e., atomicInc has store release semantics, atomicAnd does not).  We chose atomicAnd() for this because it was not used in any of our applications.  This change was necessary because atomicInc() sometimes needs store release semantics.
+- atomicXor() was reprogrammed to do an atomic load (instead of an atomic RMW).
+- atomicOr() was reprogrammed to do an (unpaired) atomic store (instead of an atomic RMW).  We chose atomicOr for the symmetry with atomicXor and because no applications used it.
+- atomicExch() was not reprogrammed in the simulator, but we have re-purposed it assuming that the value returned by the atomicExch() is never returned or used in the program.  This allows us to treat atomicExch() as if it were an atomic store.  Thus, the programmer should consider an atomicExch() to be an atomic store.  All of the applications we have encountered thus far already did this.  In the simulator, we account for the read on the timing and functional sides.
+
+Instruction-Centric vs. Data-Centric:
+
+Common programming languages like C++ and OpenCL use a data-centric approach.  These languages identify atomic accesses by “tagging” a variable with the atomic qualifier.  These languages use an instruction-centric method for identifying which atomic accesses can/should use relaxed atomics instead of SC atomics; the accesses that can be relaxed have “memory_order_relaxed” appended to their accesses.  Since CUDA does not provide support for the same framework as C++ and OpenCL, we had to make a design decision about how to identify atomic accesses and how to identify which of those atomic accesses can use relaxed atomics vs. SC atomics.  We chose to use an instruction-centric method for identifying atomic vs. non-atomic accesses.  In this method, we designate certain CUDA atomic instructions as being load acquires, store releases, or unpaired (as denoted above).  Moreover, note that CUDA does not have direct support for atomic loads or stores.  HIP does support these, but only with OpenCL commands.
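+
+For comparison, a minimal data-centric sketch in standard C++ (not part of HeteroSync) tags the variable itself as atomic and annotates the ordering on each access:
+
+  #include <atomic>
+
+  std::atomic<unsigned int> flag{0};                      // variable tagged atomic (data-centric)
+  flag.store(1, std::memory_order_release);               // release ordering requested per access
+  unsigned int v = flag.load(std::memory_order_relaxed);  // relaxed access (no ordering requested)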
+
+CUDA UVM VERSION
+----------------
+
+The CUDA UVM version is based on CUDA SDK 6.0, and uses CUDA's unified virtual memory to avoid making explicit copies of some of the arrays and structures.  Unlike the IISWC '17 version, this version does not make any assumptions about the ordering atomics provide, nor does it require epilogues.  Instead, it adds the appropriate CUDA fence commands around atomic accesses to ensure the SC-for-DRF ordering is provided.  This version has been tested on a Pascal P100 GPU, but has not been tested as rigorously as the IISWC '17 version.
+
+HIP UVM VERSION
+----------------
+
+The HIP UVM version is based on HIP 1.6, and uses HIP's unified virtual memory to avoid making explicit copies of some of the arrays and structures.  Unlike the IISWC '17 version, this version does not make any assumptions about the ordering atomics provide, nor does it require epilogues.  Instead, it adds the appropriate HIP fence commands around atomic accesses to ensure the SC-for-DRF ordering is provided.  This version has been tested on a Vega 56 GPU, but has not been tested as rigorously as the IISWC '17 version.
+
+CITATION
+--------
+
+If you publish work that uses these benchmarks, please cite the following papers:
+
+1.  M. D. Sinclair, J. Alsop, and S. V. Adve, HeteroSync: A Benchmark Suite for Fine-Grained Synchronization on Tightly Coupled GPUs, in the IEEE International Symposium on Workload Characterization (IISWC), October 2017
+
+2.  J. A. Stuart and J. D. Owens, “Efficient Synchronization Primitives for GPUs,” CoRR, vol. abs/1110.4623, 2011
+
+ACKNOWLEDGEMENTS
+----------------
+
+This work was supported in part by a Qualcomm Innovation Fellowship for Sinclair, the National Science Foundation under grants CCF 13-02641 and CCF 16-19245, the Center for Future Architectures Research (C-FAR), a Semiconductor Research Corporation program sponsored by MARCO and DARPA, and the Center for Applications Driving Architectures (ADA), one of six centers of JUMP, a Semiconductor Research Corporation program co-sponsored by DARPA.
diff --git a/src/heterosync/src/hipLocks.h b/src/heterosync/src/hipLocks.h
new file mode 100644
index 0000000..690ce4f
--- /dev/null
+++ b/src/heterosync/src/hipLocks.h
@@ -0,0 +1,87 @@
+#ifndef __HIPLOCKS_H__
+#define __HIPLOCKS_H__
+
+// used for calling s_sleep
+extern "C" void __builtin_amdgcn_s_sleep(int);
+
+/*
+  Shared sleep function.  Since s_sleep only takes constants (between 1 and 128),
+  we need code to handle the long tail.
+
+  Currently s_sleep is unsupported in gem5, so sleepFunc is commented out and
+  replaced with a spin in the lock implementations
+ */
+inline __device__ void sleepFunc(int backoff) {
+  int backoffCopy = backoff;
+#ifdef GFX9
+  // max for gfx9 is 127
+  for (int i = 0; i < backoff; i += 127) {
+    __builtin_amdgcn_s_sleep(127);
+    backoffCopy -= 127;
+  }
+#else
+  // max for gfx8 is 15
+  for (int i = 0; i < backoff; i += 15) {
+    __builtin_amdgcn_s_sleep(15);
+    backoffCopy -= 15;
+  }
+#endif
+
+  // handle any additional backoff
+#ifdef GFX9
+  if (backoffCopy > 64) {
+    __builtin_amdgcn_s_sleep(64);
+    backoffCopy -= 64;
+  }
+  if (backoffCopy > 32) {
+    __builtin_amdgcn_s_sleep(32);
+    backoffCopy -= 32;
+  }
+  if (backoffCopy > 16) {
+    __builtin_amdgcn_s_sleep(16);
+    backoffCopy -= 16;
+  }
+#endif
+  if (backoffCopy > 8) {
+    __builtin_amdgcn_s_sleep(8);
+    backoffCopy -= 8;
+  }
+  if (backoffCopy > 4) {
+    __builtin_amdgcn_s_sleep(4);
+    backoffCopy -= 4;
+  }
+  if (backoffCopy > 2) {
+    __builtin_amdgcn_s_sleep(2);
+    backoffCopy -= 2;
+  }
+  if (backoffCopy > 1) {
+    __builtin_amdgcn_s_sleep(1);
+    backoffCopy -= 1;
+  }
+}
+
+typedef struct hipLockData
+{
+  int maxBufferSize;
+  int arrayStride;
+  int mutexCount;
+  int semaphoreCount;
+
+  unsigned int * barrierBuffers;
+  int * mutexBuffers;
+  unsigned int * mutexBufferHeads;
+  unsigned int * mutexBufferTails;
+  unsigned int * semaphoreBuffers;
+} hipLockData_t;
+
+typedef unsigned int hipMutex_t;
+typedef unsigned int hipSemaphore_t;
+
+static hipLockData_t * cpuLockData;
+
+hipError_t hipLocksInit(const int maxBlocksPerKernel, const int numMutexes,
+                        const int numSemaphores, const bool pageAlign,
+                        const int NUM_CU);
+hipError_t hipLocksDestroy();
+
+#endif
diff --git a/src/heterosync/src/hipLocksBarrier.h b/src/heterosync/src/hipLocksBarrier.h
new file mode 100644
index 0000000..dd11d08
--- /dev/null
+++ b/src/heterosync/src/hipLocksBarrier.h
@@ -0,0 +1,7 @@
+#ifndef __HIPLOCKSBARRIER_H__
+#define __HIPLOCKSBARRIER_H__
+
+#include "hipLocksBarrierAtomic.h"
+#include "hipLocksBarrierFast.h"
+
+#endif
diff --git a/src/heterosync/src/hipLocksBarrierAtomic.h b/src/heterosync/src/hipLocksBarrierAtomic.h
new file mode 100644
index 0000000..a51a77f
--- /dev/null
+++ b/src/heterosync/src/hipLocksBarrierAtomic.h
@@ -0,0 +1,184 @@
+#ifndef __HIPLOCKSBARRIERATOMIC_H__
+#define __HIPLOCKSBARRIERATOMIC_H__
+
+#include "hip/hip_runtime.h"
+#include "hipLocks.h"
+
+inline __device__ void hipBarrierAtomicSub(unsigned int * globalBarr,
+                                            int * done,
+                                            // numBarr represents the number of
+                                            // WGs going to the barrier
+                                            const unsigned int numBarr,
+                                            int backoff,
+                                            const bool isMasterThread)
+{
+  __syncthreads();
+  if (isMasterThread)
+  {
+    *done = 0;
+
+    // atomicInc acts as a store release, need TF to enforce ordering
+    __threadfence();
+    // atomicInc effectively adds 1 to atomic for each WG that's part of the
+    // global barrier.
+    /*
+      HIP currently doesn't generate the correct code for atomicInc's here,
+      so replace with an atomicAdd of 1 and assume no wraparound
+    */
+    //atomicInc(globalBarr, 0x7FFFFFFF);
+    atomicAdd(globalBarr, 1);
+  }
+  __syncthreads();
+
+  while (!*done)
+  {
+    if (isMasterThread)
+    {
+      /*
+        For the tree barrier we expect only 1 WG from each CU to enter the
+        global barrier.  Since we are assuming an equal amount of work for all
+        CUs, we can use the # of WGs reaching the barrier for the compare value
+        here.  Once the atomic's value == numBarr, then reset the value to 0 and
+        proceed because all of the WGs have reached the global barrier.
+      */
+      if (atomicCAS(globalBarr, numBarr, 0) == 0) {
+        // atomicCAS acts as a load acquire, need TF to enforce ordering
+        __threadfence();
+        *done = 1;
+      }
+      else { // increase backoff to avoid repeatedly hammering global barrier
+        // (capped) exponential backoff
+        backoff = (((backoff << 1) + 1) & (MAX_BACKOFF-1));
+      }
+    }
+    __syncthreads();
+
+    // do exponential backoff to reduce the number of times we pound the global
+    // barrier
+    if (!*done) {
+      //sleepFunc(backoff);
+      for (int j = 0; j < backoff; ++j) { ; }
+      __syncthreads();
+    }
+  }
+}
+
+inline __device__ void hipBarrierAtomic(unsigned int * barrierBuffers,
+                                         // numBarr represents the number of
+                                         // WGs going to the barrier
+                                         const unsigned int numBarr,
+                                         const bool isMasterThread)
+{
+  unsigned int * atomic1 = barrierBuffers;
+  unsigned int * atomic2 = atomic1 + 1;
+  __shared__ int done1, done2;
+  __shared__ int backoff;
+
+  if (isMasterThread) {
+    backoff = 1;
+  }
+  __syncthreads();
+
+  hipBarrierAtomicSub(atomic1, &done1, numBarr, backoff, isMasterThread);
+  // second barrier is necessary to provide a facsimile of a sense-reversing
+  // barrier
+  hipBarrierAtomicSub(atomic2, &done2, numBarr, backoff, isMasterThread);
+}
+
+// does local barrier amongst all of the WGs on a CU
+inline __device__ void hipBarrierAtomicSubLocal(unsigned int * perCUBarr,
+                                                 int * done,
+                                                 const unsigned int numWGs_thisCU,
+                                                 const bool isMasterThread)
+{
+  __syncthreads();
+  if (isMasterThread)
+  {
+    *done = 0;
+    // atomicInc acts as a store release, need TF to enforce ordering locally
+    __threadfence_block();
+    /*
+      atomicInc effectively adds 1 to atomic for each WG that's part of the
+      barrier.  For the local barrier, this requires using the per-CU
+      locations.
+    */
+    /*
+      HIP currently doesn't generate the correct code for atomicInc's here,
+      so replace with an atomicAdd of 1 and assume no wraparound
+    */
+    //atomicInc(perCUBarr, 0x7FFFFFFF);
+    atomicAdd(perCUBarr, 1);
+  }
+  __syncthreads();
+
+  while (!*done)
+  {
+    if (isMasterThread)
+    {
+      /*
+        Once all of the WGs on this CU have incremented the value at atomic,
+        then the value (for the local barrier) should be equal to the # of WGs
+        on this CU.  Once that is true, then we want to reset the atomic to 0
+        and proceed because all of the WGs on this CU have reached the local
+        barrier.
+      */
+      if (atomicCAS(perCUBarr, numWGs_thisCU, 0) == 0) {
+        // atomicCAS acts as a load acquire, need TF to enforce ordering
+        // locally
+        __threadfence_block();
+        *done = 1;
+      }
+    }
+    __syncthreads();
+  }
+}
+
+// does local barrier amongst all of the WGs on a CU
+inline __device__ void hipBarrierAtomicLocal(unsigned int * perCUBarrierBuffers,
+                                              const unsigned int cuID,
+                                              const unsigned int numWGs_thisCU,
+                                              const bool isMasterThread,
+                                              const int MAX_BLOCKS)
+{
+  // each CU has MAX_BLOCKS locations in barrierBuffers, so my CU's locations
+  // start at barrierBuffers[cuID*MAX_BLOCKS]
+  unsigned int * atomic1 = perCUBarrierBuffers + (cuID * MAX_BLOCKS);
+  unsigned int * atomic2 = atomic1 + 1;
+  __shared__ int done1, done2;
+
+  hipBarrierAtomicSubLocal(atomic1, &done1, numWGs_thisCU, isMasterThread);
+  // second barrier is necessary to approximate a sense-reversing barrier
+  hipBarrierAtomicSubLocal(atomic2, &done2, numWGs_thisCU, isMasterThread);
+}
+
+/*
+  Helper function for joining the barrier with the atomic tree barrier.
+*/
+__attribute__((always_inline)) __device__ void joinBarrier_helper(unsigned int * barrierBuffers,
+                                                                  unsigned int * perCUBarrierBuffers,
+                                                                  const unsigned int numBlocksAtBarr,
+                                                                  const int cuID,
+                                                                  const int perCU_blockID,
+                                                                  const int numWGs_perCU,
+                                                                  const bool isMasterThread,
+                                                                  const int MAX_BLOCKS) {
+  if (numWGs_perCU > 1) {
+    hipBarrierAtomicLocal(perCUBarrierBuffers, cuID, numWGs_perCU,
+                           isMasterThread, MAX_BLOCKS);
+
+    // only 1 WG per CU needs to do the global barrier since we synchronized
+    // the WGs locally first
+    if (perCU_blockID == 0) {
+      hipBarrierAtomic(barrierBuffers, numBlocksAtBarr, isMasterThread);
+    }
+
+    // all WGs on this CU do a local barrier to ensure global barrier is
+    // reached
+    hipBarrierAtomicLocal(perCUBarrierBuffers, cuID, numWGs_perCU,
+                           isMasterThread, MAX_BLOCKS);
+  } else { // if only 1 WG on the CU, no need for the local barriers
+    hipBarrierAtomic(barrierBuffers, numBlocksAtBarr, isMasterThread);
+  }
+}
+
+#endif
diff --git a/src/heterosync/src/hipLocksBarrierFast.h b/src/heterosync/src/hipLocksBarrierFast.h
new file mode 100644
index 0000000..eaa9837
--- /dev/null
+++ b/src/heterosync/src/hipLocksBarrierFast.h
@@ -0,0 +1,335 @@
+#ifndef __HIPLOCKSBARRIERFAST_H__
+#define __HIPLOCKSBARRIERFAST_H__
+
+#include "hip/hip_runtime.h"
+#include "hipLocks.h"
+
+/*
+  Helper function to set the passed in inVars flag to 1 (signifies that this WG
+  has joined the barrier).
+ */
+inline __device__ void setMyInFlag(unsigned int * inVars,
+                                   const unsigned int threadID,
+                                   const unsigned int blockID) {
+  if (threadID == 0)
+  {
+    // atomicExch acts as a store release, need TF to enforce ordering
+    __threadfence();
+    atomicExch((unsigned int *)(inVars + blockID), 1);
+  }
+  __syncthreads();
+}
+
+/*
+  Helper function for the main WG of this group to spin, checking to see if
+  all other WGs joining this barrier have joined or not.
+ */
+inline __device__ void spinOnInFlags(unsigned int * inVars,
+                                     const int threadID,
+                                     const int numThreads,
+                                     const int numBlocks) {
+  // local variables
+  int done3 = 1;
+
+  // "main" WG loops, checking if everyone else has joined the barrier.
+  do
+  {
+    done3 = 1;
+
+    /*
+      Each thread in the main WG accesses a subset of the blocks, checking
+      if they have joined the barrier yet or not.
+    */
+    for (int i = threadID; i < numBlocks; i += numThreads)
+    {
+      if (atomicAdd(&(inVars[i]), 0) != 1) {
+        // acts as a load acquire, need TF to enforce ordering
+        __threadfence();
+
+        done3 = 0;
+        // if one of them isn't ready, don't bother checking the others (just
+        // increases traffic)
+        break;
+      }
+    }
+  } while (!done3);
+  /*
+    When all the necessary WGs have joined the barrier, the threads will
+    reconverge here -- this avoids unnecessary atomic accesses for threads
+    whose assigned WGs have already joined the barrier.
+  */
+  __syncthreads();
+}
+
+/*
+  Helper function for the main WG of this group to spin, checking to see if
+  all other WGs joining this barrier have joined or not.
+*/
+inline __device__ void spinOnInFlags_local(unsigned int * inVars,
+                                           const int threadID,
+                                           const int numThreads,
+                                           const int numBlocks) {
+  // local variables
+  int done3 = 1;
+
+  // "main" WG loops, checking if everyone else has joined the barrier.
+  do
+  {
+    done3 = 1;
+
+    /*
+      Each thread in the main WG accesses a subset of the blocks, checking
+      if they have joined the barrier yet or not.
+    */
+    for (int i = threadID; i < numBlocks; i += numThreads)
+    {
+      if (atomicAdd(&(inVars[i]), 0) != 1) {
+        // acts as a load acquire, need TF to enforce ordering locally
+        __threadfence_block();
+
+        done3 = 0;
+        // if one of them isn't ready, don't bother checking the others (just
+        // increases traffic)
+        break;
+      }
+    }
+  } while (!done3);
+  /*
+    When all the necessary WGs have joined the barrier, the threads will
+    reconverge here -- this avoids unnecessary atomic accesses for threads
+    whose assigned WGs have already joined the barrier.
+  */
+  __syncthreads();
+}
+
+/*
+  Helper function for main WG to set the outVars flags for all WGs at this
+  barrier to notify them that everyone has joined the barrier and they can
+  proceed.
+*/
+inline __device__ void setOutFlags(unsigned int * inVars,
+                                   unsigned int * outVars,
+                                   const int threadID,
+                                   const int numThreads,
+                                   const int numBlocks) {
+  for (int i = threadID; i < numBlocks; i += numThreads)
+  {
+    atomicExch(&(inVars[i]), 0);
+    atomicExch(&(outVars[i]), 1);
+  }
+  __syncthreads();
+  // outVars acts as a store release, need TF to enforce ordering
+  __threadfence();
+}
+
+/*
+  Helper function for main WG to set the outVars flags for all WGs at this
+  barrier to notify them that everyone has joined the barrier and they can
+  proceed.
+*/
+inline __device__ void setOutFlags_local(unsigned int * inVars,
+                                         unsigned int * outVars,
+                                         const int threadID,
+                                         const int numThreads,
+                                         const int numBlocks) {
+  for (int i = threadID; i < numBlocks; i += numThreads)
+  {
+    atomicExch(&(inVars[i]), 0);
+    atomicExch(&(outVars[i]), 1);
+  }
+  __syncthreads();
+  // outVars acts as a store release, need TF to enforce ordering locally
+  __threadfence_block();
+}
+
+/*
+  Helper function for each WG to spin waiting for its outVars flag to be set
+  by the main WG.  When it is set, then this WG can safely exit the barrier.
+*/
+inline __device__ void spinOnMyOutFlag(unsigned int * inVars,
+                                       unsigned int * outVars,
+                                       const int blockID,
+                                       const int threadID) {
+  if (threadID == 0)
+  {
+    while (atomicAdd(&(outVars[blockID]), 0) != 1) { ; }
+
+    atomicExch(&(inVars[blockID]), 0);
+    atomicExch(&(outVars[blockID]), 0);
+    // these stores act as a store release, need TF to enforce ordering
+    __threadfence();
+  }
+  __syncthreads();
+}
+
+/*
+  Helper function for each WG to spin waiting for its outVars flag to be set
+  by the main WG.  When it is set, then this WG can safely exit the barrier.
+*/
+inline __device__ void spinOnMyOutFlag_local(unsigned int * inVars,
+                                             unsigned int * outVars,
+                                             const int blockID,
+                                             const int threadID) {
+  if (threadID == 0)
+  {
+    while (atomicAdd(&(outVars[blockID]), 0) != 1) { ; }
+
+    atomicExch(&(inVars[blockID]), 0);
+    atomicExch(&(outVars[blockID]), 0);
+    // these stores act as a store release, need TF to enforce ordering locally
+    __threadfence_block();
+  }
+  __syncthreads();
+}
+
+__device__ void hipBarrier(unsigned int * barrierBuffers,
+                           const int arrayStride,
+                           const unsigned int numBlocksAtBarr)
+{
+  // local variables
+  const int threadID = hipThreadIdx_x;
+  const int blockID = hipBlockIdx_x;
+  const int numThreads = hipBlockDim_x;
+  // ** NOTE: setting numBlocks like this only works if the first WG on
+  // each CU joins the global barrier
+  const int numBlocks = numBlocksAtBarr;
+  unsigned int * const inVars  = barrierBuffers;
+  unsigned int * const outVars = barrierBuffers + arrayStride;
+
+  /*
+    Thread 0 from each WG sets its 'private' flag in the in array to 1 to
+    signify that it has joined the barrier.
+  */
+  setMyInFlag(inVars, threadID, blockID);
+
+  // WG 0 is the "main" WG for the global barrier
+  if (blockID == 0)
+  {
+    // "main" WG loops, checking if everyone else has joined the barrier.
+    spinOnInFlags(inVars, threadID, numThreads, numBlocks);
+
+    /*
+      Once all the WGs arrive at the barrier, the main WG resets them to
+      notify everyone else that they can move forward beyond the barrier --
+      again each thread in the main WG takes a subset of the necessary WGs
+      and sets their in flag to 0 and out flag to 1.
+    */
+    setOutFlags(inVars, outVars, threadID, numThreads, numBlocks);
+  }
+
+  /*
+    All WGs (including the main one) spin, checking to see if the main one
+    set their out location yet -- if it did, then they can move ahead
+    because the barrier is done.
+  */
+  spinOnMyOutFlag(inVars, outVars, blockID, threadID);
+}
+
+// same algorithm but per-CU synchronization
+__device__ void hipBarrierLocal(// for global barrier
+                                unsigned int * barrierBuffers,
+                                const unsigned int numBlocksAtBarr,
+                                const int arrayStride,
+                                // for local barrier
+                                unsigned int * perCUBarrierBuffers,
+                                const unsigned int cuID,
+                                const unsigned int numWGs_perCU,
+                                const unsigned int perCU_blockID,
+                                const bool isLocalGlobalBarr,
+                                const int MAX_BLOCKS)
+{
+  // local variables
+  const int threadID = hipThreadIdx_x;
+  const int numThreads = hipBlockDim_x;
+  const int numBlocks = numWGs_perCU;
+  /*
+    Each CU has MAX_BLOCKS*2 locations in perCUBarrierBuffers, so my CU's
+    inVars locations start at perCUBarrierBuffers[cuID*2*MAX_BLOCKS] and my
+    CU's outVars locations start at
+    perCUBarrierBuffers[cuID*2*MAX_BLOCKS + MAX_BLOCKS].
+  */
+  unsigned int * const inVars  = perCUBarrierBuffers + (MAX_BLOCKS * cuID * 2);
+  unsigned int * const outVars = perCUBarrierBuffers + ((MAX_BLOCKS * cuID * 2) + MAX_BLOCKS);
+
+  /*
+    Thread 0 from each WG sets its 'private' flag in the in array to 1 to
+    signify that it has joined the barrier.
+  */
+  setMyInFlag(inVars, threadID, perCU_blockID);
+
+  // first WG on this CU is the "main" WG for the local barrier
+  if (perCU_blockID == 0)
+  {
+    // "main" WG loops, checking if everyone else has joined the barrier.
+    spinOnInFlags_local(inVars, threadID, numThreads, numBlocks);
+
+    /*
+      If we are calling the global tree barrier from within the local tree
+      barrier, call it here.  Now that all of the WGs on this CU have joined
+      the local barrier, WG 0 on this CU joins the global barrier.
+    */
+    if (isLocalGlobalBarr) {
+      hipBarrier(barrierBuffers, arrayStride, numBlocksAtBarr);
+    }
+
+    /*
+      Once all the WGs arrive at the barrier, the main WG resets their inVar
+      and sets their outVar to notify everyone else that they can move
+      forward beyond the barrier -- each thread in the main WG takes a subset
+      of the necessary WGs and sets their in flag to 0 and out flag to 1.
+    */
+    setOutFlags_local(inVars, outVars, threadID, numThreads, numBlocks);
+  }
+
+  /*
+    All WGs (including the main one) spin, checking to see if the main WG
+    set their out location yet -- if it did, then they can move ahead
+    because the barrier is done.
+  */
+  spinOnMyOutFlag_local(inVars, outVars, perCU_blockID, threadID);
+}
+
+/*
+  Decentralized tree barrier that has 1 WG per CU join the global decentralized
+  barrier in the middle, then sets the out flags of the others on this CU to 1
+  after returning.  This avoids the need for a second local barrier after the
+  global barrier.
+*/
+__device__ void hipBarrierLocalGlobal(// for global barrier
+                                      unsigned int * barrierBuffers,
+                                      const unsigned int numBlocksAtBarr,
+                                      const int arrayStride,
+                                      // for local barrier
+                                      unsigned int * perCUBarrierBuffers,
+                                      const unsigned int cuID,
+                                      const unsigned int numWGs_perCU,
+                                      const unsigned int perCU_blockID,
+                                      const int MAX_BLOCKS)
+{
+  // will call global barrier within it
+  hipBarrierLocal(barrierBuffers, numBlocksAtBarr, arrayStride,
+                  perCUBarrierBuffers, cuID, numWGs_perCU, perCU_blockID,
+                  true, MAX_BLOCKS);
+}
+
+/*
+  Helper function for joining the barrier with the 'lock-free' tree barrier.
+*/
+__device__ void joinLFBarrier_helper(unsigned int * barrierBuffers,
+                                     unsigned int * perCUBarrierBuffers,
+                                     const unsigned int numBlocksAtBarr,
+                                     const int cuID,
+                                     const int perCU_blockID,
+                                     const int numWGs_perCU,
+                                     const int arrayStride,
+                                     const int MAX_BLOCKS) {
+  if (numWGs_perCU > 1) {
+    hipBarrierLocalGlobal(barrierBuffers, numBlocksAtBarr, arrayStride,
+                          perCUBarrierBuffers, cuID, numWGs_perCU,
+                          perCU_blockID, MAX_BLOCKS);
+  } else { // if only 1 WG on the CU, no need for the local barriers
+    hipBarrier(barrierBuffers, arrayStride, numBlocksAtBarr);
+  }
+}
+
+#endif
diff --git a/src/heterosync/src/hipLocksImpl.h b/src/heterosync/src/hipLocksImpl.h
new file mode 100644
index 0000000..b04fb37
--- /dev/null
+++ b/src/heterosync/src/hipLocksImpl.h
@@ -0,0 +1,93 @@
+#include "hipLocks.h"
+
+hipError_t hipLocksInit(const int maxWGsPerKernel, const int numMutexes,
+                        const int numSemaphores, const bool pageAlign,
+                        const int NUM_CU, const int NUM_REPEATS,
+                        const int NUM_ITERS)
+{
+  hipError_t hipErr = hipGetLastError();
+  checkError(hipErr, "Start hipLocksInit");
+
+  hipHostMalloc(&cpuLockData, sizeof(hipLockData_t));
+
+  if (maxWGsPerKernel <= 0)       return hipErrorInitializationError;
+  if (numMutexes <= 0)            return hipErrorInitializationError;
+  if (numSemaphores <= 0)         return hipErrorInitializationError;
+
+  // initialize some of the lock data's values
+  /*
+    Since HIP doesn't generate the correct code for atomicInc's, this
+    means wraparound is not handled properly.  However, since in the current
+    version each subsequent kernel launch starts in the ring buffer where
+    the last kernel left off, this eventually leads to wraparound.  Increase
+    buffer size to prevent wraparound and hide this.
+  */
+  cpuLockData->maxBufferSize          = maxWGsPerKernel * NUM_REPEATS * NUM_ITERS;
+  cpuLockData->arrayStride            = (cpuLockData->maxBufferSize + NUM_CU) /
+                                          NUM_WORDS_PER_CACHELINE * NUM_WORDS_PER_CACHELINE;
+  cpuLockData->mutexCount             = numMutexes;
+  cpuLockData->semaphoreCount         = numSemaphores;
+
+  hipMalloc(&cpuLockData->barrierBuffers,   sizeof(unsigned int) * cpuLockData->arrayStride * 2);
+
+  hipMalloc(&cpuLockData->mutexBuffers,     sizeof(int) * cpuLockData->arrayStride * cpuLockData->mutexCount);
+  hipMalloc(&cpuLockData->mutexBufferHeads, sizeof(unsigned int) * cpuLockData->mutexCount);
+  hipMalloc(&cpuLockData->mutexBufferTails, sizeof(unsigned int) * cpuLockData->mutexCount);
+
+  hipMalloc(&cpuLockData->semaphoreBuffers, sizeof(unsigned int) * 4 * cpuLockData->semaphoreCount);
+
+  hipErr = hipGetLastError();
+  checkError(hipErr, "Before memsets");
+
+  hipDeviceSynchronize();
+
+  hipMemset(cpuLockData->barrierBuffers, 0,
+            sizeof(unsigned int) * cpuLockData->arrayStride * 2);
+
+  hipMemset(cpuLockData->mutexBufferHeads, 0,
+            sizeof(unsigned int) * cpuLockData->mutexCount);
+  hipMemset(cpuLockData->mutexBufferTails, 0,
+            sizeof(unsigned int) * cpuLockData->mutexCount);
+
+  /*
+    initialize mutexBuffers to appropriate values
+
+    set the first location for each CU to 1 so that the ring buffer can be
+    used by the first WG right away (otherwise livelock because no locations
+    ever == 1)
+
+    for all other locations initialize to -1 so WGs for these locations
+    don't think it's their turn right away
+
+    since hipMemset sets everything in bytes, initialize all to 0 first
+  */
+  hipMemset(&(cpuLockData->mutexBuffers[0]), 0,
+            cpuLockData->arrayStride * cpuLockData->mutexCount * sizeof(int));
+  for (int i = 0; i < (cpuLockData->arrayStride * cpuLockData->mutexCount);
+       i += cpuLockData->arrayStride) {
+    hipMemset(&(cpuLockData->mutexBuffers[i]), 0x0001, 1);
+    hipMemset(&(cpuLockData->mutexBuffers[i + 1]), -1,
+              (cpuLockData->arrayStride - 1) * sizeof(int));
+  }
+
+  hipMemset(cpuLockData->semaphoreBuffers, 0,
+            sizeof(unsigned int) * cpuLockData->semaphoreCount * 4);
+
+  hipDeviceSynchronize();
+
+  return hipSuccess;
+}
+
+hipError_t hipLocksDestroy()
+{
+  if (cpuLockData == NULL) { return hipErrorInitializationError; }
+  hipFree(cpuLockData->mutexBuffers);
+  hipFree(cpuLockData->mutexBufferHeads);
+  hipFree(cpuLockData->mutexBufferTails);
+
+  hipFree(cpuLockData->semaphoreBuffers);
+
+  hipHostFree(cpuLockData);
+
+  return hipSuccess;
+}
diff --git a/src/heterosync/src/hipLocksMutex.h b/src/heterosync/src/hipLocksMutex.h
new file mode 100644
index 0000000..638602b
--- /dev/null
+++ b/src/heterosync/src/hipLocksMutex.h
@@ -0,0 +1,10 @@
+#ifndef __HIPLOCKSMUTEX_H__
+#define __HIPLOCKSMUTEX_H__
+
+#include "hipLocks.h"
+#include "hipLocksMutexSpin.h"
+#include "hipLocksMutexEBO.h"
+#include "hipLocksMutexFA.h"
+#include "hipLocksMutexSleep.h"
+
+#endif
diff --git a/src/heterosync/src/hipLocksMutexEBO.h b/src/heterosync/src/hipLocksMutexEBO.h
new file mode 100644
index 0000000..0adaac0
--- /dev/null
+++ b/src/heterosync/src/hipLocksMutexEBO.h
@@ -0,0 +1,128 @@
+#ifndef __HIPLOCKSMUTEXEBO_H__
+#define __HIPLOCKSMUTEXEBO_H__
+
+#include "hip/hip_runtime.h"
+#include "hipLocks.h"
+
+inline __host__ hipError_t hipMutexCreateEBO(hipMutex_t * const handle,
+                                             const int mutexNumber)
+{
+  *handle = mutexNumber;
+  return hipSuccess;
+}
+
+inline __device__ void hipMutexEBOLock(const hipMutex_t mutex,
+                                       unsigned int * mutexBufferHeads,
+                                       const int NUM_CU)
+{
+  // local variables
+  __shared__ int done, backoff;
+  const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
+                               hipThreadIdx_z == 0);
+  unsigned int * mutexHeadPtr = NULL;
+
+  if (isMasterThread)
+  {
+    backoff = 1;
+    done = 0;
+    mutexHeadPtr = (mutexBufferHeads + (mutex * NUM_CU));
+  }
+  __syncthreads();
+  while (!done)
+  {
+    __syncthreads();
+    if (isMasterThread)
+    {
+      // try to acquire the lock
+      if (atomicCAS(mutexHeadPtr, 0, 1) == 0) {
+        // atomicCAS acts as a load acquire, need TF to enforce ordering
+        __threadfence();
+        done = 1;
+      }
+      else
+      {
+        // if we failed in acquiring the lock, wait for a little while before
+        // trying again
+        //sleepFunc(backoff);
+        for (int j = 0; j < backoff; ++j) { ; }
+        // (capped) exponential backoff
+        backoff = (((backoff << 1) + 1) & (MAX_BACKOFF-1));
+      }
+    }
+    __syncthreads();
+  }
+}
+
+inline __device__ void hipMutexEBOUnlock(const hipMutex_t mutex,
+                                         unsigned int * mutexBufferHeads,
+                                         const int NUM_CU)
+{
+  __syncthreads();
+  if (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 && hipThreadIdx_z == 0) {
+    // atomicExch acts as a store release, need TF to enforce ordering
+    __threadfence();
+    atomicExch(mutexBufferHeads + (mutex * NUM_CU), 0); // release the lock
+  }
+  __syncthreads();
+}
+
+// same locking algorithm but with local scope
+inline __device__ void hipMutexEBOLockLocal(const hipMutex_t mutex,
+                                            const unsigned int cuID,
+                                            unsigned int * mutexBufferHeads,
+                                            const int NUM_CU)
+{
+  // local variables
+  __shared__ int done, backoff;
+  const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
+                               hipThreadIdx_z == 0);
+  unsigned int * mutexHeadPtr = NULL;
+
+  if (isMasterThread)
+  {
+    backoff = 1;
+    done = 0;
+    mutexHeadPtr = (mutexBufferHeads + ((mutex * NUM_CU) + cuID));
+  }
+  __syncthreads();
+  while (!done)
+  {
+    __syncthreads();
+    if (isMasterThread)
+    {
+      // try to acquire the lock
+      if (atomicCAS(mutexHeadPtr, 0, 1) == 0) {
+        // atomicCAS acts as a load acquire, need TF to enforce ordering locally
+        __threadfence_block();
+        done = 1;
+      }
+      else
+      {
+        // if we failed in acquiring the lock, wait for a little while before
+        // trying again
+        //sleepFunc(backoff);
+        for (int j = 0; j < backoff; ++j) { ; }
+        // (capped) exponential backoff
+        backoff = (((backoff << 1) + 1) & (MAX_BACKOFF-1));
+      }
+    }
+    __syncthreads();
+  }
+}
+
+// same unlock algorithm but with local scope
+inline __device__ void hipMutexEBOUnlockLocal(const hipMutex_t mutex,
+                                              const unsigned int cuID,
+                                              unsigned int * mutexBufferHeads,
+                                              const int NUM_CU)
+{
+  __syncthreads();
+  if (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 && hipThreadIdx_z == 0) {
+    // atomicExch acts as a store release, need TF to enforce ordering locally
+    __threadfence_block();
+    atomicExch(mutexBufferHeads + ((mutex * NUM_CU) + cuID), 0); // release the lock
+  }
+  __syncthreads();
+}
+
+#endif
diff --git a/src/heterosync/src/hipLocksMutexFA.h b/src/heterosync/src/hipLocksMutexFA.h
new file mode 100644
index 0000000..8a6428b
--- /dev/null
+++ b/src/heterosync/src/hipLocksMutexFA.h
@@ -0,0 +1,146 @@
+#ifndef __HIPLOCKSMUTEXFA_H__
+#define __HIPLOCKSMUTEXFA_H__
+
+#include "hip/hip_runtime.h"
+#include "hipLocks.h"
+
+inline __host__ hipError_t hipMutexCreateFA(hipMutex_t * const handle,
+                                            const int mutexNumber)
+{
+  *handle = mutexNumber;
+  return hipSuccess;
+}
+
+inline __device__ void hipMutexFALock(const hipMutex_t mutex,
+                                      unsigned int * mutexBufferHeads,
+                                      unsigned int * mutexBufferTails,
+                                      const int NUM_CU)
+{
+  const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
+                               hipThreadIdx_z == 0);
+  __shared__ unsigned int myTicketNum;
+  __shared__ bool haveLock;
+  const unsigned int maxTurnNum = 1000000000;
+
+  unsigned int * ticketNumber = mutexBufferHeads + (mutex * NUM_CU);
+  unsigned int * turnNumber =
+      (unsigned int * )mutexBufferTails + (mutex * NUM_CU);
+
+  __syncthreads();
+  if (isMasterThread)
+  {
+    // load below provides ordering, no TF needed
+    myTicketNum = atomicInc(ticketNumber, maxTurnNum);
+    haveLock = false;
+  }
+  __syncthreads();
+  while (!haveLock)
+  {
+    if (isMasterThread)
+    {
+      unsigned int currTicketNum = atomicAdd(turnNumber, 0);
+
+      // it's my turn, I get the lock now
+      if (currTicketNum == myTicketNum) {
+        // above acts as a load acquire, so need TF to enforce ordering
+        __threadfence();
+        haveLock = true;
+      }
+    }
+    __syncthreads();
+  }
+}
+
+inline __device__ void hipMutexFAUnlock(const hipMutex_t mutex,
+                                        unsigned int * mutexBufferTails,
+                                        const int NUM_CU)
+{
+  const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
+                               hipThreadIdx_z == 0);
+  const unsigned int maxTurnNum = 1000000000;
+  unsigned int * turnNumber = mutexBufferTails + (mutex * NUM_CU);
+
+  __syncthreads();
+  if (isMasterThread) {
+    // atomicInc acts as a store release, need TF to enforce ordering
+    __threadfence();
+    /*
+      HIP currently doesn't generate the correct code for atomicInc's here,
+      so replace with an atomicAdd of 1 and assume no wraparound
+    */
+    //atomicInc(turnNumber, maxTurnNum);
+    atomicAdd(turnNumber, 1);
+  }
+  __syncthreads();
+}
+
+// same algorithm but uses per-CU lock
+inline __device__ void hipMutexFALockLocal(const hipMutex_t mutex,
+                                           const unsigned int cuID,
+                                           unsigned int * mutexBufferHeads,
+                                           unsigned int * mutexBufferTails,
+                                           const int NUM_CU)
+{
+  // local variables
+  const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
+                               hipThreadIdx_z == 0);
+  __shared__ unsigned int myTicketNum;
+  __shared__ bool haveLock;
+  const unsigned int maxTurnNum = 100000000;
+
+  unsigned int * ticketNumber = mutexBufferHeads + ((mutex * NUM_CU) +
+                                                          cuID);
+  unsigned int * turnNumber =
+      (unsigned int *)mutexBufferTails + ((mutex * NUM_CU) + cuID);
+
+  __syncthreads();
+  if (isMasterThread)
+  {
+    myTicketNum = atomicInc(ticketNumber, maxTurnNum);
+    haveLock = false;
+  }
+  __syncthreads();
+  while (!haveLock)
+  {
+    if (isMasterThread)
+    {
+      unsigned int currTicketNum = atomicAdd(turnNumber, 0);
+
+      // it's my turn, I get the lock now
+      if (currTicketNum == myTicketNum) {
+        // above acts as a load acquire, so need TF to enforce ordering locally
+        __threadfence_block();
+        haveLock = true;
+      }
+    }
+    __syncthreads();
+  }
+}
+
+// same algorithm but uses per-CU lock
+inline __device__ void hipMutexFAUnlockLocal(const hipMutex_t mutex,
+                                             const unsigned int cuID,
+                                             unsigned int * mutexBufferTails,
+                                             const int NUM_CU)
+{
+  const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
+                               hipThreadIdx_z == 0);
+  const unsigned int maxTurnNum = 100000000;
+
+  unsigned int * turnNumber = mutexBufferTails + ((mutex * NUM_CU) + cuID);
+
+  __syncthreads();
+  if (isMasterThread) {
+    // atomicInc acts as a store release, need TF to enforce ordering locally
+    __threadfence_block();
+    /*
+      HIP currently doesn't generate the correct code for atomicInc's here,
+      so replace with an atomicAdd of 1 and assume no wraparound
+    */
+    //atomicInc(turnNumber, maxTurnNum);
+    atomicAdd(turnNumber, 1);
+  }
+  __syncthreads();
+}
+
+#endif
diff --git a/src/heterosync/src/hipLocksMutexSleep.h b/src/heterosync/src/hipLocksMutexSleep.h
new file mode 100644
index 0000000..b9a1461
--- /dev/null
+++ b/src/heterosync/src/hipLocksMutexSleep.h
@@ -0,0 +1,221 @@
+#ifndef __HIPLOCKMUTEXSLEEP_H__
+#define __HIPLOCKMUTEXSLEEP_H__
+
+#include "hip/hip_runtime.h"
+#include "hipLocks.h"
+
+inline __host__ hipError_t hipMutexCreateSleep(hipMutex_t * const handle,
+                                               const int mutexNumber)
+{
+  *handle = mutexNumber;
+  return hipSuccess;
+}
+
+/*
+  Instead of constantly pounding an atomic to try and lock the mutex, we simply
+  put ourselves into a ring buffer. Then we check our location in the ring
+  buffer to see if it's been set to 1 -- when it has, it is our turn.  When
+  we're done, unset our location and set the next location to 1.
+
+  locks the mutex. must be called by the entire WG.
+*/
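+/*
+  Typical use from a kernel (sketch; mirrors kernelSleepingMutex in
+  main.hip.cpp):
+
+    loc = hipMutexSleepLock(mutex, ...);      // returns our ring buffer slot
+    ...critical section...
+    hipMutexSleepUnlock(mutex, ..., loc, ...);
+*/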
+__device__ unsigned int hipMutexSleepLock(const hipMutex_t mutex,
+                                          int * mutexBuffers,
+                                          unsigned int * mutexBufferTails,
+                                          const int maxRingBufferSize,
+                                          const int arrayStride,
+                                          const int NUM_CU)
+{
+  __syncthreads();
+
+  // local variables
+  const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
+                               hipThreadIdx_z == 0);
+
+  unsigned int * const ringBufferTailPtr = mutexBufferTails + (mutex * NUM_CU);
+  int * const ringBuffer = mutexBuffers + (mutex * NUM_CU) * arrayStride;
+
+  __shared__ unsigned int myRingBufferLoc;
+  __shared__ bool haveLock;
+  __shared__ int backoff;
+
+  // this is a fire-and-forget atomic.
+  if (isMasterThread)
+  {
+    /*
+      Don't need store release semantics -- the atomicAdd below determines
+      the happens-before ordering here.
+    */
+    /*
+      HIP currently doesn't generate the correct code for atomicInc's,
+      so replace with an atomicAdd of 1 and assume no wraparound
+    */
+    //myRingBufferLoc = atomicInc(ringBufferTailPtr, maxRingBufferSize);
+    myRingBufferLoc = atomicAdd(ringBufferTailPtr, 1);
+
+    haveLock = false; // initially we don't have the lock
+    backoff = 1;
+  }
+  __syncthreads();
+
+  //  Two possibilities
+  //    Mutex is unlocked
+  //    Mutex is locked
+  while (!haveLock)
+  {
+    __syncthreads();
+    if (isMasterThread)
+    {
+      // spin waiting for our location in the ring buffer to == 1.
+      if (atomicAdd(&ringBuffer[myRingBufferLoc], 0) == 1)
+      {
+        // atomicAdd (load) acts as a load acquire, need TF to enforce ordering
+        __threadfence();
+
+        // When our location in the ring buffer == 1, we have the lock
+        haveLock = true;
+      }
+      else
+      {
+        // if we failed in acquiring the lock, wait for a little while before
+        // trying again
+        //sleepFunc(backoff);
+        for (int j = 0; j < backoff; ++j) { ; }
+        // (capped) exponential backoff
+        backoff = (((backoff << 1) + 1) & (MAX_BACKOFF-1));
+      }
+    }
+    __syncthreads();
+  }
+
+  return myRingBufferLoc;
+}
+
+// to unlock, mark our slot in the ring buffer as done and hand the lock to
+// the next slot (conceptually advancing the head pointer).
+__device__ void hipMutexSleepUnlock(const hipMutex_t mutex,
+                                    int * mutexBuffers,
+                                    unsigned int myBufferLoc,
+                                    const int maxRingBufferSize,
+                                    const int arrayStride,
+                                    const int NUM_CU)
+{
+  __syncthreads();
+
+  const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
+                               hipThreadIdx_z == 0);
+  int * ringBuffer = mutexBuffers + (mutex * NUM_CU) * arrayStride;
+  // next location is 0 if we're the last location in the buffer (wraparound)
+  const unsigned int nextBufferLoc = ((myBufferLoc >= maxRingBufferSize) ? 0 :
+                                      myBufferLoc + 1);
+
+  if (isMasterThread)
+  {
+    // set my ring buffer location to -1
+    atomicExch((int *)(ringBuffer + myBufferLoc), -1);
+
+    // set the next location in the ring buffer to 1 so that next WG in line
+    // can get the lock now
+    atomicExch((int *)ringBuffer + nextBufferLoc, 1);
+
+    // atomicExch acts as a store release, need TF to enforce ordering
+    __threadfence();
+  }
+  __syncthreads();
+}
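+
+/*
+  Handoff example (illustrative): a WG that acquired slot 2 writes -1 into
+  ringBuffer[2] and 1 into ringBuffer[3], so the WG that drew ticket 3 sees
+  its slot become 1 and takes the lock next.
+*/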
+
+// same algorithm but uses per-CU lock
+__device__ unsigned int hipMutexSleepLockLocal(const hipMutex_t mutex,
+                                               const unsigned int cuID,
+                                               int * mutexBuffers,
+                                               unsigned int * mutexBufferTails,
+                                               const int maxRingBufferSize,
+                                               const int arrayStride,
+                                               const int NUM_CU)
+{
+  __syncthreads();
+
+  // local variables
+  const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
+                               hipThreadIdx_z == 0);
+  unsigned int * const ringBufferTailPtr = mutexBufferTails + ((mutex * NUM_CU) +
+                                                               cuID);
+  int * const ringBuffer = mutexBuffers +
+    ((mutex * NUM_CU) + cuID) * arrayStride;
+
+  __shared__ unsigned int myRingBufferLoc;
+  __shared__ bool haveLock;
+
+  // this is a fire-and-forget atomic.
+  if (isMasterThread)
+  {
+    /*
+      HIP currently doesn't generate the correct code for atomicInc's here,
+      so replace with an atomicAdd of 1 and assume no wraparound
+    */
+    //myRingBufferLoc = atomicInc(ringBufferTailPtr, maxRingBufferSize);
+    myRingBufferLoc = atomicAdd(ringBufferTailPtr, 1);
+
+    haveLock = false; // initially we don't have the lock
+  }
+  __syncthreads();
+
+  //  Two possibilities
+  //    Mutex is unlocked
+  //    Mutex is locked
+  while (!haveLock)
+  {
+    __syncthreads();
+    if (isMasterThread)
+    {
+      // spin waiting for our location in the ring buffer to == 1.
+      if (atomicAdd(&ringBuffer[myRingBufferLoc], 0) == 1)
+      {
+        // atomicAdd (load) acts as a load acquire, need TF to enforce ordering locally
+        __threadfence_block();
+
+        // When our location in the ring buffer == 1, we have the lock
+        haveLock = true;
+      }
+    }
+    __syncthreads();
+  }
+
+  return myRingBufferLoc;
+}
+
+// to unlock, mark our slot in the ring buffer as done and hand the lock to the
+// next slot (conceptually advancing the head pointer) -- same algorithm but
+// uses per-CU lock.
+__device__ void hipMutexSleepUnlockLocal(const hipMutex_t mutex,
+                                         const unsigned int cuID,
+                                         int * mutexBuffers,
+                                         unsigned int myBufferLoc,
+                                         const int maxRingBufferSize,
+                                         const int arrayStride,
+                                         const int NUM_CU)
+{
+  __syncthreads();
+
+  const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
+                               hipThreadIdx_z == 0);
+  int * ringBuffer = mutexBuffers + ((mutex * NUM_CU) + cuID) * arrayStride;
+  // next location is 0 if we're the last location in the buffer (wraparound)
+  const unsigned int nextBufferLoc = ((myBufferLoc >= maxRingBufferSize) ? 0 :
+                                      myBufferLoc + 1);
+
+  if (isMasterThread)
+  {
+    // set my ring buffer location to -1
+    atomicExch((int *)(ringBuffer + myBufferLoc), -1);
+
+    // set the next location in the ring buffer to 1 so that next WG in line
+    // can get the lock now
+    atomicExch((int *)ringBuffer + nextBufferLoc, 1);
+
+    // atomicExch acts as a store release, need TF to enforce ordering locally
+    __threadfence_block();
+  }
+  __syncthreads();
+}
+
+#endif
diff --git a/src/heterosync/src/hipLocksMutexSpin.h b/src/heterosync/src/hipLocksMutexSpin.h
new file mode 100644
index 0000000..ebcbc34
--- /dev/null
+++ b/src/heterosync/src/hipLocksMutexSpin.h
@@ -0,0 +1,99 @@
+#ifndef __HIPLOCKSMUTEXSPIN_H__
+#define __HIPLOCKSMUTEXSPIN_H__
+
+#include "hip/hip_runtime.h"
+
+inline __host__ hipError_t hipMutexCreateSpin(hipMutex_t * const handle,
+                                              const int mutexNumber)
+{
+  *handle = mutexNumber;
+  return hipSuccess;
+}
+
+// This is the brain dead algorithm. Just spin on an atomic until you get the
+// lock.
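+//
+// State sketch (from the code below): the lock word for mutex m is
+// mutexBufferHeads[m * NUM_CU] (or mutexBufferHeads[m * NUM_CU + cuID] for the
+// per-CU variant); 0 means free, 1 means held, and only the WG's master thread
+// spins on it.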
+__device__ void hipMutexSpinLock(const hipMutex_t mutex,
+                                 unsigned int * mutexBufferHeads,
+                                 const int NUM_CU)
+{
+  __shared__ int done;
+  const bool isMasterThread = ((hipThreadIdx_x == 0) && (hipThreadIdx_y == 0) &&
+                               (hipThreadIdx_z == 0));
+  if (isMasterThread) { done = 0; }
+  __syncthreads();
+
+  while (!done)
+  {
+    __syncthreads();
+    if (isMasterThread)
+    {
+      if (atomicCAS(mutexBufferHeads + (mutex * NUM_CU), 0, 1) == 0) {
+        // atomicCAS acts as a load acquire, need TF to enforce ordering
+        __threadfence();
+        done = 1;
+      }
+    }
+    __syncthreads();
+  }
+}
+
+__device__ void hipMutexSpinUnlock(const hipMutex_t mutex,
+                                   unsigned int * mutexBufferHeads,
+                                   const int NUM_CU)
+{
+  __syncthreads();
+  if (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 && hipThreadIdx_z == 0)
+  {
+    // atomicExch acts as a store release, need TF to enforce ordering
+    __threadfence();
+    atomicExch(mutexBufferHeads + (mutex * NUM_CU), 0);
+  }
+  __syncthreads();
+}
+
+// same algorithm but uses local TF instead because data is local
+__device__ void hipMutexSpinLockLocal(const hipMutex_t mutex,
+                                      const unsigned int cuID,
+                                      unsigned int * mutexBufferHeads,
+                                      const int NUM_CU)
+{
+  __shared__ int done;
+  const bool isMasterThread = ((hipThreadIdx_x == 0) && (hipThreadIdx_y == 0) &&
+                               (hipThreadIdx_z == 0));
+  if (isMasterThread) { done = 0; }
+  __syncthreads();
+
+  while (!done)
+  {
+    __syncthreads();
+    if (isMasterThread)
+    {
+      if (atomicCAS(mutexBufferHeads + ((mutex * NUM_CU) + cuID), 0, 1) == 0)
+      {
+        // atomicCAS acts as a load acquire, need TF to enforce ordering locally
+        __threadfence_block();
+        done = 1;
+      }
+    }
+    __syncthreads();
+  }
+}
+
+// same algorithm but uses local TF instead because data is local
+__device__ void hipMutexSpinUnlockLocal(const hipMutex_t mutex,
+                                        const unsigned int cuID,
+                                        unsigned int * mutexBufferHeads,
+                                        const int NUM_CU)
+{
+  __syncthreads();
+  if ((hipThreadIdx_x == 0) && (hipThreadIdx_y == 0) && (hipThreadIdx_z == 0))
+  {
+    // atomicExch acts as a store release, need TF to enforce ordering locally
+    __threadfence_block();
+    // mutex math allows us to access the appropriate per-CU spin mutex location
+    atomicExch(mutexBufferHeads + ((mutex * NUM_CU) + cuID), 0);
+  }
+  __syncthreads();
+}
+
+#endif
diff --git a/src/heterosync/src/hipLocksSemaphore.h b/src/heterosync/src/hipLocksSemaphore.h
new file mode 100644
index 0000000..8481ab9
--- /dev/null
+++ b/src/heterosync/src/hipLocksSemaphore.h
@@ -0,0 +1,8 @@
+#ifndef __HIPLOCKSSEMAPHORE_H__
+#define __HIPLOCKSSEMAPHORE_H__
+
+#include "hipLocks.h"
+#include "hipLocksSemaphoreSpin.h"
+#include "hipLocksSemaphoreEBO.h"
+
+#endif
diff --git a/src/heterosync/src/hipLocksSemaphoreEBO.h b/src/heterosync/src/hipLocksSemaphoreEBO.h
new file mode 100644
index 0000000..0128de3
--- /dev/null
+++ b/src/heterosync/src/hipLocksSemaphoreEBO.h
@@ -0,0 +1,456 @@
+#ifndef __HIPSEMAPHOREEBO_H__
+#define __HIPSEMAPHOREEBO_H__
+
+#include "hip/hip_runtime.h"
+#include "hipLocks.h"
+
+inline __host__ hipError_t hipSemaphoreCreateEBO(hipSemaphore_t * const handle,
+                                                 const int semaphoreNumber,
+                                                 const unsigned int count,
+                                                 const int NUM_CU)
+{
+  // Here we set the initial value to be count+1; this allows us to do an
+  // atomicExch(sem, 0) and basically use the semaphore value as both a
+  // lock and a semaphore.
+  unsigned int initialValue = (count + 1), zero = 0;
+  *handle = semaphoreNumber;
+  for (int id = 0; id < NUM_CU; ++id) { // need to set these values for all CUs
+    hipMemcpy(&(cpuLockData->semaphoreBuffers[((semaphoreNumber * 4 * NUM_CU) + (id * 4))]), &initialValue, sizeof(initialValue), hipMemcpyHostToDevice);
+    hipMemcpy(&(cpuLockData->semaphoreBuffers[((semaphoreNumber * 4 * NUM_CU) + (id * 4)) + 1]), &zero, sizeof(zero), hipMemcpyHostToDevice);
+    hipMemcpy(&(cpuLockData->semaphoreBuffers[((semaphoreNumber * 4 * NUM_CU) + (id * 4)) + 2]), &zero, sizeof(zero), hipMemcpyHostToDevice);
+    hipMemcpy(&(cpuLockData->semaphoreBuffers[((semaphoreNumber * 4 * NUM_CU) + (id * 4)) + 3]), &initialValue, sizeof(initialValue), hipMemcpyHostToDevice);
+  }
+  return hipSuccess;
+}
+
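+/*
+  Layout example (illustrative): with NUM_CU = 4, semaphore s occupies 16
+  consecutive entries starting at semaphoreBuffers[s * 16], and CU c's four
+  entries begin at offset c * 4: {current count, lock ("head"), writer-waiting
+  flag (the reused tail), max count}.  With count = 3 the count fields start
+  at 4, so up to 3 readers can share the semaphore (each needs count > 1 and
+  decrements it by 1), while a writer needs count == maxSemCount and claims
+  the entire count.
+*/
+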
+inline __device__ bool hipSemaphoreEBOTryWait(const hipSemaphore_t sem,
+                                              const bool isWriter,
+                                              const unsigned int maxSemCount,
+                                              unsigned int * semaphoreBuffers,
+                                              const int NUM_CU)
+{
+  const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
+                               hipThreadIdx_z == 0);
+  /*
+    Each sem has NUM_CU * 4 locations in the buffer.  Of these locations, each
+    CU uses 4 of them (current count, head, tail, max count).  For the global
+    semaphore all CUs use semaphoreBuffers[sem * 4 * NUM_CU].
+  */
+  unsigned int * const currCount = semaphoreBuffers + (sem * 4 * NUM_CU);
+  unsigned int * const lock = currCount + 1;
+  /*
+    Reuse the tail for the "writers are waiting" flag since tail is unused.
+
+    For now just use to indicate that at least 1 writer is waiting instead of
+    a count to make sure that readers aren't totally starved out until all the
+    writers are done.
+  */
+  unsigned int * const writerWaiting = currCount + 2;
+  __shared__ bool acq1, acq2;
+
+  __syncthreads();
+  if (isMasterThread)
+  {
+    acq1 = false;
+    // try to acquire the sem head "lock"
+    if (atomicCAS(lock, 0, 1) == 0) {
+      // atomicCAS acts as a load acquire, need TF to enforce ordering
+      __threadfence();
+      acq1 = true;
+    }
+  }
+  __syncthreads();
+
+  if (!acq1) { return false; } // return if we couldn't acquire the lock
+  if (isMasterThread)
+  {
+    acq2 = false;
+    /*
+      NOTE: currCount is only accessed by 1 WG at a time and has a lock around
+      it, so we can safely access it as a regular data access instead of with
+      atomics.
+    */
+    unsigned int currSemCount = currCount[0];
+
+    if (isWriter) {
+      // writer needs the count to be == maxSemCount to enter the critical
+      // section (otherwise there are readers in the critical section)
+      if (currSemCount == maxSemCount) { acq2 = true; }
+    } else {
+      // if there is a writer waiting, readers aren't allowed to enter the
+      // critical section
+      if (writerWaiting[0] == 0) {
+        // readers need count > 1 to enter critical section (otherwise semaphore
+        // is full)
+        if (currSemCount > 1) { acq2 = true; }
+      }
+    }
+  }
+  __syncthreads();
+
+  if (!acq2) // release the sem head "lock" since the semaphore was full
+  {
+    // writers set a flag to note that they are waiting so more readers don't
+    // join after the writer started waiting
+    if (isWriter) { writerWaiting[0] = 1; /* if already 1, just reset to 1 */ }
+
+    if (isMasterThread) {
+      // atomicExch acts as a store release, need TF to enforce ordering
+      __threadfence();
+      atomicExch(lock, 0);
+    }
+    __syncthreads();
+    return false;
+  }
+  __syncthreads();
+
+  if (isMasterThread) {
+    /*
+      NOTE: currCount is only accessed by 1 WG at a time and has a lock around
+      it, so we can safely access it as a regular data access instead of with
+      atomics.
+    */
+    if (isWriter) {
+      /*
+        writer decrements the current count of the semaphore by the max to
+        ensure that no one else can enter the critical section while it's
+        writing.
+      */
+      currCount[0] -= maxSemCount;
+
+      // writers also need to unset the "writer is waiting" flag
+      writerWaiting[0] = 0;
+    } else {
+      /*
+        readers decrement the current count of the semaphore by 1 so other
+        readers can also read the data (but not the writers since they need
+        the entire CS).
+      */
+      --currCount[0]; //atomicSub(currCount, 1);
+    }
+
+    // atomicExch acts as a store release, need TF to enforce ordering
+    __threadfence();
+    // now that we've updated the semaphore count can release the lock
+    atomicExch(lock, 0);
+  }
+  __syncthreads();
+
+  return true;
+}
+
+inline __device__ void hipSemaphoreEBOWait(const hipSemaphore_t sem,
+                                           const bool isWriter,
+                                           const unsigned int maxSemCount,
+                                           unsigned int * semaphoreBuffers,
+                                           const int NUM_CU)
+{
+  __shared__ int backoff;
+  const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
+                               hipThreadIdx_z == 0);
+
+  if (isMasterThread)
+  {
+    backoff = 1;
+  }
+  __syncthreads();
+
+  while (!hipSemaphoreEBOTryWait(sem, isWriter, maxSemCount, semaphoreBuffers, NUM_CU))
+  {
+    __syncthreads();
+    if (isMasterThread)
+    {
+      // if we failed to enter the semaphore, wait for a little while before
+      // trying again
+      //sleepFunc(backoff);
+      for (int j = 0; j < backoff; ++j) { ; }
+      /*
+        for writers, increase the backoff a lot because failing means readers
+        are currently in the CS -- most important for non-unique because all
+        WGs on all CUs are going for the same semaphore.
+      */
+      if (isWriter) {
+        // (capped) exponential backoff
+        backoff = (((backoff << 1) + 1) & (MAX_BACKOFF-1));
+      }
+      else { backoff += 5; /* small, linear backoff increase for readers */ }
+    }
+    __syncthreads();
+  }
+  __syncthreads();
+}
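+
+/*
+  Backoff sketch (illustrative): with MAX_BACKOFF = 1024 (defined in
+  main.hip.cpp), a waiting writer's backoff grows 1 -> 3 -> 7 -> ... -> 1023
+  and then stays at 1023 because of the (MAX_BACKOFF-1) mask, while readers
+  only add 5 per failed attempt.
+*/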
+
+inline __device__ void hipSemaphoreEBOPost(const hipSemaphore_t sem,
+                                           const bool isWriter,
+                                           const unsigned int maxSemCount,
+                                           unsigned int * semaphoreBuffers,
+                                           const int NUM_CU)
+{
+  const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
+                               hipThreadIdx_z == 0);
+  /*
+    Each sem has NUM_CU * 4 locations in the buffer.  Of these locations, each
+    CU uses 4 of them (current count, head, tail, max count).  For the global
+    semaphore use semaphoreBuffers[sem * 4 * NUM_CU].
+  */
+  unsigned int * const currCount = semaphoreBuffers + (sem * 4 * NUM_CU);
+  unsigned int * const lock = currCount + 1;
+  __shared__ bool acquired;
+
+  if (isMasterThread) { acquired = false; }
+  __syncthreads();
+
+  while (!acquired)
+  {
+    __syncthreads();
+    if (isMasterThread)
+    {
+      // try to acquire sem head "lock"
+      if (atomicCAS(lock, 0, 1) == 0) {
+        // atomicCAS acts as a load acquire, need TF to enforce ordering
+        __threadfence();
+        acquired = true;
+      } else {
+        acquired = false;
+      }
+    }
+    __syncthreads();
+  }
+
+  if (isMasterThread) {
+    /*
+      NOTE: currCount is only accessed by 1 WG at a time and has a lock around
+      it, so we can safely access it as a regular data access instead of with
+      atomics.
+    */
+    if (isWriter) {
+      // writers add the max value to the semaphore to allow the readers to
+      // start accessing the critical section.
+      currCount[0] += maxSemCount;
+    } else {
+      ++currCount[0]; // readers add 1 to the semaphore
+    }
+
+    // atomicExch acts as a store release, need TF to enforce ordering
+    __threadfence();
+    // now that we've updated the semaphore count can release the lock
+    atomicExch(lock, 0);
+  }
+  __syncthreads();
+}
+
+// same wait algorithm but with local scope and per-CU synchronization
+inline __device__ bool hipSemaphoreEBOTryWaitLocal(const hipSemaphore_t sem,
+                                                   const unsigned int cuID,
+                                                   const bool isWriter,
+                                                   const unsigned int maxSemCount,
+                                                   unsigned int * semaphoreBuffers,
+                                                   const int NUM_CU)
+{
+  const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
+                               hipThreadIdx_z == 0);
+  /*
+    Each sem has NUM_CU * 4 locations in the buffer.  Of these locations, each
+    CU gets 4 of them (current count, head, tail, max count).  So CU 0 starts
+    at semaphoreBuffers[sem * 4 * NUM_CU].
+  */
+  unsigned int * const currCount = semaphoreBuffers + ((sem * 4 * NUM_CU) +
+                                                       (cuID * 4));
+  unsigned int * const lock = currCount + 1;
+  /*
+    Reuse the tail for the "writers are waiting" flag since tail is unused.
+
+    For now just use to indicate that at least 1 writer is waiting instead of
+    a count to make sure that readers aren't totally starved out until all the
+    writers are done.
+  */
+  unsigned int * const writerWaiting = currCount + 2;
+  __shared__ bool acq1, acq2;
+
+  __syncthreads();
+  if (isMasterThread)
+  {
+    acq1 = false;
+    // try to acquire the sem head "lock"
+    if (atomicCAS(lock, 0, 1) == 0) {
+      // atomicCAS acts as a load acquire, need TF to enforce ordering locally
+      __threadfence_block();
+      acq1 = true;
+    }
+  }
+  __syncthreads();
+
+  if (!acq1) { return false; } // return if we couldn't acquire the lock
+  if (isMasterThread)
+  {
+    acq2 = false;
+    /*
+      NOTE: currCount is only accessed by 1 WG at a time and has a lock around
+      it, so we can safely access it as a regular data access instead of with
+      atomics.
+    */
+    unsigned int currSemCount = currCount[0];
+
+    if (isWriter) {
+      // writer needs the count to be == maxSemCount to enter the critical
+      // section (otherwise there are readers in the critical section)
+      if (currSemCount == maxSemCount) { acq2 = true; }
+    } else {
+      // if there is a writer waiting, readers aren't allowed to enter the
+      // critical section
+      if (writerWaiting[0] == 0) {
+        // readers need count > 1 to enter critical section (otherwise semaphore
+        // is full)
+        if (currSemCount > 1) { acq2 = true; }
+      }
+    }
+  }
+  __syncthreads();
+
+  if (!acq2) // release the sem head "lock" since the semaphore was full
+  {
+    // writers set a flag to note that they are waiting so more readers don't
+    // join after the writer started waiting
+    if (isWriter) { writerWaiting[0] = 1; /* if already 1, just reset to 1 */ }
+
+    if (isMasterThread) {
+      // atomicExch acts as a store release, need TF to enforce ordering locally
+      __threadfence_block();
+      atomicExch(lock, 0);
+    }
+    __syncthreads();
+    return false;
+  }
+  __syncthreads();
+
+  if (isMasterThread) {
+    /*
+      NOTE: currCount is only accessed by 1 WG at a time and has a lock around
+      it, so we can safely access it as a regular data access instead of with
+      atomics.
+    */
+    if (isWriter) {
+      /*
+        writer decrements the current count of the semaphore by the max to
+        ensure that no one else can enter the critical section while it's
+        writing.
+      */
+      currCount[0] -= maxSemCount;
+
+      // writers also need to unset the "writer is waiting" flag
+      writerWaiting[0] = 0;
+    } else {
+      /*
+        readers decrement the current count of the semaphore by 1 so other
+        readers can also read the data (but not the writers since they need
+        the entire CS).
+      */
+      --currCount[0];
+    }
+
+    // atomicExch acts as a store release, need TF to enforce ordering locally
+    __threadfence_block();
+    // now that we've updated the semaphore count can release the lock
+    atomicExch(lock, 0);
+  }
+  __syncthreads();
+
+  return true;
+}
+
+// same algorithm but with local scope
+inline __device__ void hipSemaphoreEBOWaitLocal(const hipSemaphore_t sem,
+                                                const unsigned int cuID,
+                                                const bool isWriter,
+                                                const unsigned int maxSemCount,
+                                                unsigned int * semaphoreBuffers,
+                                                const int NUM_CU)
+{
+  __shared__ int backoff;
+  const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
+                               hipThreadIdx_z == 0);
+
+  if (isMasterThread)
+  {
+    backoff = 1;
+  }
+  __syncthreads();
+
+  while (!hipSemaphoreEBOTryWaitLocal(sem, cuID, isWriter, maxSemCount, semaphoreBuffers, NUM_CU))
+  {
+    __syncthreads();
+    if (isMasterThread)
+    {
+      /*
+        if we failed to enter the semaphore, wait for a little while before
+        trying again
+      */
+      //sleepFunc(backoff);
+      for (int j = 0; j < backoff; ++j) { ; }
+      // (capped) exponential backoff
+      backoff = (((backoff << 1) + 1) & (MAX_BACKOFF-1));
+    }
+    __syncthreads();
+  }
+  __syncthreads();
+}
+
+inline __device__ void hipSemaphoreEBOPostLocal(const hipSemaphore_t sem,
+                                                const unsigned int cuID,
+                                                const bool isWriter,
+                                                const unsigned int maxSemCount,
+                                                unsigned int * semaphoreBuffers,
+                                                const int NUM_CU)
+{
+  const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
+                               hipThreadIdx_z == 0);
+  // Each sem has NUM_CU * 4 locations in the buffer.  Of these locations, each
+  // CU gets 4 of them.  So CU 0 starts at semaphoreBuffers[sem * 4 * NUM_CU].
+  unsigned int * const currCount = semaphoreBuffers + ((sem * 4 * NUM_CU) +
+                                                       (cuID * 4));
+  unsigned int * const lock = currCount + 1;
+  __shared__ bool acquired;
+
+  if (isMasterThread) { acquired = false; }
+  __syncthreads();
+
+  while (!acquired)
+  {
+    __syncthreads();
+    if (isMasterThread)
+    {
+      // try to acquire sem head "lock"
+      if (atomicCAS(lock, 0, 1) == 0) {
+        // atomicCAS acts as a load acquire, need TF to enforce ordering locally
+        __threadfence_block();
+        acquired = true;
+      } else {
+        acquired = false;
+      }
+    }
+    __syncthreads();
+  }
+
+  if (isMasterThread) {
+    /*
+      NOTE: currCount is only accessed by 1 WG at a time and has a lock around
+      it, so we can safely access it as a regular data access instead of with
+      atomics.
+    */
+    if (isWriter) {
+      // writers add the max value to the semaphore to allow the readers to
+      // start accessing the critical section.
+      currCount[0] += maxSemCount;
+    } else {
+      ++currCount[0]; // readers add 1 to the semaphore
+    }
+
+    // atomicExch acts as a store release, need TF to enforce ordering locally
+    __threadfence_block();
+    // now that we've updated the semaphore count can release the lock
+    atomicExch(lock, 0);
+  }
+  __syncthreads();
+}
+
+#endif // #ifndef __HIPSEMAPHOREEBO_H__
diff --git a/src/heterosync/src/hipLocksSemaphoreSpin.h b/src/heterosync/src/hipLocksSemaphoreSpin.h
new file mode 100644
index 0000000..dbe4b90
--- /dev/null
+++ b/src/heterosync/src/hipLocksSemaphoreSpin.h
@@ -0,0 +1,410 @@
+#ifndef __HIPLOCKSSEMAPHORESPIN_H__
+#define __HIPLOCKSSEMAPHORESPIN_H__
+
+#include "hip/hip_runtime.h"
+#include "hipLocks.h"
+
+inline __host__ hipError_t hipSemaphoreCreateSpin(hipSemaphore_t * const handle,
+                                                  const int semaphoreNumber,
+                                                  const unsigned int count,
+                                                  const int NUM_CU)
+{
+  // Here we set the initial value to be count+1; this allows us to do an
+  // atomicExch(sem, 0) and basically use the semaphore value as both a
+  // lock and a semaphore.
+  unsigned int initialValue = (count + 1), zero = 0;
+  *handle = semaphoreNumber;
+  for (int id = 0; id < NUM_CU; ++id) { // need to set these values for all CUs
+    hipMemcpy(&(cpuLockData->semaphoreBuffers[((semaphoreNumber * 4 * NUM_CU) + (id * 4))]), &initialValue, sizeof(initialValue), hipMemcpyHostToDevice);
+    hipMemcpy(&(cpuLockData->semaphoreBuffers[((semaphoreNumber * 4 * NUM_CU) + (id * 4)) + 1]), &zero, sizeof(zero), hipMemcpyHostToDevice);
+    hipMemcpy(&(cpuLockData->semaphoreBuffers[((semaphoreNumber * 4 * NUM_CU) + (id * 4)) + 2]), &zero, sizeof(zero), hipMemcpyHostToDevice);
+    hipMemcpy(&(cpuLockData->semaphoreBuffers[((semaphoreNumber * 4 * NUM_CU) + (id * 4)) + 3]), &initialValue, sizeof(initialValue), hipMemcpyHostToDevice);
+  }
+  return hipSuccess;
+}
+
+inline __device__ bool hipSemaphoreSpinTryWait(const hipSemaphore_t sem,
+                                               const bool isWriter,
+                                               const unsigned int maxSemCount,
+                                               unsigned int * semaphoreBuffers,
+                                               const int NUM_CU)
+{
+  const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
+                               hipThreadIdx_z == 0);
+  /*
+    Each sem has NUM_CU * 4 locations in the buffer.  Of these locations, each
+    CU uses 4 of them (current count, head, tail, max count).  For the global
+    semaphore all CUs use semaphoreBuffers[sem * 4 * NUM_CU].
+  */
+  unsigned int * const currCount = semaphoreBuffers + (sem * 4 * NUM_CU);
+  unsigned int * const lock = currCount + 1;
+  /*
+    Reuse the tail for the "writers are waiting" flag since tail is unused.
+
+    For now just use to indicate that at least 1 writer is waiting instead of
+    a count to make sure that readers aren't totally starved out until all the
+    writers are done.
+  */
+  unsigned int * const writerWaiting = currCount + 2;
+  __shared__ bool acq1, acq2;
+
+  __syncthreads();
+  if (isMasterThread)
+  {
+    acq1 = false;
+    // try to acquire the sem head "lock"
+    if (atomicCAS(lock, 0, 1) == 0) {
+      // atomicCAS acts as a load acquire, need TF to enforce ordering
+      __threadfence();
+      acq1 = true;
+    }
+  }
+  __syncthreads();
+
+  if (!acq1) { return false; } // return if we couldn't acquire the lock
+  if (isMasterThread)
+  {
+    acq2 = false;
+    /*
+      NOTE: currCount is only accessed by 1 WG at a time and has a lock around
+      it, so we can safely access it as a regular data access instead of with
+      atomics.
+    */
+    unsigned int currSemCount = currCount[0];
+
+    if (isWriter) {
+      // writer needs the count to be == maxSemCount to enter the critical
+      // section (otherwise there are readers in the critical section)
+      if (currSemCount == maxSemCount) { acq2 = true; }
+    } else {
+      // if there is a writer waiting, readers aren't allowed to enter the
+      // critical section
+      if (writerWaiting[0] == 0) {
+        // readers need count > 1 to enter critical section (otherwise semaphore
+        // is full)
+        if (currSemCount > 1) { acq2 = true; }
+      }
+    }
+  }
+  __syncthreads();
+
+  if (!acq2) // release the sem head "lock" since the semaphore was full
+  {
+    // writers set a flag to note that they are waiting so more readers don't
+    // join after the writer started waiting
+    if (isWriter) { writerWaiting[0] = 1; /* if already 1, just reset to 1 */ }
+
+    if (isMasterThread) {
+      // atomicExch acts as a store release, need TF to enforce ordering
+      __threadfence();
+      atomicExch(lock, 0);
+    }
+    __syncthreads();
+    return false;
+  }
+  __syncthreads();
+
+  if (isMasterThread) {
+    /*
+      NOTE: currCount is only accessed by 1 WG at a time and has a lock around
+      it, so we can safely access it as a regular data access instead of with
+      atomics.
+    */
+    if (isWriter) {
+      /*
+        writer decrements the current count of the semaphore by the max to
+        ensure that no one else can enter the critical section while it's
+        writing.
+      */
+      currCount[0] -= maxSemCount;
+
+      // writers also need to unset the "writer is waiting" flag
+      writerWaiting[0] = 0;
+    } else {
+      // readers decrement the current count of the semaphore by 1 so other
+      // readers can also read the data (but not the writers since they need
+      // the entire CS).
+      --currCount[0];
+    }
+
+    // atomicExch acts as a store release, need TF to enforce ordering
+    __threadfence();
+    // now that we've updated the semaphore count can release the lock
+    atomicExch(lock, 0);
+  }
+  __syncthreads();
+
+  return true;
+}
+
+inline __device__ void hipSemaphoreSpinWait(const hipSemaphore_t sem,
+                                            const bool isWriter,
+                                            const unsigned int maxSemCount,
+                                            unsigned int * semaphoreBuffers,
+                                            const int NUM_CU)
+{
+  while (!hipSemaphoreSpinTryWait(sem, isWriter, maxSemCount, semaphoreBuffers, NUM_CU))
+  {
+    __syncthreads();
+  }
+}
+
+inline __device__ void hipSemaphoreSpinPost(const hipSemaphore_t sem,
+                                            const bool isWriter,
+                                            const unsigned int maxSemCount,
+                                            unsigned int * semaphoreBuffers,
+                                            const int NUM_CU)
+{
+  const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
+                               hipThreadIdx_z == 0);
+  /*
+    Each sem has NUM_CU * 4 locations in the buffer.  Of these locations, each
+    CU uses 4 of them (current count, head, tail, max count).  For the global
+    semaphore use semaphoreBuffers[sem * 4 * NUM_CU].
+  */
+  unsigned int * const currCount = semaphoreBuffers + (sem * 4 * NUM_CU);
+  unsigned int * const lock = currCount + 1;
+  __shared__ bool acquired;
+
+  if (isMasterThread) { acquired = false; }
+  __syncthreads();
+
+  while (!acquired)
+  {
+    if (isMasterThread)
+    {
+      /*
+        NOTE: This CAS will trigger an invalidation since we overload CAS's.
+        Since most of the data in the local critical section is written, it
+        hopefully won't affect performance too much.
+      */
+      // try to acquire sem head lock
+      if (atomicCAS(lock, 0, 1) == 0) {
+        // atomicCAS acts as a load acquire, need TF to enforce ordering
+        __threadfence();
+        acquired = true;
+      } else {
+        acquired = false;
+      }
+    }
+    __syncthreads();
+  }
+  __syncthreads();
+
+  if (isMasterThread) {
+    /*
+      NOTE: currCount is only accessed by 1 WG at a time and has a lock around
+      it, so we can safely access it as a regular data access instead of with
+      atomics.
+    */
+    if (isWriter) {
+      // writers add the max value to the semaphore to allow the readers to
+      // start accessing the critical section.
+      currCount[0] += maxSemCount;
+    } else {
+      // readers add 1 to the semaphore
+      ++currCount[0];
+    }
+
+    // atomicExch acts as a store release, need TF to enforce ordering
+    __threadfence();
+    // now that we've updated the semaphore count can release the lock
+    atomicExch(lock, 0);
+  }
+  __syncthreads();
+}
+
+// same wait algorithm but with local scope and per-CU synchronization
+inline __device__ bool hipSemaphoreSpinTryWaitLocal(const hipSemaphore_t sem,
+                                                    const unsigned int cuID,
+                                                    const bool isWriter,
+                                                    const unsigned int maxSemCount,
+                                                    unsigned int * semaphoreBuffers,
+                                                    const int NUM_CU)
+{
+  const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
+                               hipThreadIdx_z == 0);
+  // Each sem has NUM_CU * 4 locations in the buffer.  Of these locations, each
+  // CU gets 4 of them (current count, head, tail, max count).  So CU 0 starts
+  // at semaphoreBuffers[sem * 4 * NUM_CU].
+  unsigned int * const currCount = semaphoreBuffers +
+                                       ((sem * 4 * NUM_CU) + (cuID * 4));
+  unsigned int * const lock = currCount + 1;
+  /*
+    Reuse the tail for the "writers are waiting" flag since tail is unused.
+
+    For now just use to indicate that at least 1 writer is waiting instead of
+    a count to make sure that readers aren't totally starved out until all the
+    writers are done.
+  */
+  unsigned int * const writerWaiting = currCount + 2;
+  __shared__ bool acq1, acq2;
+
+  __syncthreads();
+  if (isMasterThread)
+  {
+    acq1 = false;
+    // try to acquire the sem head "lock"
+    if (atomicCAS(lock, 0, 1) == 0) {
+      // atomicCAS acts as a load acquire, need TF to enforce ordering locally
+      __threadfence_block();
+      acq1 = true;
+    }
+  }
+  __syncthreads();
+
+  if (!acq1) { return false; } // return if we couldn't acquire the lock
+  if (isMasterThread)
+  {
+    acq2 = false;
+    /*
+      NOTE: currCount is only accessed by 1 WG at a time and has a lock around
+      it, so we can safely access it as a regular data access instead of with
+      atomics.
+    */
+    unsigned int currSemCount = currCount[0];
+
+    if (isWriter) {
+      // writer needs the count to be == maxSemCount to enter the critical
+      // section (otherwise there are readers in the critical section)
+      if (currSemCount == maxSemCount) { acq2 = true; }
+    } else {
+      // if there is a writer waiting, readers aren't allowed to enter the
+      // critical section
+      if (writerWaiting[0] == 0) {
+        // readers need count > 1 to enter critical section (otherwise semaphore
+        // is full)
+        if (currSemCount > 1) { acq2 = true; }
+      }
+    }
+  }
+  __syncthreads();
+
+  if (!acq2) // release the sem head "lock" since the semaphore was full
+  {
+    // writers set a flag to note that they are waiting so more readers don't
+    // join after the writer started waiting
+    if (isWriter) { writerWaiting[0] = 1; /* if already 1, just reset to 1 */ }
+
+    if (isMasterThread) {
+      // atomicExch acts as a store release, need TF to enforce ordering locally
+      __threadfence_block();
+      atomicExch(lock, 0);
+    }
+    __syncthreads();
+    return false;
+  }
+  __syncthreads();
+
+  if (isMasterThread) {
+    /*
+      NOTE: currCount is only accessed by 1 WG at a time and has a lock around
+      it, so we can safely access it as a regular data access instead of with
+      atomics.
+     */
+    if (isWriter) {
+      /*
+        writer decrements the current count of the semaphore by the max to
+        ensure that no one else can enter the critical section while it's
+        writing.
+      */
+      currCount[0] -= maxSemCount;
+
+      // writers also need to unset the "writer is waiting" flag
+      writerWaiting[0] = 0;
+    } else {
+      /*
+        readers decrement the current count of the semaphore by 1 so other
+        readers can also read the data (but not the writers since they need
+        the entire CS).
+      */
+      --currCount[0];
+    }
+
+    // atomicExch acts as a store release, need TF to enforce ordering locally
+    __threadfence_block();
+    // now that we've updated the semaphore count can release the lock
+    atomicExch(lock, 0);
+  }
+  __syncthreads();
+
+  return true;
+}
+
+inline __device__ void hipSemaphoreSpinWaitLocal(const hipSemaphore_t sem,
+                                                 const unsigned int cuID,
+                                                 const bool isWriter,
+                                                 const unsigned int maxSemCount,
+                                                 unsigned int * semaphoreBuffers,
+                                                 const int NUM_CU)
+{
+  while (!hipSemaphoreSpinTryWaitLocal(sem, cuID, isWriter, maxSemCount, semaphoreBuffers, NUM_CU))
+  {
+    __syncthreads();
+  }
+}
+
+inline __device__ void hipSemaphoreSpinPostLocal(const hipSemaphore_t sem,
+                                                 const unsigned int cuID,
+                                                 const bool isWriter,
+                                                 const unsigned int maxSemCount,
+                                                 unsigned int * semaphoreBuffers,
+                                                 const int NUM_CU)
+{
+  const bool isMasterThread = (hipThreadIdx_x == 0 && hipThreadIdx_y == 0 &&
+                               hipThreadIdx_z == 0);
+  // Each sem has NUM_CU * 4 locations in the buffer.  Of these locations, each
+  // CU gets 4 of them.  So CU 0 starts at semaphoreBuffers[sem * 4 * NUM_CU].
+  unsigned int * const currCount = semaphoreBuffers +
+                                       ((sem * 4 * NUM_CU) + (cuID * 4));
+  unsigned int * const lock = currCount + 1;
+  __shared__ bool acquired;
+
+  if (isMasterThread) { acquired = false; }
+  __syncthreads();
+
+  while (!acquired)
+  {
+    if (isMasterThread)
+    {
+      /*
+        NOTE: This CAS will trigger an invalidation since we overload CAS's.
+        Since most of the data in the local critical section is written, it
+        hopefully won't affect performance too much.
+      */
+      // try to acquire sem head lock
+      if (atomicCAS(lock, 0, 1) == 0) {
+        // atomicCAS acts as a load acquire, need TF to enforce ordering locally
+        __threadfence_block();
+        acquired = true;
+      } else {
+        acquired = false;
+      }
+    }
+    __syncthreads();
+  }
+  __syncthreads();
+
+  if (isMasterThread) {
+    /*
+      NOTE: currCount is only accessed by 1 WG at a time and has a lock around
+      it, so we can safely access it as a regular data access instead of with
+      atomics.
+    */
+    if (isWriter) {
+      // writers add the max value to the semaphore to allow the readers to
+      // start accessing the critical section.
+      currCount[0] += maxSemCount;
+    } else {
+      // readers add 1 to the semaphore
+      ++currCount[0];
+    }
+
+    // atomicExch acts as a store release, need TF to enforce ordering locally
+    __threadfence_block();
+    // now that we've updated the semaphore count can release the lock
+    atomicExch(lock, 0);
+  }
+  __syncthreads();
+}
+
+#endif // #ifndef __HIPLOCKSSEMAPHORESPIN_H__
diff --git a/src/heterosync/src/hip_error.h b/src/heterosync/src/hip_error.h
new file mode 100644
index 0000000..df63dd0
--- /dev/null
+++ b/src/heterosync/src/hip_error.h
@@ -0,0 +1,14 @@
+#ifndef HIP_CHECK_ERROR
+#define HIP_CHECK_ERROR
+
+void inline checkError(hipError_t hipErr, const char * functWithError)
+{
+  if ( hipErr != hipSuccess )
+  {
+    fprintf(stderr, "ERROR %s - %s\n", functWithError,
+            hipGetErrorString(hipErr));
+    exit(-1);
+  }
+}
+
+#endif
diff --git a/src/heterosync/src/main.hip.cpp b/src/heterosync/src/main.hip.cpp
new file mode 100644
index 0000000..e4a2d0f
--- /dev/null
+++ b/src/heterosync/src/main.hip.cpp
@@ -0,0 +1,1786 @@
+#include <cstdio>
+#include <string>
+#include <assert.h>
+#include <math.h>
+#include "hip/hip_runtime.h"
+#include "hip_error.h"
+
+#define MAX_BACKOFF             1024
+#define NUM_WIS_PER_WG 64
+#define MAD_MUL 1.1f
+#define MAD_ADD 0.25f
+#define NUM_WORDS_PER_CACHELINE 16
+#define NUM_WIS_PER_QUARTERWAVE 16
+
+// separate .h files
+#include "hipLocksBarrier.h"
+#include "hipLocksImpl.h"
+#include "hipLocksMutex.h"
+#include "hipLocksSemaphore.h"
+// program globals
+const int NUM_REPEATS = 10;
+int NUM_LDST = 0;
+int numWGs = 0;
+// number of CUs our GPU has
+int NUM_CU = 0;
+int MAX_WGS = 0;
+
+bool pageAlign = false;
+
+/*
+  Helper function to do data accesses for golden checking code.
+*/
+void accessData_golden(float * storageGolden, int currLoc, int numStorageLocs)
+{
+  /*
+    If this location isn't the first location accessed by a thread, update
+    it -- each quarter-wavefront (NUM_WIS_PER_QUARTERWAVE work-items) accesses
+    (NUM_LDST + 1) cache lines.
+  */
+  if (currLoc % (NUM_WIS_PER_QUARTERWAVE * (NUM_LDST + 1)) >=
+      NUM_WORDS_PER_CACHELINE)
+  {
+    assert((currLoc - NUM_WORDS_PER_CACHELINE) >= 0);
+    assert(currLoc < numStorageLocs);
+    // each location is dependent on the location accessed at the
+    // same word on the previous cache line
+    storageGolden[currLoc] =
+      ((storageGolden[currLoc -
+                      NUM_WORDS_PER_CACHELINE]/* * MAD_MUL*/) /*+ MAD_ADD*/);
+  }
+}
+
+/*
+  Shared function that does the critical section data accesses for the barrier
+  and mutex kernels.
+
+  NOTE: kernels that access data differently (e.g., semaphores) should not call
+  this function.
+*/
+inline __device__ void accessData(float * storage, int threadBaseLoc,
+                                  int threadOffset, int NUM_LDST)
+{
+  // local variables
+  int readLoc = 0, writeLoc = 0;
+
+  for (int n = NUM_LDST-1; n >= 0; --n) {
+    writeLoc = ((threadBaseLoc + n + 1) * NUM_WORDS_PER_CACHELINE) +
+               threadOffset;
+    readLoc = ((threadBaseLoc + n) * NUM_WORDS_PER_CACHELINE) + threadOffset;
+    storage[writeLoc] = ((storage[readLoc]/* * MAD_MUL*/) /*+ MAD_ADD*/);
+  }
+}
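+
+/*
+  Access-pattern example (illustrative, using the threadBaseLoc/threadOffset
+  computed in the kernels below): with NUM_LDST = 2, a thread with tid = 20
+  gets threadBaseLoc = 3 and threadOffset = 4, so it copies word 68 into word
+  84 and then word 52 into word 68 -- each cache line it touches depends on
+  the same word of the previous cache line.
+*/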
+
+/*
+  Helper function for semaphore writers.  Although semaphore writers iterate
+  over all locations accessed on a given CU, the logic for this varies and is
+  done outside this helper, so it can simply call accessData().
+*/
+inline __device__ void accessData_semWr(float * storage, int threadBaseLoc,
+                                        int threadOffset, int NUM_LDST)
+{
+  accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
+}
+
+
+/*
+  Helper function for semaphore readers.
+*/
+inline __device__ void accessData_semRd(float * storage,
+                                        volatile float * dummyArray,
+                                        int threadBaseLoc,
+                                        int threadOffset, int NUM_LDST)
+{
+  for (int n = NUM_LDST-1; n >= 0; --n) {
+    dummyArray[hipThreadIdx_x] +=
+      storage[((threadBaseLoc + n) * NUM_WORDS_PER_CACHELINE) +
+              threadOffset];
+    __syncthreads();
+  }
+}
+
+// performs a tree barrier.  Each WG on a CU accesses unique data, then joins a
+// local barrier.  1 of the WGs from each CU then joins the global barrier
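+// Example mapping (illustrative): with hipGridDim_x = 8 WGs and NUM_CU = 4,
+// numWGsAtBarr = 4, WG 5 gets cuID = 1 and perCU_WGID = 1, and numWGs_perCU =
+// 2, so WGs 1 and 5 synchronize locally on CU 1 before one of them represents
+// the CU at the global barrier.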
+__global__ void kernelAtomicTreeBarrierUniq(float * storage,
+                                            hipLockData_t * gpuLockData,
+                                            unsigned int * perCUBarrierBuffers,
+                                            const int ITERATIONS,
+                                            const int NUM_LDST,
+                                            const int NUM_CU,
+                                            const int MAX_WGS)
+{
+  // local variables
+  // thread 0 is master thread
+  const bool isMasterThread = ((hipThreadIdx_x == 0) && (hipThreadIdx_y == 0) &&
+                               (hipThreadIdx_z == 0));
+  // represents the number of WGs going to the barrier (max NUM_CU, hipGridDim_x if
+  // fewer WGs than CUs).
+  const unsigned int numWGsAtBarr = ((hipGridDim_x < NUM_CU) ? hipGridDim_x :
+                                        NUM_CU);
+  const int cuID = (hipBlockIdx_x % numWGsAtBarr); // mod by # CUs to get CU ID
+  // all work groups on the same CU access unique locations because the
+  // barrier can't ensure DRF between WGs
+  int currWGID = hipBlockIdx_x;
+  int tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
+  // want striding to happen across cache lines so that each thread in a
+  // half-warp accesses sequential words
+  int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
+  int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
+  // determine if I'm WG 0 on my CU
+  const int perCU_WGID = (hipBlockIdx_x / numWGsAtBarr);
+  // given the hipGridDim_x, we can figure out how many WGs are on our CU -- assume
+  // all CUs have an identical number of WGs
+  int numWGs_perCU = (int)ceil((float)hipGridDim_x / numWGsAtBarr);
+
+  for (int i = 0; i < ITERATIONS; ++i)
+  {
+    accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
+
+    joinBarrier_helper(gpuLockData->barrierBuffers, perCUBarrierBuffers,
+                       numWGsAtBarr, cuID, perCU_WGID, numWGs_perCU,
+                       isMasterThread, MAX_WGS);
+
+    // get new thread ID by trading amongst WGs -- + 1 WG ID to shift to the
+    // next CU's data
+    currWGID = ((currWGID + 1) % hipGridDim_x);
+    tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
+    threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
+  }
+}
+
+// like the tree barrier but also has WGs exchange work locally before joining
+// the global barrier
+__global__ void kernelAtomicTreeBarrierUniqLocalExch(float * storage,
+                                                     hipLockData_t * gpuLockData,
+                                                     unsigned int * perCUBarrierBuffers,
+                                                     const int ITERATIONS,
+                                                     const int NUM_LDST,
+                                                     const int NUM_CU,
+                                                     const int MAX_WGS)
+{
+  // local variables
+  // thread 0 is master thread
+  const bool isMasterThread = ((hipThreadIdx_x == 0) && (hipThreadIdx_y == 0) &&
+                               (hipThreadIdx_z == 0));
+  // represents the number of WGs going to the barrier (max NUM_CU, hipGridDim_x if
+  // fewer WGs than CUs).
+  const unsigned int numWGsAtBarr = ((hipGridDim_x < NUM_CU) ? hipGridDim_x :
+                                        NUM_CU);
+  const int cuID = (hipBlockIdx_x % numWGsAtBarr); // mod by # CUs to get CU ID
+  // all work groups on the same CU access unique locations because the
+  // barrier can't ensure DRF between WGs
+  int currWGID = hipBlockIdx_x;
+  int tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
+  // want striding to happen across cache lines so that each thread in a
+  // half-warp accesses sequential words
+  int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
+  int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
+  // determine if I'm WG 0 on my CU
+  const int perCU_WGID = (hipBlockIdx_x / numWGsAtBarr);
+  // given the hipGridDim_x, we can figure out how many WGs are on our CU -- assume
+  // all CUs have an identical number of WGs
+  int numWGs_perCU = (int)ceil((float)hipGridDim_x / numWGsAtBarr);
+
+  for (int i = 0; i < ITERATIONS; ++i)
+  {
+    accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
+
+    // all WGs on this CU do a local barrier (if > 1 WG)
+    if (numWGs_perCU > 1) {
+      hipBarrierAtomicLocal(perCUBarrierBuffers, cuID, numWGs_perCU,
+                             isMasterThread, MAX_WGS);
+
+      // exchange data within the WGs on this CU, then do some more computations
+      currWGID = ((currWGID + numWGsAtBarr) % hipGridDim_x);
+      tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
+      threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
+
+      accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
+    }
+
+    joinBarrier_helper(gpuLockData->barrierBuffers, perCUBarrierBuffers,
+                       numWGsAtBarr, cuID, perCU_WGID, numWGs_perCU,
+                       isMasterThread, MAX_WGS);
+
+    // get new thread ID by trading amongst WGs -- + 1 WG ID to shift to the
+    // next CU's data
+    currWGID = ((currWGID + 1) % hipGridDim_x);
+    tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
+    threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
+  }
+}
+
+// performs a tree barrier like above but with a lock-free barrier
+__global__ void kernelFBSTreeBarrierUniq(float * storage,
+                                         hipLockData_t * gpuLockData,
+                                         unsigned int * perCUBarrierBuffers,
+                                         const int ITERATIONS,
+                                         const int NUM_LDST,
+                                         const int NUM_CU,
+                                         const int MAX_WGS)
+{
+  // local variables
+  // represents the number of WGs going to the barrier (max NUM_CU, hipGridDim_x if
+  // fewer WGs than CUs).
+  const unsigned int numWGsAtBarr = ((hipGridDim_x < NUM_CU) ? hipGridDim_x :
+                                        NUM_CU);
+  const int cuID = (hipBlockIdx_x % numWGsAtBarr); // mod by # CUs to get CU ID
+  // all work groups on the same CU access unique locations because the
+  // barrier can't ensure DRF between WGs
+  int currWGID = hipBlockIdx_x;
+  int tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
+  // want striding to happen across cache lines so that each thread in a
+  // half-warp accesses sequential words
+  int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
+  const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
+  // determine if I'm WG 0 on my CU
+  const int perCU_WGID = (hipBlockIdx_x / numWGsAtBarr);
+  // given the hipGridDim_x, we can figure out how many WGs are on our CU -- assume
+  // all CUs have an identical number of WGs
+  int numWGs_perCU = (int)ceil((float)hipGridDim_x/numWGsAtBarr);
+
+  for (int i = 0; i < ITERATIONS; ++i)
+  {
+    accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
+
+    joinLFBarrier_helper(gpuLockData->barrierBuffers, perCUBarrierBuffers,
+                         numWGsAtBarr, cuID, perCU_WGID, numWGs_perCU,
+                         gpuLockData->arrayStride, MAX_WGS);
+
+    // get new thread ID by trading amongst WGs -- + 1 WG ID to shift to the
+    // next CU's data
+    currWGID = ((currWGID + 1) % hipGridDim_x);
+    tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
+    threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
+  }
+}
+
+// performs a tree barrier like above but with a lock-free barrier and has WGs
+// exchange work locally before joining the global barrier
+__global__ void kernelFBSTreeBarrierUniqLocalExch(float * storage,
+                                                  hipLockData_t * gpuLockData,
+                                                  unsigned int * perCUBarrierBuffers,
+                                                  const int ITERATIONS,
+                                                  const int NUM_LDST,
+                                                  const int NUM_CU,
+                                                  const int MAX_WGS)
+{
+  // local variables
+  // represents the number of WGs going to the barrier (max NUM_CU, hipGridDim_x if
+  // fewer WGs than CUs).
+  const unsigned int numWGsAtBarr = ((hipGridDim_x < NUM_CU) ? hipGridDim_x : NUM_CU);
+  const int cuID = (hipBlockIdx_x % numWGsAtBarr); // mod by # CUs to get CU ID
+  // all work groups on the same CU access unique locations because the
+  // barrier can't ensure DRF between WGs
+  int currWGID = hipBlockIdx_x;
+  int tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
+  // want striding to happen across cache lines so that each thread in a
+  // half-warp accesses sequential words
+  int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
+  const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
+  // determine if I'm WG 0 on my CU
+  const int perCU_WGID = (hipBlockIdx_x / numWGsAtBarr);
+  // given the hipGridDim_x, we can figure out how many WGs are on our CU -- assume
+  // all CUs have an identical number of WGs
+  int numWGs_perCU = (int)ceil((float)hipGridDim_x/numWGsAtBarr);
+
+  for (int i = 0; i < ITERATIONS; ++i)
+  {
+    accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
+
+    // all WGs on this CU do a local barrier (if > 1 WG per CU)
+    if (numWGs_perCU > 1) {
+      hipBarrierLocal(gpuLockData->barrierBuffers, numWGsAtBarr,
+                       gpuLockData->arrayStride, perCUBarrierBuffers, cuID, numWGs_perCU,
+                       perCU_WGID, false, MAX_WGS);
+
+      // exchange data within the WGs on this CU and do some more computations
+      currWGID = ((currWGID + numWGsAtBarr) % hipGridDim_x);
+      tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
+      threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
+
+      accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
+    }
+
+    joinLFBarrier_helper(gpuLockData->barrierBuffers, perCUBarrierBuffers,
+                         numWGsAtBarr, cuID, perCU_WGID, numWGs_perCU,
+                         gpuLockData->arrayStride, MAX_WGS);
+
+    // get new thread ID by trading amongst WGs -- + 1 WG ID to shift to the
+    // next CU's data
+    currWGID = ((currWGID + 1) % hipGridDim_x);
+    tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
+    threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
+  }
+}
+
+__global__ void kernelSleepingMutex(hipMutex_t mutex, float * storage,
+                                    hipLockData_t * gpuLockData,
+                                    const int ITERATIONS, const int NUM_LDST,
+                                    const int NUM_CU)
+{
+  // local variables
+  // all work groups access the same locations (rely on release to get
+  // ownership in time)
+  const int tid = hipThreadIdx_x;
+  // want striding to happen across cache lines so that each thread in a
+  // half-warp accesses sequential words
+  const int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
+  const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
+  __shared__ int myRingBufferLoc; // tracks my WG's location in the ring buffer
+
+  if (hipThreadIdx_x == 0) {
+    myRingBufferLoc = -1; // initially I have no location
+  }
+  __syncthreads();
+
+  for (int i = 0; i < ITERATIONS; ++i)
+  {
+    myRingBufferLoc = hipMutexSleepLock(mutex, gpuLockData->mutexBuffers,
+                                        gpuLockData->mutexBufferTails, gpuLockData->maxBufferSize,
+                                        gpuLockData->arrayStride, NUM_CU);
+
+    accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
+
+    hipMutexSleepUnlock(mutex, gpuLockData->mutexBuffers, myRingBufferLoc,
+                         gpuLockData->maxBufferSize, gpuLockData->arrayStride, NUM_CU);
+  }
+}
+
+__global__ void kernelSleepingMutexUniq(hipMutex_t mutex, float * storage,
+                                        hipLockData_t * gpuLockData,
+                                        const int ITERATIONS,
+                                        const int NUM_LDST, const int NUM_CU)
+{
+  // local variables
+  const int cuID = (hipBlockIdx_x % NUM_CU); // mod by # CUs to get CU ID
+  // all work groups on the same CU access the same locations
+  const int tid = ((cuID * hipBlockDim_x) + hipThreadIdx_x);
+  // want striding to happen across cache lines so that each thread in a
+  // half-warp accesses sequential words
+  const int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
+  const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
+  __shared__ int myRingBufferLoc; // tracks my WG's location in the ring buffer
+
+  if (hipThreadIdx_x == 0) {
+    myRingBufferLoc = -1; // initially I have no location
+  }
+  __syncthreads();
+
+  for (int i = 0; i < ITERATIONS; ++i)
+  {
+    myRingBufferLoc = hipMutexSleepLockLocal(mutex, cuID, gpuLockData->mutexBuffers,
+                                              gpuLockData->mutexBufferTails, gpuLockData->maxBufferSize,
+                                              gpuLockData->arrayStride, NUM_CU);
+
+    accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
+
+    hipMutexSleepUnlockLocal(mutex, cuID, gpuLockData->mutexBuffers, myRingBufferLoc,
+                              gpuLockData->maxBufferSize, gpuLockData->arrayStride,
+                              NUM_CU);
+  }
+}
+
+__global__ void kernelFetchAndAddMutex(hipMutex_t mutex, float * storage,
+                                       hipLockData_t * gpuLockData,
+                                       const int ITERATIONS,
+                                       const int NUM_LDST, const int NUM_CU)
+{
+  // local variables
+  // all work groups access the same locations (rely on release to get
+  // ownership in time)
+  const int tid = hipThreadIdx_x;
+  // want striding to happen across cache lines so that each thread in a
+  // half-warp accesses sequential words
+  const int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
+  const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
+
+  for (int i = 0; i < ITERATIONS; ++i)
+  {
+    hipMutexFALock(mutex, gpuLockData->mutexBufferHeads, gpuLockData->mutexBufferTails,
+                    NUM_CU);
+
+    accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
+
+    hipMutexFAUnlock(mutex, gpuLockData->mutexBufferTails, NUM_CU);
+  }
+}
+
+__global__ void kernelFetchAndAddMutexUniq(hipMutex_t mutex, float * storage,
+                                           hipLockData_t * gpuLockData,
+                                           const int ITERATIONS,
+                                           const int NUM_LDST,
+                                           const int NUM_CU)
+{
+  // local variables
+  const int cuID = (hipBlockIdx_x % NUM_CU); // mod by # CUs to get CU ID
+  // all work groups on the same CU access the same locations
+  const int tid = ((cuID * hipBlockDim_x) + hipThreadIdx_x);
+  // want striding to happen across cache lines so that each thread in a
+  // half-warp accesses sequential words
+  const int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
+  const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
+
+  for (int i = 0; i < ITERATIONS; ++i)
+  {
+    hipMutexFALockLocal(mutex, cuID, gpuLockData->mutexBufferHeads,
+                         gpuLockData->mutexBufferTails, NUM_CU);
+
+    accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
+
+    hipMutexFAUnlockLocal(mutex, cuID, gpuLockData->mutexBufferTails, NUM_CU);
+  }
+}
+
+__global__ void kernelSpinLockMutex(hipMutex_t mutex, float * storage,
+                                    hipLockData_t * gpuLockData,
+                                    const int ITERATIONS, const int NUM_LDST,
+                                    const int NUM_CU)
+{
+  // local variables
+  // all work groups access the same locations (rely on release to get
+  // ownership in time)
+  const int tid = hipThreadIdx_x;
+  // want striding to happen across cache lines so that each thread in a
+  // half-warp accesses sequential words
+  const int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
+  const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
+
+  for (int i = 0; i < ITERATIONS; ++i)
+  {
+    hipMutexSpinLock(mutex, gpuLockData->mutexBufferHeads, NUM_CU);
+
+    accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
+
+    hipMutexSpinUnlock(mutex, gpuLockData->mutexBufferHeads, NUM_CU);
+  }
+}
+
+__global__ void kernelSpinLockMutexUniq(hipMutex_t mutex, float * storage,
+                                        hipLockData_t * gpuLockData,
+                                        const int ITERATIONS,
+                                        const int NUM_LDST, const int NUM_CU)
+{
+  // local variables
+  const int cuID = (hipBlockIdx_x % NUM_CU); // mod by # CUs to get CU ID
+  // all work groups on the same CU access the same locations
+  const int tid = ((cuID * hipBlockDim_x) + hipThreadIdx_x);
+  // want striding to happen across cache lines so that each thread in a
+  // half-warp accesses sequential words
+  const int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
+  const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
+
+  for (int i = 0; i < ITERATIONS; ++i)
+  {
+    hipMutexSpinLockLocal(mutex, cuID, gpuLockData->mutexBufferHeads, NUM_CU);
+
+    accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
+
+    hipMutexSpinUnlockLocal(mutex, cuID, gpuLockData->mutexBufferHeads, NUM_CU);
+  }
+}
+
+__global__ void kernelEBOMutex(hipMutex_t mutex, float * storage,
+                               hipLockData_t * gpuLockData,
+                               const int ITERATIONS, const int NUM_LDST,
+                               const int NUM_CU)
+{
+  // local variables
+  // all work groups access the same locations (rely on release to get
+  // ownership in time)
+  const int tid = hipThreadIdx_x;
+  // want striding to happen across cache lines so that each thread in a
+  // half-warp accesses sequential words
+  const int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
+  const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
+
+  for (int i = 0; i < ITERATIONS; ++i)
+  {
+    hipMutexEBOLock(mutex, gpuLockData->mutexBufferHeads, NUM_CU);
+
+    accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
+
+    hipMutexEBOUnlock(mutex, gpuLockData->mutexBufferHeads, NUM_CU);
+  }
+}
+
+__global__ void kernelEBOMutexUniq(hipMutex_t mutex, float * storage,
+                                   hipLockData_t * gpuLockData,
+                                   const int ITERATIONS, const int NUM_LDST,
+                                   const int NUM_CU)
+{
+  // local variables
+  const int cuID = (hipBlockIdx_x % NUM_CU); // mod by # CUs to get CU ID
+  // all work groups on the same CU access the same locations
+  const int tid = ((cuID * hipBlockDim_x) + hipThreadIdx_x);
+  // want striding to happen across cache lines so that each thread in a
+  // half-warp accesses sequential words
+  const int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
+  const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
+
+  for (int i = 0; i < ITERATIONS; ++i)
+  {
+    hipMutexEBOLockLocal(mutex, cuID, gpuLockData->mutexBufferHeads, NUM_CU);
+
+    accessData(storage, threadBaseLoc, threadOffset, NUM_LDST);
+
+    hipMutexEBOUnlockLocal(mutex, cuID, gpuLockData->mutexBufferHeads, NUM_CU);
+  }
+}
+
+// All WGs on all CUs access the same data with 1 writer (and N-1 readers)
+// per CU.
+__global__ void kernelSpinLockSemaphore(hipSemaphore_t sem,
+                                        float * storage,
+                                        hipLockData_t * gpuLockData,
+                                        const unsigned int numStorageLocs,
+                                        const int ITERATIONS,
+                                        const int NUM_LDST,
+                                        const int NUM_CU)
+{
+  // local variables
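+  // maxSemCount is read from the first word of this semaphore's 4*NUM_CU-word
+  // region of semaphoreBuffers (each semaphore gets 4 words per CU)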
+  const unsigned int maxSemCount =
+    gpuLockData->semaphoreBuffers[((sem * 4 * NUM_CU) + (0 * 4))];
+  const int cuID = (hipBlockIdx_x % NUM_CU); // mod by # CUs to get CU ID
+  // If there are fewer WGs than # CUs, need to take that into account for the
+  // various math below.  If WGs >= NUM_CU, use NUM_CU.
+  const unsigned int numCU = ((hipGridDim_x < NUM_CU) ? hipGridDim_x : NUM_CU);
+  // given the hipGridDim_x, we can figure out how many WGs are on our CU -- assume
+  // all CUs have an identical number of WGs
+  int numWGs_perCU = (int)ceil((float)hipGridDim_x / numCU);
+  // number of threads on each CU
+  //const int numThrs_perCU = (hipBlockDim_x * numWGs_perCU);
+  const int perCU_WGID = (hipBlockIdx_x / numCU);
+  // rotate which WG is the writer
+  const bool isWriter = (perCU_WGID == (cuID % numWGs_perCU));
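+  // e.g., with 4 WGs per CU, the writer on CU 0 is its WG 0, on CU 1 its
+  // WG 1, ..., and on CU 4 it wraps back around to WG 0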
+
+  // all work groups on the same CU access unique locations except the writer,
+  // which writes all of the locations that all of the WGs access
+  //int currWGID = hipBlockIdx_x;
+  // the (reader) WGs on each CU access unique locations but those same
+  // locations are accessed by the reader WGs on all CUs
+  int tid = ((perCU_WGID * hipBlockDim_x) + hipThreadIdx_x);
+  // want striding to happen across cache lines so that each thread in a
+  // half-warp accesses sequential words
+  int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
+  const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
+
+  // dummy array to hold the loads done in the readers
+  __shared__ volatile float dummyArray[NUM_WIS_PER_WG];
+
+  for (int i = 0; i < ITERATIONS; ++i)
+  {
+    /*
+      NOTE: There is a race here for entering the critical section.  Most
+      importantly, it means that at least one of the readers could win and
+      thus the readers will read before the writer has had a chance to write
+      the data.
+    */
+    hipSemaphoreSpinWait(sem, isWriter, maxSemCount,
+                          gpuLockData->semaphoreBuffers, NUM_CU);
+
+    // writer writes all the data that the WGs on this CU access
+    if (isWriter) {
+      for (int j = 0; j < numWGs_perCU; ++j) {
+        /*
+          Update the writer's "location" so it writes to the locations that the
+          readers will access (due to RR scheduling the next WG on this CU is
+          numCU WGs away).  Use loop counter because the non-unique version
+          writes the same locations on all CUs.
+        */
+        tid = ((j * hipBlockDim_x) + hipThreadIdx_x);
+        threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
+
+        accessData_semWr(storage, threadBaseLoc, threadOffset, NUM_LDST);
+      }
+      // reset locations
+      tid = ((perCU_WGID * hipBlockDim_x) + hipThreadIdx_x);
+      threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
+    }
+    // rest of WGs on this CU read the data written by each CU's writer WG
+    else {
+      accessData_semRd(storage, dummyArray, threadBaseLoc, threadOffset,
+                       NUM_LDST);
+    }
+    hipSemaphoreSpinPost(sem, isWriter, maxSemCount,
+                          gpuLockData->semaphoreBuffers, NUM_CU);
+  }
+}
+
+__global__ void kernelSpinLockSemaphoreUniq(hipSemaphore_t sem,
+                                            float * storage,
+                                            hipLockData_t * gpuLockData,
+                                            const int ITERATIONS,
+                                            const int NUM_LDST,
+                                            const int NUM_CU)
+{
+  // local variables
+  const unsigned int maxSemCount =
+      gpuLockData->semaphoreBuffers[((sem * 4 * NUM_CU) + (0 * 4))];
+  // If there are fewer WGs than # CUs, need to take that into account for the
+  // various math below.  If WGs >= NUM_CU, use NUM_CU.
+  const unsigned int numCU = ((hipGridDim_x < NUM_CU) ? hipGridDim_x : NUM_CU);
+  const int cuID = (hipBlockIdx_x % numCU); // mod by # CUs to get CU ID
+  // given the hipGridDim_x, we can figure out how many WGs are on our CU -- assume
+  // all CUs have an identical number of WGs
+  int numWGs_perCU = (int)ceil((float)hipGridDim_x / numCU);
+  const int perCU_WGID = (hipBlockIdx_x / numCU);
+  // rotate which WG is the writer
+  const bool isWriter = (perCU_WGID == (cuID % numWGs_perCU));
+
+  // all work groups on the same CU access unique locations except the writer,
+  // which writes all of the locations that all of the WGs access
+  int currWGID = hipBlockIdx_x;
+  int tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
+  // want striding to happen across cache lines so that each thread in a
+  // half-warp accesses sequential words
+  int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
+  const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
+  // dummy array to hold the loads done in the readers
+  __shared__ volatile float dummyArray[NUM_WIS_PER_WG];
+
+  for (int i = 0; i < ITERATIONS; ++i)
+  {
+    /*
+      NOTE: There is a race here for entering the critical section.  Most
+      importantly, it means that at least one of the readers could win and
+      thus the readers will read before the writer has had a chance to write
+      the data.
+    */
+    hipSemaphoreSpinWaitLocal(sem, cuID, isWriter, maxSemCount,
+                               gpuLockData->semaphoreBuffers, NUM_CU);
+
+    // writer WG writes all the data that the WGs on this CU access
+    if (isWriter) {
+      for (int j = 0; j < numWGs_perCU; ++j) {
+        accessData_semWr(storage, threadBaseLoc, threadOffset, NUM_LDST);
+
+        /*
+          update the writer's "location" so it writes to the locations that the
+          readers will access (due to RR scheduling the next WG on this CU is
+          numCU WGs away and < hipGridDim_x).
+
+          NOTE: The first location the writer writes to is its own
+          location(s).  If the writer is not WG 0 on its CU, it may require
+          wrapping around to WGs with smaller WG IDs.
+        */
+        currWGID = (currWGID + numCU) % hipGridDim_x;
+        tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
+        threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
+      }
+      // reset locations
+      currWGID = hipBlockIdx_x;
+      tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
+      threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
+    }
+    // rest of WGs on this CU read the data written by each CU's writer WG
+    else {
+      accessData_semRd(storage, dummyArray, threadBaseLoc, threadOffset,
+                       NUM_LDST);
+    }
+    hipSemaphoreSpinPostLocal(sem, cuID, isWriter, maxSemCount,
+                               gpuLockData->semaphoreBuffers, NUM_CU);
+  }
+}
+
+// All WGs on all CUs access the same data with 1 writer (and N-1 readers)
+// per CU.
+__global__ void kernelEBOSemaphore(hipSemaphore_t sem, float * storage,
+                                   hipLockData_t * gpuLockData,
+                                   const unsigned int numStorageLocs,
+                                   const int ITERATIONS, const int NUM_LDST,
+                                   const int NUM_CU)
+{
+  // local variables
+  const unsigned int maxSemCount =
+      gpuLockData->semaphoreBuffers[((sem * 4 * NUM_CU) + (0 * 4))];
+  // If there are fewer WGs than # CUs, need to take that into account for the
+  // various math below.  If WGs >= NUM_CU, use NUM_CU.
+  const unsigned int numCU = ((hipGridDim_x < NUM_CU) ? hipGridDim_x : NUM_CU);
+  const int cuID = (hipBlockIdx_x % NUM_CU); // mod by # CUs to get CU ID
+  // given the hipGridDim_x, we can figure out how many WGs are on our CU -- assume
+  // all CUs have an identical number of WGs
+  int numWGs_perCU = (int)ceil((float)hipGridDim_x / numCU);
+  // number of threads on each CU
+  //const int numThrs_perCU = (hipBlockDim_x * numWGs_perCU);
+  const int perCU_WGID = (hipBlockIdx_x / numCU);
+  // rotate which WG is the writer
+  const bool isWriter = (perCU_WGID == (cuID % numWGs_perCU));
+
+  // all work groups on the same CU access unique locations except the writer,
+  // which writes all of the locations that all of the WGs access
+  //int currWGID = hipBlockIdx_x;
+  // the (reader) WGs on each CU access unique locations but those same
+  // locations are accessed by the reader WGs on all CUs
+  int tid = ((perCU_WGID * hipBlockDim_x) + hipThreadIdx_x);
+  // want striding to happen across cache lines so that each thread in a
+  // half-warp accesses sequential words
+  int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
+  const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
+
+  // dummy array to hold the loads done in the readers
+  __shared__ volatile float dummyArray[NUM_WIS_PER_WG];
+
+  for (int i = 0; i < ITERATIONS; ++i)
+  {
+    /*
+      NOTE: There is a race here for entering the critical section.  Most
+      importantly, it means that at least one of the readers could win and
+      thus the readers will read before the writer has had a chance to write
+      the data.
+    */
+    hipSemaphoreEBOWait(sem, isWriter, maxSemCount,
+                        gpuLockData->semaphoreBuffers, NUM_CU);
+
+    // writer WG writes all the data that the WGs on this CU access
+    if (isWriter) {
+      for (int j = 0; j < numWGs_perCU; ++j) {
+        /*
+          Update the writer's "location" so it writes to the locations that the
+          readers will access (due to RR scheduling the next WG on this CU is
+          numCU WGs away).  Use loop counter because the non-unique version
+          writes the same locations on all CUs.
+        */
+        tid = ((j * hipBlockDim_x) + hipThreadIdx_x);
+        threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
+
+        accessData_semWr(storage, threadBaseLoc, threadOffset, NUM_LDST);
+      }
+      // reset locations
+      tid = ((perCU_WGID * hipBlockDim_x) + hipThreadIdx_x);
+      threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
+    }
+    // rest of WGs on this CU read the data written by each CU's writer WG
+    else {
+      accessData_semRd(storage, dummyArray, threadBaseLoc, threadOffset,
+                       NUM_LDST);
+    }
+    hipSemaphoreEBOPost(sem, isWriter, maxSemCount,
+                         gpuLockData->semaphoreBuffers, NUM_CU);
+  }
+}
+
+__global__ void kernelEBOSemaphoreUniq(hipSemaphore_t sem, float * storage,
+                                       hipLockData_t * gpuLockData,
+                                       const int ITERATIONS,
+                                       const int NUM_LDST,
+                                       const int NUM_CU)
+{
+  // local variables
+  const unsigned int maxSemCount =
+      gpuLockData->semaphoreBuffers[((sem * 4 * NUM_CU) + (0 * 4))];
+  // If there are fewer WGs than # CUs, need to take that into account for the
+  // various math below.  If WGs >= NUM_CU, use NUM_CU.
+  const unsigned int numCU = ((hipGridDim_x < NUM_CU) ? hipGridDim_x : NUM_CU);
+  const int cuID = (hipBlockIdx_x % numCU); // mod by # CUs to get CU ID
+  // given the hipGridDim_x, we can figure out how many WGs are on our CU -- assume
+  // all CUs have an identical number of WGs
+  int numWGs_perCU = (int)ceil((float)hipGridDim_x / numCU);
+  const int perCU_WGID = (hipBlockIdx_x / numCU);
+  // rotate which WG is the writer
+  const bool isWriter = (perCU_WGID == (cuID % numWGs_perCU));
+
+  // all work groups on the same CU access unique locations except the writer,
+  // which writes all of the locations that all of the WGs access
+  int currWGID = hipBlockIdx_x;
+  int tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
+  // want striding to happen across cache lines so that each thread in a
+  // half-warp accesses sequential words
+  int threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
+  const int threadOffset = (hipThreadIdx_x % NUM_WORDS_PER_CACHELINE);
+  // dummy array to hold the loads done in the readers
+  __shared__ volatile float dummyArray[NUM_WIS_PER_WG];
+
+  for (int i = 0; i < ITERATIONS; ++i)
+  {
+    /*
+      NOTE: There is a race here for entering the critical section.  Most
+      importantly, it means that at least one of the readers could win and
+      thus the readers will read before the writer has had a chance to write
+      the data.
+    */
+    hipSemaphoreEBOWaitLocal(sem, cuID, isWriter, maxSemCount,
+                              gpuLockData->semaphoreBuffers, NUM_CU);
+
+    // writer WG writes all the data that the WGs on this CU access
+    if (isWriter) {
+      for (int j = 0; j < numWGs_perCU; ++j) {
+        accessData_semWr(storage, threadBaseLoc, threadOffset, NUM_LDST);
+
+        /*
+          update the writer's "location" so it writes to the locations that the
+          readers will access (due to RR scheduling the next WG on this CU is
+          numCU WGs away and < hipGridDim_x).
+
+          NOTE: The first location the writer writes to is its own
+          location(s).  If the writer is not WG 0 on its CU, it may require
+          wrapping around to WGs with smaller WG IDs.
+        */
+        currWGID = (currWGID + numCU) % hipGridDim_x;
+        tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
+        threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
+      }
+      // reset locations
+      currWGID = hipBlockIdx_x;
+      tid = ((currWGID * hipBlockDim_x) + hipThreadIdx_x);
+      threadBaseLoc = ((tid/NUM_WORDS_PER_CACHELINE) * (NUM_LDST+1));
+    }
+    // rest of WGs on this CU read the data written by each CU's writer WG
+    else {
+      accessData_semRd(storage, dummyArray, threadBaseLoc, threadOffset,
+                       NUM_LDST);
+    }
+    hipSemaphoreEBOPostLocal(sem, cuID, isWriter, maxSemCount,
+                              gpuLockData->semaphoreBuffers, NUM_CU);
+  }
+}
+
+void invokeAtomicTreeBarrier(float * storage_d, unsigned int * perCUBarriers_d,
+                             int numIters)
+{
+  // local variable
+  const int WGs = numWGs;
+
+  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
+  {
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelAtomicTreeBarrierUniq), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
+        storage_d, cpuLockData, perCUBarriers_d, numIters, NUM_LDST, NUM_CU,
+        MAX_WGS);
+
+    // Blocks until the device has completed all preceding requested
+    // tasks (make sure that the device returned before continuing).
+    hipError_t hipErr = hipDeviceSynchronize();
+    checkError(hipErr, "hipDeviceSynchronize (kernelAtomicTreeBarrierUniq)");
+  }
+}
+
+void invokeAtomicTreeBarrierLocalExch(float * storage_d,
+                                      unsigned int * perCUBarriers_d,
+                                      int numIters)
+{
+  // local variable
+  const int WGs = numWGs;
+
+  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
+  {
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelAtomicTreeBarrierUniqLocalExch), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
+        storage_d, cpuLockData, perCUBarriers_d, numIters, NUM_LDST, NUM_CU,
+        MAX_WGS);
+
+    // Blocks until the device has completed all preceding requested
+    // tasks (make sure that the device returned before continuing).
+    hipError_t hipErr = hipDeviceSynchronize();
+    checkError(hipErr,
+               "hipDeviceSynchronize (kernelAtomicTreeBarrierUniqLockExch)");
+  }
+}
+
+void invokeFBSTreeBarrier(float * storage_d, unsigned int * perCUBarriers_d,
+                          int numIters)
+{
+  // local variable
+  const int WGs = numWGs;
+
+  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
+  {
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelFBSTreeBarrierUniq), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
+        storage_d, cpuLockData, perCUBarriers_d, numIters, NUM_LDST, NUM_CU,
+        MAX_WGS);
+
+    // Blocks until the device has completed all preceding requested
+    // tasks (make sure that the device returned before continuing).
+    hipError_t hipErr = hipDeviceSynchronize();
+    checkError(hipErr, "hipDeviceSynchronize (kernelFBSTreeBarrierUniq)");
+  }
+}
+
+void invokeFBSTreeBarrierLocalExch(float * storage_d,
+                                   unsigned int * perCUBarriers_d,
+                                   int numIters)
+{
+  // local variable
+  const int WGs = numWGs;
+
+  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
+  {
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelFBSTreeBarrierUniqLocalExch), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
+        storage_d, cpuLockData, perCUBarriers_d, numIters, NUM_LDST, NUM_CU,
+        MAX_WGS);
+
+    // Blocks until the device has completed all preceding requested
+    // tasks (make sure that the device returned before continuing).
+    hipError_t hipErr = hipDeviceSynchronize();
+    checkError(hipErr, "hipDeviceSynchronize (kernelFBSTreeBarrierUniqLocalExch)");
+  }
+}
+
+void invokeSpinLockMutex(hipMutex_t mutex, float * storage_d, int numIters)
+{
+  // local variable
+  const int WGs = numWGs;
+
+  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
+  {
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelSpinLockMutex), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
+        mutex, storage_d, cpuLockData, numIters, NUM_LDST, NUM_CU);
+
+    // Blocks until the device has completed all preceding requested
+    // tasks (make sure that the device returned before continuing).
+    hipError_t hipErr = hipDeviceSynchronize();
+    checkError(hipErr, "hipDeviceSynchronize (kernelSpinLockMutex)");
+  }
+}
+
+void invokeSpinLockMutex_uniq(hipMutex_t mutex, float * storage_d,
+                              int numIters)
+{
+  // local variable
+  const int WGs = numWGs;
+
+  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
+  {
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelSpinLockMutexUniq), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
+        mutex, storage_d, cpuLockData, numIters, NUM_LDST, NUM_CU);
+
+    // Blocks until the device has completed all preceding requested
+    // tasks (make sure that the device returned before continuing).
+    hipError_t hipErr = hipDeviceSynchronize();
+    checkError(hipErr, "hipDeviceSynchronize (kernelSpinLockMutexUniq)");
+  }
+}
+
+void invokeEBOMutex(hipMutex_t mutex, float * storage_d, int numIters)
+{
+  // local variable
+  const int WGs = numWGs;
+
+  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
+  {
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelEBOMutex), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
+        mutex, storage_d, cpuLockData, numIters, NUM_LDST, NUM_CU);
+
+    // Blocks until the device has completed all preceding requested
+    // tasks (make sure that the device returned before continuing).
+    hipError_t hipErr = hipDeviceSynchronize();
+    checkError(hipErr, "hipDeviceSynchronize (kernelEBOMutex)");
+  }
+}
+
+void invokeEBOMutex_uniq(hipMutex_t mutex, float * storage_d, int numIters)
+{
+  // local variable
+  const int WGs = numWGs;
+
+  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
+  {
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelEBOMutexUniq), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
+        mutex, storage_d, cpuLockData, numIters, NUM_LDST, NUM_CU);
+
+    // Blocks until the device has completed all preceding requested
+    // tasks (make sure that the device returned before continuing).
+    hipError_t hipErr = hipDeviceSynchronize();
+    checkError(hipErr, "hipDeviceSynchronize (kernelEBOMutexUniq)");
+  }
+}
+
+void invokeSleepingMutex(hipMutex_t mutex, float * storage_d, int numIters)
+{
+  // local variable
+  const int WGs = numWGs;
+
+  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
+  {
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelSleepingMutex), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
+                       mutex, storage_d, cpuLockData, numIters, NUM_LDST, NUM_CU);
+
+    // Blocks until the device has completed all preceding requested
+    // tasks (make sure that the device returned before continuing).
+    hipError_t hipErr = hipDeviceSynchronize();
+    checkError(hipErr, "hipDeviceSynchronize (kernelSleepingMutex)");
+  }
+}
+
+void invokeSleepingMutex_uniq(hipMutex_t mutex, float * storage_d,
+                              int numIters)
+{
+  // local variable
+  const int WGs = numWGs;
+
+  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
+  {
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelSleepingMutexUniq), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
+        mutex, storage_d, cpuLockData, numIters, NUM_LDST, NUM_CU);
+
+    // Blocks until the device has completed all preceding requested
+    // tasks (make sure that the device returned before continuing).
+    hipError_t hipErr = hipDeviceSynchronize();
+    checkError(hipErr, "hipDeviceSynchronize (kernelSleepingMutexUniq)");
+  }
+}
+
+void invokeFetchAndAddMutex(hipMutex_t mutex, float * storage_d, int numIters)
+{
+  // local variable
+  const int WGs = numWGs;
+
+  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
+  {
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelFetchAndAddMutex), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
+        mutex, storage_d, cpuLockData, numIters, NUM_LDST, NUM_CU);
+
+    // Blocks until the device has completed all preceding requested
+    // tasks (make sure that the device returned before continuing).
+    hipError_t hipErr = hipDeviceSynchronize();
+    checkError(hipErr, "hipDeviceSynchronize (kernelFetchAndAddMutex)");
+  }
+}
+
+void invokeFetchAndAddMutex_uniq(hipMutex_t mutex, float * storage_d, int numIters)
+{
+  // local variable
+  const int WGs = numWGs;
+
+  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
+  {
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelFetchAndAddMutexUniq), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
+        mutex, storage_d, cpuLockData, numIters, NUM_LDST, NUM_CU);
+
+    // Blocks until the device has completed all preceding requested
+    // tasks (make sure that the device returned before continuing).
+    hipError_t hipErr = hipDeviceSynchronize();
+    checkError(hipErr, "hipDeviceSynchronize (kernelFetchAndAddMutexUniq)");
+  }
+}
+
+void invokeSpinLockSemaphore(hipSemaphore_t sem, float * storage_d,
+                             const int maxVal,
+                             int numIters, int numStorageLocs)
+{
+  // local variable
+  const int WGs = numWGs;
+
+  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
+  {
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelSpinLockSemaphore), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
+        sem, storage_d, cpuLockData, numStorageLocs, numIters, NUM_LDST,
+        NUM_CU);
+
+    // Blocks until the device has completed all preceding requested
+    // tasks (make sure that the device returned before continuing).
+    hipError_t hipErr = hipDeviceSynchronize();
+    checkError(hipErr, "hipDeviceSynchronize (kernelSpinLockSemaphore)");
+  }
+}
+
+void invokeSpinLockSemaphore_uniq(hipSemaphore_t sem, float * storage_d,
+                                  const int maxVal, int numIters)
+{
+  // local variable
+  const int WGs = numWGs;
+
+  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
+  {
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelSpinLockSemaphoreUniq), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
+        sem, storage_d, cpuLockData, numIters, NUM_LDST, NUM_CU);
+
+    // Blocks until the device has completed all preceding requested
+    // tasks (make sure that the device returned before continuing).
+    hipError_t hipErr = hipDeviceSynchronize();
+    checkError(hipErr, "hipDeviceSynchronize (kernelSpinLockSemaphoreUniq)");
+  }
+}
+
+void invokeEBOSemaphore(hipSemaphore_t sem, float * storage_d, const int maxVal,
+                        int numIters, int numStorageLocs)
+{
+  // local variable
+  const int WGs = numWGs;
+
+  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
+  {
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelEBOSemaphore), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
+        sem, storage_d, cpuLockData, numStorageLocs, numIters, NUM_LDST,
+        NUM_CU);
+
+    // Blocks until the device has completed all preceding requested
+    // tasks (make sure that the device returned before continuing).
+    hipError_t hipErr = hipDeviceSynchronize();
+    checkError(hipErr, "hipDeviceSynchronize (kernelEBOSemaphore)");
+  }
+}
+
+void invokeEBOSemaphore_uniq(hipSemaphore_t sem, float * storage_d,
+                             const int maxVal, int numIters)
+{
+  // local variable
+  const int WGs = numWGs;
+
+  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
+  {
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(kernelEBOSemaphoreUniq), dim3(WGs), dim3(NUM_WIS_PER_WG), 0, 0,
+        sem, storage_d, cpuLockData, numIters, NUM_LDST, NUM_CU);
+
+    // Blocks until the device has completed all preceding requested
+    // tasks (make sure that the device returned before continuing).
+    hipError_t hipErr = hipDeviceSynchronize();
+    checkError(hipErr, "hipDeviceSynchronize (kernelEBOSemaphoreUniq)");
+  }
+}
+
+int main(int argc, char ** argv)
+{
+  if (argc != 5) {
+    fprintf(stderr, "./allSyncPrims-1kernel <syncPrim> <numLdSt> <numWGs> "
+            "<numCSIters>\n");
+    fprintf(stderr, "where:\n");
+    fprintf(stderr, "\t<syncPrim>: a string that represents which synchronization primitive to run.\n"
+            "\t\tatomicTreeBarrUniq - Atomic Tree Barrier\n"
+            "\t\tatomicTreeBarrUniqLocalExch - Atomic Tree Barrier with local exchange\n"
+            "\t\tlfTreeBarrUniq - Lock-Free Tree Barrier\n"
+            "\t\tlfTreeBarrUniqLocalExch - Lock-Free Tree Barrier with local exchange\n"
+            "\t\tspinMutex - Spin Lock Mutex\n"
+            "\t\tspinMutexEBO - Spin Lock Mutex with Backoff\n"
+            "\t\tsleepMutex - Sleep Mutex\n"
+            "\t\tfaMutex - Fetch-and-Add Mutex\n"
+            "\t\tspinSem1 - Spin Semaphore (Max: 1)\n"
+            "\t\tspinSem2 - Spin Semaphore (Max: 2)\n"
+            "\t\tspinSem10 - Spin Semaphore (Max: 10)\n"
+            "\t\tspinSem120 - Spin Semaphore (Max: 120)\n"
+            "\t\tspinSemEBO1 - Spin Semaphore with Backoff (Max: 1)\n"
+            "\t\tspinSemEBO2 - Spin Semaphore with Backoff (Max: 2)\n"
+            "\t\tspinSemEBO10 - Spin Semaphore with Backoff (Max: 10)\n"
+            "\t\tspinSemEBO120 - Spin Semaphore with Backoff (Max: 120)\n"
+            "\t\tspinMutexUniq - Spin Lock Mutex -- accesses to unique locations per WG\n"
+            "\t\tspinMutexEBOUniq - Spin Lock Mutex with Backoff -- accesses to unique locations per WG\n"
+            "\t\tsleepMutexUniq - Sleep Mutex -- accesses to unique locations per WG\n"
+            "\t\tfaMutexUniq - Fetch-and-Add Mutex -- accesses to unique locations per WG\n"
+            "\t\tspinSemUniq1 - Spin Semaphore (Max: 1) -- accesses to unique locations per WG\n"
+            "\t\tspinSemUniq2 - Spin Semaphore (Max: 2) -- accesses to unique locations per WG\n"
+            "\t\tspinSemUniq10 - Spin Semaphore (Max: 10) -- accesses to unique locations per WG\n"
+            "\t\tspinSemUniq120 - Spin Semaphore (Max: 120) -- accesses to unique locations per WG\n"
+            "\t\tspinSemEBOUniq1 - Spin Semaphore with Backoff (Max: 1) -- accesses to unique locations per WG\n"
+            "\t\tspinSemEBOUniq2 - Spin Semaphore with Backoff (Max: 2) -- accesses to unique locations per WG\n"
+            "\t\tspinSemEBOUniq10 - Spin Semaphore with Backoff (Max: 10) -- accesses to unique locations per WG\n"
+            "\t\tspinSemEBOUniq120 - Spin Semaphore with Backoff (Max: 120) -- accesses to unique locations per WG\n");
+    fprintf(stderr, "\t<numLdSt>: the # of LDs and STs to do for each thread "
+            "in the critical section.\n");
+    fprintf(stderr, "\t<numWGs>: the # of WGs to execute (want to be "
+            "divisible by the number of CUs).\n");
+    fprintf(stderr, "\t<numCSIters>: number of iterations of the critical "
+            "section.\n");
+    exit(-1);
+  }
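+  // example invocation (illustrative values):
+  //   ./allSyncPrims-1kernel sleepMutex 10 16 10
+  // runs the sleep mutex with 10 LDs/STs per thread in the critical section,
+  // 16 WGs, and 10 critical-section iterations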
+
+  // boilerplate code to identify the compute capability, # CUs, etc.
+  int deviceCount;
+  hipGetDeviceCount(&deviceCount);
+  if (deviceCount == 0) {
+    fprintf(stderr, "There is no device supporting HIP\n");
+    exit(-1);
+  }
+
+  hipDeviceProp_t deviceProp;
+  hipGetDeviceProperties(&deviceProp, 0);
+  fprintf(stdout, "GPU Compute Capability: %d.%d\n", deviceProp.major,
+          deviceProp.minor);
+  if ((deviceProp.major == 9999) && (deviceProp.minor == 9999)) {
+    fprintf(stderr, "There is no HIP capable device\n");
+    exit(-1);
+  }
+
+  NUM_CU = deviceProp.multiProcessorCount;
+  const int maxWGPerCU = deviceProp.maxThreadsPerBlock/NUM_WIS_PER_WG;
+  //assert(maxWGPerCU * NUM_WIS_PER_WG <=
+  //       deviceProp.maxThreadsPerMultiProcessor);
+  MAX_WGS = maxWGPerCU * NUM_CU;
+
+  fprintf(stdout, "# CU: %d, Max Thrs/WG: %d, Max WG/CU: %d, Max # WG: %d\n",
+          NUM_CU, deviceProp.maxThreadsPerBlock, maxWGPerCU, MAX_WGS);
+
+  hipError_t hipErr = hipGetLastError();
+  checkError(hipErr, "Begin");
+
+  // parse input args
+  const char * syncPrim_str = argv[1];
+  NUM_LDST = atoi(argv[2]);
+  numWGs = atoi(argv[3]);
+  assert(numWGs <= MAX_WGS);
+  const int NUM_ITERS = atoi(argv[4]);
+  const int numWGs_perCU = (int)ceil((float)numWGs / NUM_CU);
+  assert(numWGs_perCU > 0);
+
+  unsigned int syncPrim = 9999;
+  // set the syncPrim variable to the appropriate value based on the inputted
+  // string for the microbenchmark
+  if (strcmp(syncPrim_str, "atomicTreeBarrUniq") == 0) { syncPrim = 0; }
+  else if (strcmp(syncPrim_str, "atomicTreeBarrUniqLocalExch") == 0) {
+    syncPrim = 1;
+  }
+  else if (strcmp(syncPrim_str, "lfTreeBarrUniq") == 0) { syncPrim = 2; }
+  else if (strcmp(syncPrim_str, "lfTreeBarrUniqLocalExch") == 0) {
+    syncPrim = 3;
+  }
+  else if (strcmp(syncPrim_str, "spinMutex") == 0) { syncPrim = 4; }
+  else if (strcmp(syncPrim_str, "spinMutexEBO") == 0) { syncPrim = 5; }
+  else if (strcmp(syncPrim_str, "sleepMutex") == 0) { syncPrim = 6; }
+  else if (strcmp(syncPrim_str, "faMutex") == 0) { syncPrim = 7; }
+  else if (strcmp(syncPrim_str, "spinSem1") == 0) { syncPrim = 8; }
+  else if (strcmp(syncPrim_str, "spinSem2") == 0) { syncPrim = 9; }
+  else if (strcmp(syncPrim_str, "spinSem10") == 0) { syncPrim = 10; }
+  else if (strcmp(syncPrim_str, "spinSem120") == 0) { syncPrim = 11; }
+  else if (strcmp(syncPrim_str, "spinSemEBO1") == 0) { syncPrim = 12; }
+  else if (strcmp(syncPrim_str, "spinSemEBO2") == 0) { syncPrim = 13; }
+  else if (strcmp(syncPrim_str, "spinSemEBO10") == 0) { syncPrim = 14; }
+  else if (strcmp(syncPrim_str, "spinSemEBO120") == 0) { syncPrim = 15; }
+  // cases 16-19 reserved
+  else if (strcmp(syncPrim_str, "spinMutexUniq") == 0) { syncPrim = 20; }
+  else if (strcmp(syncPrim_str, "spinMutexEBOUniq") == 0) { syncPrim = 21; }
+  else if (strcmp(syncPrim_str, "sleepMutexUniq") == 0) { syncPrim = 22; }
+  else if (strcmp(syncPrim_str, "faMutexUniq") == 0) { syncPrim = 23; }
+  else if (strcmp(syncPrim_str, "spinSemUniq1") == 0) { syncPrim = 24; }
+  else if (strcmp(syncPrim_str, "spinSemUniq2") == 0) { syncPrim = 25; }
+  else if (strcmp(syncPrim_str, "spinSemUniq10") == 0) { syncPrim = 26; }
+  else if (strcmp(syncPrim_str, "spinSemUniq120") == 0) { syncPrim = 27; }
+  else if (strcmp(syncPrim_str, "spinSemEBOUniq1") == 0) { syncPrim = 28; }
+  else if (strcmp(syncPrim_str, "spinSemEBOUniq2") == 0) { syncPrim = 29; }
+  else if (strcmp(syncPrim_str, "spinSemEBOUniq10") == 0) { syncPrim = 30; }
+  else if (strcmp(syncPrim_str, "spinSemEBOUniq120") == 0) { syncPrim = 31; }
+  // cases 32-36 reserved
+  else
+  {
+    fprintf(stderr, "ERROR: Unknown synchronization primitive: %s\n",
+            syncPrim_str);
+    exit(-1);
+  }
+
+  // multiply the number of mutexes and semaphores by NUM_CU to
+  // allow per-CU locks
+  hipLocksInit(MAX_WGS, 8 * NUM_CU, 24 * NUM_CU, pageAlign, NUM_CU, NUM_REPEATS, NUM_ITERS);
+
+  hipErr = hipGetLastError();
+  checkError(hipErr, "After hipLocksInit");
+
+  /*
+    The barriers need a per-CU barrier that is not part of the global synch
+    structure.  In terms of size, for the lock-free barrier there are 2 arrays
+    in here -- inVars and outVars.  Each needs to be sized to hold the maximum
+    number of WGs/CU and each CU needs an array.
+
+    The atomic barrier per-CU synchronization fits inside the lock-free size
+    requirements so we can reuse the same locations.
+  */
+  unsigned int * perCUBarriers = (unsigned int *)malloc(sizeof(unsigned int) * (NUM_CU * MAX_WGS * 2));
+
+  int numLocsMult = 0;
+  // barriers and unique semaphores have numWGs WGs accessing unique locations
+  if ((syncPrim < 4) ||
+      ((syncPrim >= 24) && (syncPrim <= 35))) { numLocsMult = numWGs; }
+  // In the non-unique mutex microbenchmarks, all WGs access the same
+  // locations, so the multiplier is 1
+  else if ((syncPrim >= 4) && (syncPrim <= 7)) { numLocsMult = 1; }
+  // The non-unique semaphores have 1 writer and numWGs_perCU - 1 readers per CU
+  // so the multiplier is numWGs_perCU
+  else if ((syncPrim >= 8) && (syncPrim <= 19)) { numLocsMult = numWGs_perCU; }
+  // For the unique mutex microbenchmarks and condition variable, all WGs on
+  // the same CU access the same data, so the multiplier is NUM_CU (or numWGs,
+  // if there are fewer WGs than CUs).
+  else if (((syncPrim >= 20) && (syncPrim <= 23)) ||
+           (syncPrim == 36)) { numLocsMult = ((numWGs < NUM_CU) ?
+                                              numWGs : NUM_CU); }
+  else { // should never reach here
+    fprintf(stderr, "ERROR: Unknown syncPrim: %u\n", syncPrim);
+    exit(-1);
+  }
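+  // e.g., a spinSem10 run (syncPrim 10) with 4 WGs per CU uses a multiplier of
+  // numWGs_perCU == 4, while a spinMutex run (syncPrim 4) uses a multiplier of 1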
+
+  // each thread in a WG accesses NUM_LDST locations, but the accesses are
+  // offset per WI so that each subsequent access depends on the previous
+  // one -- thus an extra location is needed per WI.
+  int numUniqLocsAccPerWG = (NUM_WIS_PER_WG * (NUM_LDST + 1));
+  assert(numUniqLocsAccPerWG > 0);
+  int numStorageLocs = (numLocsMult * numUniqLocsAccPerWG);
+  assert(numStorageLocs > 0);
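+  // For illustration: assuming NUM_WIS_PER_WG == 64 and NUM_LDST == 10, each
+  // WG accesses 64 * 11 == 704 unique locations, and numStorageLocs is that
+  // times numLocsMult.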
+  float * storage = (float *)malloc(sizeof(float) * numStorageLocs);
+
+  fprintf(stdout, "# WGs: %d, # Ld/St: %d, # Locs Mult: %d, # Uniq Locs/WG: %d, # Storage Locs: %d\n", numWGs, NUM_LDST, numLocsMult, numUniqLocsAccPerWG, numStorageLocs);
+
+  // initialize storage
+  for (int i = 0; i < numStorageLocs; ++i) { storage[i] = i; }
+  // initialize per-CU barriers to 0's
+  for (int i = 0; i < (NUM_CU * MAX_WGS * 2); ++i) { perCUBarriers[i] = 0; }
+
+  // gpu copies of storage and perCUBarriers
+  //float elapsedTime = 0.0f;
+  unsigned int * perCUBarriers_d = NULL;
+  float * storage_d = NULL;
+
+  hipMalloc(&perCUBarriers_d, sizeof(unsigned int) * (NUM_CU * MAX_WGS * 2));
+  hipMalloc(&storage_d, sizeof(float) * numStorageLocs);
+
+  hipMemcpy(perCUBarriers_d, perCUBarriers, sizeof(unsigned int) * (NUM_CU * MAX_WGS * 2), hipMemcpyHostToDevice);
+  hipMemcpy(storage_d, storage, sizeof(float) * numStorageLocs, hipMemcpyHostToDevice);
+
+  // lock variables
+  hipMutex_t spinMutex, faMutex, sleepMutex, eboMutex;
+  hipMutex_t spinMutex_uniq, faMutex_uniq, sleepMutex_uniq, eboMutex_uniq;
+  hipSemaphore_t spinSem1, eboSem1,
+                  spinSem2, eboSem2,
+                  spinSem10, eboSem10,
+                  spinSem120, eboSem120;
+  hipSemaphore_t spinSem1_uniq, eboSem1_uniq,
+                  spinSem2_uniq, eboSem2_uniq,
+                  spinSem10_uniq, eboSem10_uniq,
+                  spinSem120_uniq, eboSem120_uniq;
+  switch (syncPrim) {
+    case 0: // atomic tree barrier doesn't require any special fields to be
+            // created
+      printf("atomic_tree_barrier_%03d\n", NUM_ITERS); fflush(stdout);
+      break;
+    case 1: // atomic tree barrier with local exchange doesn't require any
+            // special fields to be created
+      printf("atomic_tree_barrier_localExch_%03d\n", NUM_ITERS); fflush(stdout);
+      break;
+    case 2: // lock-free tree barrier doesn't require any special fields to be
+            // created
+      printf("fbs_tree_barrier_%03d\n", NUM_ITERS); fflush(stdout);
+      break;
+    case 3: // lock-free barrier with local exchange doesn't require any
+            // special fields to be created
+      printf("fbs_tree_barrier_localExch_%03d\n", NUM_ITERS); fflush(stdout);
+      break;
+    case 4:
+      printf("spin_lock_mutex_%03d\n", NUM_ITERS); fflush(stdout);
+      hipMutexCreateSpin     (&spinMutex,          0);
+      break;
+    case 5:
+      printf("ebo_mutex_%03d\n", NUM_ITERS); fflush(stdout);
+      hipMutexCreateEBO      (&eboMutex,           1);
+      break;
+    case 6:
+      printf("sleeping_mutex_%03d\n", NUM_ITERS); fflush(stdout);
+      hipMutexCreateSleep    (&sleepMutex,         2);
+      break;
+    case 7:
+      printf("fetchadd_mutex_%03d\n", NUM_ITERS); fflush(stdout);
+      hipMutexCreateFA       (&faMutex,            3);
+      break;
+    case 8:
+      printf("spin_lock_sem_%03d_%03d\n", 1, NUM_ITERS); fflush(stdout);
+      hipSemaphoreCreateSpin (&spinSem1,      0,   1, NUM_CU);
+      break;
+    case 9:
+      printf("spin_lock_sem_%03d_%03d\n", 2, NUM_ITERS); fflush(stdout);
+      hipSemaphoreCreateSpin (&spinSem2,      1,   2, NUM_CU);
+      break;
+    case 10:
+      printf("spin_lock_sem_%03d_%03d\n", 10, NUM_ITERS); fflush(stdout);
+      hipSemaphoreCreateSpin (&spinSem10,      2,   10, NUM_CU);
+      break;
+    case 11:
+      printf("spin_lock_sem_%03d_%03d\n", 2, NUM_ITERS); fflush(stdout);
+      hipSemaphoreCreateSpin (&spinSem120,      3,   120, NUM_CU);
+      break;
+    case 12:
+      printf("ebo_sem_%03d_%03d\n", 1, NUM_ITERS); fflush(stdout);
+      hipSemaphoreCreateEBO  (&eboSem1,       4,   1, NUM_CU);
+      break;
+    case 13:
+      printf("ebo_sem_%03d_%03d\n", 2, NUM_ITERS); fflush(stdout);
+      hipSemaphoreCreateEBO  (&eboSem2,       5,   2, NUM_CU);
+      break;
+    case 14:
+      printf("ebo_sem_%03d_%03d\n", 10, NUM_ITERS); fflush(stdout);
+      hipSemaphoreCreateEBO  (&eboSem10,       6,   10, NUM_CU);
+      break;
+    case 15:
+      printf("ebo_sem_%03d_%03d\n", 120, NUM_ITERS); fflush(stdout);
+      hipSemaphoreCreateEBO  (&eboSem120,       7,   120, NUM_CU);
+      break;
+    // cases 16-19 reserved
+    case 16:
+      break;
+    case 17:
+      break;
+    case 18:
+      break;
+    case 19:
+      break;
+    case 20:
+      printf("spin_lock_mutex_uniq_%03d\n", NUM_ITERS); fflush(stdout);
+      hipMutexCreateSpin     (&spinMutex_uniq,          4);
+      break;
+    case 21:
+      printf("ebo_mutex_uniq_%03d\n", NUM_ITERS); fflush(stdout);
+      hipMutexCreateEBO      (&eboMutex_uniq,           5);
+      break;
+    case 22:
+      printf("sleeping_mutex_uniq_%03d\n", NUM_ITERS); fflush(stdout);
+      hipMutexCreateSleep    (&sleepMutex_uniq,         6);
+      break;
+    case 23:
+      printf("fetchadd_mutex_uniq_%03d\n", NUM_ITERS); fflush(stdout);
+      hipMutexCreateFA       (&faMutex_uniq,            7);
+      break;
+    case 24:
+      printf("spin_lock_sem_uniq_%03d_%03d\n", 1, NUM_ITERS); fflush(stdout);
+      hipSemaphoreCreateSpin (&spinSem1_uniq,      12,   1, NUM_CU);
+      break;
+    case 25:
+      printf("spin_lock_sem_uniq_%03d_%03d\n", 2, NUM_ITERS); fflush(stdout);
+      hipSemaphoreCreateSpin (&spinSem2_uniq,      13,   2, NUM_CU);
+      break;
+    case 26:
+      printf("spin_lock_sem_uniq_%03d_%03d\n", 10, NUM_ITERS); fflush(stdout);
+      hipSemaphoreCreateSpin (&spinSem10_uniq,      14,   10, NUM_CU);
+      break;
+    case 27:
+      printf("spin_lock_sem_uniq_%03d_%03d\n", 2, NUM_ITERS); fflush(stdout);
+      hipSemaphoreCreateSpin (&spinSem120_uniq,      15,   120, NUM_CU);
+      break;
+    case 28:
+      printf("ebo_sem_uniq_%03d_%03d\n", 1, NUM_ITERS); fflush(stdout);
+      hipSemaphoreCreateEBO  (&eboSem1_uniq,       16,   1, NUM_CU);
+      break;
+    case 29:
+      printf("ebo_sem_uniq_%03d_%03d\n", 2, NUM_ITERS); fflush(stdout);
+      hipSemaphoreCreateEBO  (&eboSem2_uniq,       17,   2, NUM_CU);
+      break;
+    case 30:
+      printf("ebo_sem_uniq_%03d_%03d\n", 10, NUM_ITERS); fflush(stdout);
+      hipSemaphoreCreateEBO  (&eboSem10_uniq,       18,   10, NUM_CU);
+      break;
+    case 31:
+      printf("ebo_sem_uniq_%03d_%03d\n", 120, NUM_ITERS); fflush(stdout);
+      hipSemaphoreCreateEBO  (&eboSem120_uniq,       19,   120, NUM_CU);
+      break;
+    // cases 32-36 reserved
+    case 32:
+      break;
+    case 33:
+      break;
+    case 34:
+      break;
+    case 35:
+      break;
+    case 36:
+      break;
+    default:
+      fprintf(stderr, "ERROR: Trying to run synch prim #%u, but only 0-36 are "
+              "supported\n", syncPrim);
+      exit(-1);
+      break;
+  }
+
+  // # WGs must be < maxBufferSize or sleep mutex ring buffer won't work
+  if ((syncPrim == 6) || (syncPrim == 22)) {
+    assert(MAX_WGS <= cpuLockData->maxBufferSize);
+  }
+
+  // NOTE: region of interest begins here
+  hipDeviceSynchronize();
+
+  switch (syncPrim) {
+    case 0: // atomic tree barrier
+      invokeAtomicTreeBarrier(storage_d, perCUBarriers_d, NUM_ITERS);
+      break;
+    case 1: // atomic tree barrier with local exchange
+      invokeAtomicTreeBarrierLocalExch(storage_d, perCUBarriers_d, NUM_ITERS);
+      break;
+    case 2: // lock-free barrier
+      invokeFBSTreeBarrier(storage_d, perCUBarriers_d, NUM_ITERS);
+      break;
+    case 3: // lock-free barrier with local exchange
+      invokeFBSTreeBarrierLocalExch(storage_d, perCUBarriers_d, NUM_ITERS);
+      break;
+    case 4: // Spin Lock Mutex
+      invokeSpinLockMutex   (spinMutex,  storage_d, NUM_ITERS);
+      break;
+    case 5: // Spin Lock Mutex with backoff
+      invokeEBOMutex        (eboMutex,   storage_d, NUM_ITERS);
+      break;
+    case 6: // Sleeping Mutex
+      invokeSleepingMutex   (sleepMutex, storage_d, NUM_ITERS);
+      break;
+    case 7: // fetch-and-add mutex
+      invokeFetchAndAddMutex(faMutex,    storage_d, NUM_ITERS);
+      break;
+    case 8: // spin semaphore (1)
+      invokeSpinLockSemaphore(spinSem1,   storage_d,   1, NUM_ITERS, numStorageLocs);
+      break;
+    case 9: // spin semaphore (2)
+      invokeSpinLockSemaphore(spinSem2,   storage_d,   2, NUM_ITERS, numStorageLocs);
+      break;
+    case 10: // spin semaphore (10)
+      invokeSpinLockSemaphore(spinSem10,   storage_d,   10, NUM_ITERS, numStorageLocs);
+      break;
+    case 11: // spin semaphore (120)
+      invokeSpinLockSemaphore(spinSem120,   storage_d,   120, NUM_ITERS, numStorageLocs);
+      break;
+    case 12: // spin semaphore with backoff (1)
+      invokeEBOSemaphore(eboSem1,   storage_d,     1, NUM_ITERS, numStorageLocs);
+      break;
+    case 13: // spin semaphore with backoff (2)
+      invokeEBOSemaphore(eboSem2,   storage_d,     2, NUM_ITERS, numStorageLocs);
+      break;
+    case 14: // spin semaphore with backoff (10)
+      invokeEBOSemaphore(eboSem10,   storage_d,   10, NUM_ITERS, numStorageLocs);
+      break;
+    case 15: // spin semaphore with backoff (120)
+      invokeEBOSemaphore(eboSem120,   storage_d, 120, NUM_ITERS, numStorageLocs);
+      break;
+    // cases 16-19 reserved
+    case 16:
+      break;
+    case 17:
+      break;
+    case 18:
+      break;
+    case 19:
+      break;
+    case 20: // Spin Lock Mutex (uniq)
+      invokeSpinLockMutex_uniq   (spinMutex_uniq,  storage_d, NUM_ITERS);
+      break;
+    case 21: // Spin Lock Mutex with backoff (uniq)
+      invokeEBOMutex_uniq        (eboMutex_uniq,   storage_d, NUM_ITERS);
+      break;
+    case 22: // Sleeping Mutex (uniq)
+      invokeSleepingMutex_uniq   (sleepMutex_uniq, storage_d, NUM_ITERS);
+      break;
+    case 23: // fetch-and-add mutex (uniq)
+      invokeFetchAndAddMutex_uniq(faMutex_uniq,    storage_d, NUM_ITERS);
+      break;
+    case 24: // spin semaphore (1) (uniq)
+      invokeSpinLockSemaphore_uniq(spinSem1_uniq,   storage_d,   1, NUM_ITERS);
+      break;
+    case 25: // spin semaphore (2) (uniq)
+      invokeSpinLockSemaphore_uniq(spinSem2_uniq,   storage_d,   2, NUM_ITERS);
+      break;
+    case 26: // spin semaphore (10) (uniq)
+      invokeSpinLockSemaphore_uniq(spinSem10_uniq,   storage_d,   10, NUM_ITERS);
+      break;
+    case 27: // spin semaphore (120) (uniq)
+      invokeSpinLockSemaphore_uniq(spinSem120_uniq,   storage_d,   120, NUM_ITERS);
+      break;
+    case 28: // spin semaphore with backoff (1) (uniq)
+      invokeEBOSemaphore_uniq(eboSem1_uniq,   storage_d,     1, NUM_ITERS);
+      break;
+    case 29: // spin semaphore with backoff (2) (uniq)
+      invokeEBOSemaphore_uniq(eboSem2_uniq,   storage_d,     2, NUM_ITERS);
+      break;
+    case 30: // spin semaphore with backoff (10) (uniq)
+      invokeEBOSemaphore_uniq(eboSem10_uniq,   storage_d,   10, NUM_ITERS);
+      break;
+    case 31: // spin semaphore with backoff (120) (uniq)
+      invokeEBOSemaphore_uniq(eboSem120_uniq,   storage_d, 120, NUM_ITERS);
+      break;
+    // cases 32-36 reserved
+    case 32:
+      break;
+    case 33:
+      break;
+    case 34:
+      break;
+    case 35:
+      break;
+    case 36:
+      break;
+    default:
+      fprintf(stderr,
+              "ERROR: Trying to run synch prim #%u, but only 0-36 are "
+              "supported\n",
+              syncPrim);
+      exit(-1);
+      break;
+  }
+
+  // NOTE: Can end the simulation here if you don't care about output checking
+  hipDeviceSynchronize();
+
+  // copy results back to compare to golden
+  hipMemcpy(storage, storage_d, sizeof(float) * numStorageLocs, hipMemcpyDeviceToHost);
+
+  // get golden results
+  float storageGolden[numStorageLocs];
+  int numLocsAccessed = 0, currLoc = 0;
+  // initialize
+  for (int i = 0; i < numStorageLocs; ++i) { storageGolden[i] = i; }
+
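+  // replay on the host the same access pattern the selected kernel performed
+  // on the device so the copied-back device results can be checked against it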
+  for (int repeat = 0; repeat < NUM_REPEATS; ++repeat)
+  {
+    for (int j = 0; j < NUM_ITERS; ++j)
+    {
+      /*
+        The barrier algorithms exchange data across CUs, so we need to perform
+        the exchanges in the golden code.
+
+        The barrier algorithms with local exchange exchange data both across
+        CUs and across WGs within a CU, so both exchanges need to be performed
+        in the golden code.
+      */
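+      /*
+        Worked example (hypothetical values): with numWGs = 4 and
+        numUniqLocsAccPerWG = 10, the barrier variants below access
+        numLocsAccessed = 40 locations; on critical-section iteration j = 1,
+        location i = 5 maps to currLoc = (5 + 1*10) % 40 = 15, and i = 35
+        wraps to currLoc = (35 + 1*10) % 40 = 5.
+      */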
+      if (syncPrim < 4)
+      {
+        // Some kernels only access a fraction of the total # of locations;
+        // determine how many locations are accessed by each kernel here.
+        numLocsAccessed = (numWGs * numUniqLocsAccPerWG);
+
+        // first cache line of words isn't written to
+        for (int i = (numLocsAccessed-1); i >= 0; --i)
+        {
+          // every iteration of the critical section, the location being
+          // accessed is shifted by numUniqLocsAccPerWG
+          currLoc = (i + (j * numUniqLocsAccPerWG)) % numLocsAccessed;
+
+          accessData_golden(storageGolden, currLoc, numStorageLocs);
+        }
+
+        // local exchange versions do additional accesses when there are
+        // multiple WGs on a CU
+        if ((syncPrim == 1) || (syncPrim == 3))
+        {
+          if (numWGs_perCU > 1)
+          {
+            for (int i = (numLocsAccessed-1); i >= 0; --i)
+            {
+              // advance the current location by the number of unique locations
+              // accessed by a CU (mod the number of memory locations accessed)
+              currLoc = (i + (numWGs_perCU * numUniqLocsAccPerWG)) %
+                        numLocsAccessed;
+              // every iteration of the critical section, the location being
+              // accessed is also shifted by numUniqLocsAccPerWG
+              currLoc = (currLoc + (j * numUniqLocsAccPerWG)) %
+                        numLocsAccessed;
+
+              accessData_golden(storageGolden, currLoc, numStorageLocs);
+            }
+          }
+        }
+      }
+      /*
+        In the non-unique mutex microbenchmarks (4-7), all WGs on all CUs access
+        the same locations.
+      */
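+      /*
+        e.g., with numWGs = 4 and numUniqLocsAccPerWG = 10 (hypothetical
+        values), locations 0-9 are each accessed once per WG, i.e., four
+        times per critical-section iteration.
+      */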
+      else if ((syncPrim >= 4) && (syncPrim <= 7))
+      {
+        // need to iterate over the locations for each WG since all WGs
+        // access the same locations
+        for (int WG = 0; WG < numWGs; ++WG)
+        {
+          for (int i = (numUniqLocsAccPerWG-1); i >= 0; --i)
+          {
+            accessData_golden(storageGolden, i, numStorageLocs);
+          }
+        }
+      }
+      /*
+        In the non-unique semaphore microbenchmarks (8-19), 1 "writer" WG
+        per CU writes all the locations accessed by that CU (i.e.,
+        numUniqLocsAccPerWG * numWGs_perCU).  Moreover, all writer WGs across
+        all CUs access the same locations.
+      */
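+      /*
+        e.g., assuming numWGs = 4, NUM_CU = 2, and numWGs_perCU = 2
+        (hypothetical values), WGs are assigned round-robin: WGs 0 and 2 run
+        on CU 0, WGs 1 and 3 on CU 1.  The writer check below then selects
+        WG 0 (perCU_wgID 0 on CU 0) and WG 3 (perCU_wgID 1 on CU 1), i.e.,
+        one writer per CU.
+      */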
+      else if ((syncPrim <= 19) && (syncPrim >= 8))
+      {
+        int cuID = 0, perCU_wgID = 0;
+        const int numCU = ((numWGs < NUM_CU) ? numWGs : NUM_CU);
+        bool isWriter = false;
+
+        // need to iterate over the locations for each WG since all WGs
+        // access the same locations
+        for (int wg = 0; wg < numWGs; ++wg)
+        {
+          cuID = (wg % numCU);
+          perCU_wgID = (wg / numCU);
+          // which WG is the writer varies per CU
+          isWriter = (perCU_wgID == (cuID % numWGs_perCU));
+
+          if (isWriter)
+          {
+            for (int k = 0; k < numWGs_perCU; ++k)
+            {
+              // first cache line of words isn't written to
+              for (int i = (numUniqLocsAccPerWG-1); i >= 0; --i)
+              {
+                /*
+                  The locations the writer is writing are numUniqLocsAccPerWG
+                  apart because the WGs are assigned in round-robin fashion.
+                  Thus, need to shift the location accordingly.
+                */
+                currLoc = (i + (k * numUniqLocsAccPerWG)) % numStorageLocs;
+                accessData_golden(storageGolden, currLoc, numStorageLocs);
+              }
+            }
+          }
+        }
+      }
+      /*
+        In the unique mutex microbenchmarks (20-23), all WGs on a CU access
+        the same data and the data accessed by each CU is unique.
+      */
+      else if ((syncPrim <= 23) && (syncPrim >= 20))
+      {
+        // Some kernels only access a fraction of the total # of locations;
+        // determine how many locations are accessed by each kernel here.
+        numLocsAccessed = (numWGs * numUniqLocsAccPerWG);
+
+        // first cache line of words isn't written to
+        for (int i = (numLocsAccessed-1); i >= 0; --i)
+        {
+          /*
+            If this location would be accessed by a WG other than the first
+            WG on a CU, wrap around and access the same location as the
+            first WG on the CU -- this holds only for the mutexes; for
+            semaphores it does not.
+          */
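+          // e.g., with NUM_CU = 2 and numUniqLocsAccPerWG = 10 (hypothetical
+          // values), location i = 25 wraps to currLoc = 25 % 20 = 5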
+          currLoc = i % (NUM_CU * numUniqLocsAccPerWG);
+
+          accessData_golden(storageGolden, currLoc, numStorageLocs);
+        }
+      }
+      /*
+        In the unique semaphore microbenchmarks (24-35), 1 "writer" WG per
+        CU writes all the locations accessed by that CU, but each CU accesses
+        unique data.  We model this behavior by accessing all of the data
+        accessed by all CUs, since this has the same effect (assuming same
+        number of WGs/CU).
+      */
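+      /*
+        e.g., with numWGs = 4 and numUniqLocsAccPerWG = 10 (hypothetical
+        values), each of the 40 accessed locations is touched once per
+        critical-section iteration.
+      */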
+      else
+      {
+        // Some kernels only access a fraction of the total # of locations;
+        // determine how many locations are accessed by each kernel here.
+        numLocsAccessed = (numWGs * numUniqLocsAccPerWG);
+
+        // first cache line of words isn't written to
+        for (int i = (numLocsAccessed-1); i >= 0; --i)
+        {
+          accessData_golden(storageGolden, i, numStorageLocs);
+        }
+      }
+    }
+  }
+
+  fprintf(stdout, "Comparing GPU results to golden results:\n");
+  unsigned int numErrors = 0;
+  // check the output values
+  for (int i = 0; i < numStorageLocs; ++i)
+  {
+    if (std::abs(storage[i] - storageGolden[i]) > 1E-5)
+    {
+      fprintf(stderr, "\tERROR: storage[%d] = %f, golden[%d] = %f\n", i,
+              storage[i], i, storageGolden[i]);
+      ++numErrors;
+    }
+  }
+  if (numErrors > 0)
+  {
+    fprintf(stderr, "ERROR: %s has %u output errors\n", syncPrim_str,
+            numErrors);
+    exit(-1);
+  }
+  else { fprintf(stdout, "PASSED!\n"); }
+
+  // free arrays
+  hipLocksDestroy();
+  hipFree(storage_d);
+  hipFree(perCUBarriers_d);
+  free(storage);
+  free(perCUBarriers);
+
+  return 0;
+}
diff --git a/src/heterosync/src/syncPrims_results.sh b/src/heterosync/src/syncPrims_results.sh
new file mode 100644
index 0000000..27f7d75
--- /dev/null
+++ b/src/heterosync/src/syncPrims_results.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+
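+# Sweep each synchronization primitive in SYNCPRIMS over the load/store counts
+# in NUM_LDST and the WG counts in NUM_WGS, running every configuration
+# NUM_RUNS times and printing the individual and average runtimes.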
+NUM_RUNS=1
+NUM_CS_ITERS=10
+EXECUTABLES="allSyncPrims-1kernel"
+SYNCPRIMS="atomicTreeBarrUniq atomicTreeBarrUniqLocalExch lfTreeBarrUniq lfTreeBarrUniqLocalExch spinMutex spinMutexEBO faMutex sleepMutex spinSem10 spinSemEBO10 spinMutexUniq spinMutexEBOUniq faMutexUniq sleepMutexUniq spinSemUniq10 spinSemEBOUniq10" # syncPrims to run
+#NUM_LDST="10 100 1000"
+NUM_LDST="10"
+# This GPU has 2 CUs, so we want 1 WG/CU, 2 WG/CU, 4 WG/CU, and 8 WG/CU (max allowed)
+NUM_WGS="2 4 8 16"
+
+# do the prescribed number of runs for each executable and print out all runtimes
+for executable in $EXECUTABLES;
+do 
+    echo "Beginning $executable's tests"
+
+    for syncPrim in $SYNCPRIMS;
+    do
+        echo -e "\tsyncPrim = $syncPrim"
+
+        for numLdSt in $NUM_LDST
+        do
+            echo -e "\t\tnumLdSt = $numLdSt"
+
+            for numWGs in $NUM_WGS
+            do
+                echo -e "\t\t\tnumWGs = $numWGs"
+
+                # ./allSyncPrims-1kernel <syncPrim> <numLdSt> <numWGs> <numCSIters>
+                echo -e "\t\t\t\t./$executable $syncPrim $numLdSt $numWGs $NUM_CS_ITERS"
+                for (( j=0; j<$NUM_RUNS; j++ ))
+                do
+                    #echo -e "\t\t\tRun $j"
+                    duration[$j]=0
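+                    # Pull the reported time out of the benchmark output:
+                    # take the third ':'-separated field of the "average"
+                    # line and keep only the text before the first 'm'
+                    # (assumed to be the start of the time unit).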
+                    duration[$j]=`./$executable $syncPrim $numLdSt $numWGs $NUM_CS_ITERS | grep "average" | cut -f3 -d: | cut -f1 -dm`
+                    #echo -e "\t\t\t\tduration = ${duration[$j]}"
+                done
+
+                # print out all runtimes, comma separated, for this executable
+                echo -n -e "\t\t\t\t\tAll durations for $executable: "
+                for (( j=0; j<$NUM_RUNS; j++ ))
+                do
+                    echo -n "${duration[$j]}, "
+                done
+                echo "" # newline
+
+                # average the durations across runs for this configuration
+                threadnum_duration=0
+                for (( j=0; j<$NUM_RUNS; j++ ))
+                do
+                    #echo -e "\tBeginning $j""th run's write"
+                    threadnum_duration=`echo $threadnum_duration"+"${duration[$j]}"/"$NUM_RUNS|bc -l`
+                done
+                echo -e "\t\t\t\tAverage = $threadnum_duration"
+            done
+        done
+    done
+done
+