documentation: Update circular buffer for load-acquire/store-release

This commit replaces the explicit memory barriers (smp_wmb(), smp_rmb(),
and smp_mb()) in the circular-buffer example code with targeted uses of
smp_load_acquire() and smp_store_release().
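
For illustration, a minimal sketch of the acquire/release pairing that
the updated example relies on.  The struct message, msg, and consume()
names are invented for this sketch and are not part of the document;
smp_store_release() and smp_load_acquire() are the kernel primitives
themselves:

	struct message {
		int payload;	/* data being handed off */
		int ready;	/* publication flag */
	};

	/* Producer: fill in the payload, then release-store the flag.
	 * The release guarantees that the payload write is visible
	 * before the flag write. */
	msg->payload = 42;
	smp_store_release(&msg->ready, 1);

	/* Consumer: acquire-load the flag.  The acquire guarantees
	 * that the flag read is ordered before the payload read that
	 * follows it. */
	if (smp_load_acquire(&msg->ready))
		consume(msg->payload);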

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
[ paulmck: Restore comments as suggested by David Howells. ]
diff --git a/Documentation/circular-buffers.txt b/Documentation/circular-buffers.txt
index a36bed3..88951b1 100644
--- a/Documentation/circular-buffers.txt
+++ b/Documentation/circular-buffers.txt
@@ -160,6 +160,7 @@
 	spin_lock(&producer_lock);
 
 	unsigned long head = buffer->head;
+	/* The spin_unlock() and next spin_lock() provide needed ordering. */
 	unsigned long tail = ACCESS_ONCE(buffer->tail);
 
 	if (CIRC_SPACE(head, tail, buffer->size) >= 1) {
@@ -168,9 +169,8 @@
 
 		produce_item(item);
 
-		smp_wmb(); /* commit the item before incrementing the head */
-
-		ACCESS_ONCE(buffer->head) = (head + 1) & (buffer->size - 1);
+		smp_store_release(&buffer->head,
+				  (head + 1) & (buffer->size - 1));
 
 		/* wake_up() will make sure that the head is committed before
 		 * waking anyone up */
@@ -200,21 +200,20 @@
 
 	spin_lock(&consumer_lock);
 
-	unsigned long head = ACCESS_ONCE(buffer->head);
+	/* Read index before reading contents at that index. */
+	unsigned long head = smp_load_acquire(&buffer->head);
 	unsigned long tail = buffer->tail;
 
 	if (CIRC_CNT(head, tail, buffer->size) >= 1) {
-		/* read index before reading contents at that index */
-		smp_rmb();
 
 		/* extract one item from the buffer */
 		struct item *item = buffer[tail];
 
 		consume_item(item);
 
-		smp_mb(); /* finish reading descriptor before incrementing tail */
-
-		ACCESS_ONCE(buffer->tail) = (tail + 1) & (buffer->size - 1);
+		/* Finish reading descriptor before incrementing tail. */
+		smp_store_release(&buffer->tail,
+				  (tail + 1) & (buffer->size - 1));
 	}
 
 	spin_unlock(&consumer_lock);
@@ -223,15 +222,17 @@
 the new item, and then it shall make sure the CPU has finished reading the item
 before it writes the new tail pointer, which will erase the item.
 
-
-Note the use of ACCESS_ONCE() in both algorithms to read the opposition index.
-This prevents the compiler from discarding and reloading its cached value -
-which some compilers will do across smp_read_barrier_depends().  This isn't
-strictly needed if you can be sure that the opposition index will _only_ be
-used the once.  Similarly, ACCESS_ONCE() is used in both algorithms to
-write the thread's index.  This documents the fact that we are writing
-to something that can be read concurrently and also prevents the compiler
-from tearing the store.
+Note the use of ACCESS_ONCE() and smp_load_acquire() to read the
+opposition index.  This prevents the compiler from discarding and
+reloading its cached value - which some compilers will do across
+smp_read_barrier_depends().  This isn't strictly needed if you can
+be sure that the opposition index will _only_ be used once.
+The smp_load_acquire() additionally forces the CPU to order the
+load of the index before all subsequent memory references.
+Similarly, smp_store_release() is used in both algorithms to write
+the thread's index.  This documents the fact that we are writing to
+something that can be read concurrently, prevents the compiler from
+tearing the store, and enforces ordering against all prior accesses.
 
 
 ===============