src/parsec/disk-image/parsec/parsec-benchmark/pkgs/apps/raytrace/src/doc/demo_code_snippets/fasttransfers.cpp - public/gem5-resources - Git at Google

 /*
  *
  * Example code to demonstrate the benefits of PBOs for fast
  * transfers to the GPU.
  *
  * (c) 2006 Dominik Göddeke, University of Dortmund
  * dominik.goeddeke@math.uni-dortmund.de
  *
  * For details, please refer to the tutorial available at
  * www.mathematik.uni-dortmund.de/~goeddeke/gpgpu/tutorial3.html
  *
  */

 #include <stdio.h>
 #include <stdlib.h>
 #include <GL/glew.h>
 #include <GL/glut.h>
 #include <Cg/cg.h>
 #include <Cg/cgGL.h>
 #include <time.h>
 #include <assert.h>


 /*********************************************************
  *    macros, problem size definition etc.               *
  *********************************************************/


 // PBO macro (see spec for details)
 #define BUFFER_OFFSET(i) ((char *)NULL + (i))


 // array size N is texsize*texsize
 static const int texSize = 1025;
 static const int N = texSize*texSize;

 // hard-coded for shader + Cg, do not change this
 static const int numArraysPerChunk = 9;
 // input data size, default setting results in approx 1.6 GB of phys mem.
 static const int numChunks = 20;
 // number of iterations per chunk to emulate kernel workload
 static const int numKernelSteps = 1;
 // number of iterations the test is repeated, to reduce timing noise.
 static const int numIterations = 10;


 // uncomment the following to check for GL and Cg errors
 // Note that enabling this creates an (artificial) sync point (by glGetError())
 //#define DEBUGDEBUG


 //
 // Enable this #define to enable PBOs for glTexSubImage() and glReadPixels().
 // Otherwise, PBOs are not used.
 //
 #define USE_PBO


 // my own memcheck macro, only works on linux. Use with care, this will
 // slow down the application significantly.
 #define MEMCHECK() system("top -b -n1 | grep test | awk '{print $5\" \"$6}'")


 /****************************************************************************/

 //
 // forward declaration of some functions
 //
 void cgErrorCallback(void);
 void checkErrors(const char *label);
 void copy(double* in, float* out);
 void copy(double* in, void* out);
 void copy(void* in, double* out);


 /*****************************************************/

 //
 // fragment program: out = accum + vec
 //
 const char* kernelSource = " float main(                    "\
         "            uniform samplerRECT accum,             " \
         "            uniform samplerRECT vec1,              " \
         "            uniform samplerRECT vec2,              " \
         "            uniform samplerRECT vec3,              " \
         "            uniform samplerRECT vec4,              " \
         "            uniform samplerRECT vec5,              " \
         "            uniform samplerRECT vec6,              " \
         "            uniform samplerRECT vec7,              " \
         "            uniform samplerRECT vec8,              " \
         "            uniform samplerRECT vec9,              " \
         "            in float4 coords:TEXCOORD0) : COLOR {  " \
     "  return texRECT(accum,coords.xy) +                " \
         "  texRECT(vec1,coords.xy)  +                       " \
         "  texRECT(vec2,coords.xy)  +                       " \
         "  texRECT(vec3,coords.xy)  +                       " \
         "  texRECT(vec4,coords.xy)  +                       " \
         "  texRECT(vec5,coords.xy)  +                       " \
         "  texRECT(vec6,coords.xy)  +                       " \
         "  texRECT(vec7,coords.xy)  +                       " \
         "  texRECT(vec8,coords.xy)  +                       " \
         "  texRECT(vec9,coords.xy); }                       ";

 #ifdef DEBUGDEBUG
 //
 // Cg error callback
 //
 void cgErrorCallback(void) {
     CGerror lastError = cgGetError();
     if(lastError)
         printf(cgGetErrorString(lastError));
     exit(-1);
 }

 //
 // GL error checking
 //
 void checkErrors(const char *label) {
     GLenum errCode;
     const GLubyte *errStr;
     if ((errCode = glGetError()) != GL_NO_ERROR) {
         errStr = gluErrorString(errCode);
     printf("%s: OpenGL ERROR ",label);
     printf((char*)errStr);
         printf("\n");
     exit(-2);
     }
 }
 #endif


 //
 // Conversion functions from double to float, the versions
 // using the void pointers are used with PBOs.
 // For details, see below where PBOs are actually used.
 // Note that I rely on the compiler to memalign these properly.
 // Whoever knows a way to do this operation for variable N
 // (not compile-time constant) in the most efficient way,
 // please drop me a note.
 //
 void copy(double* in, float* out) {
     for (int i=0; i<N; i++)
     out[i] = (float)in[i];
 }
 void copy(double* in, void* out) {
     float* bla = (float*)out;
     for (int i=0; i<N; i++)
     bla[i] = (float)in[i];
 }
 void copy(void* in, double* out) {
     float* bla = (float*)in;
     for (int i=0; i<N; i++)
     out[i] = bla[i];
 }


 //
 // this does all the relevant things
 //
 int main(int argc, char **argv) {
     assert (numArraysPerChunk == 9); // because kernel is hard coded
     //
     // set up glut to get valid GL context and
     // get extension entry points
     //
     glutInit (&argc, argv);
     glutCreateWindow("STREAMING TUTORIAL");
     glewInit();
     //
     // viewport transform for 1:1 'pixel=texel=data' mapping
     //
     glMatrixMode(GL_PROJECTION);
     glLoadIdentity();
     gluOrtho2D(0.0,texSize,0.0,texSize);
     glMatrixMode(GL_MODELVIEW);
     glLoadIdentity();
     glViewport(0,0,texSize,texSize);
     //
     // create FBO and bind it
     //
     GLuint fb;
     glGenFramebuffersEXT(1,&fb);
     glBindFramebufferEXT(GL_FRAMEBUFFER_EXT,fb);
     //
     // create a whole bunch of double precision data
     //
     printf("---------------------------------------------\n");
     printf("Creating input data, if the following amount \n");
     printf("is greater than the physically available RAM,\n");
     printf("press CTRL-C immediately to avoid HD paging. \n");
     // double precision arrays
     double memrequirements = (double)(numChunks*numArraysPerChunk + numChunks)*N*sizeof(double);
     // single precision textures
     memrequirements += (numArraysPerChunk + 2) * N * sizeof(float);
     // convert to megabytes
     memrequirements /= (1024.0 * 1024.0);
     printf("Allocating %.2f MB.\n",memrequirements);
 #ifdef USE_PBO
     printf("PBO transfers are enabled.\n");
 #else
     printf("PBO transfers are disabled.\n");
 #endif
     // array to store intial value and final result, initially zeroed
     double* data = new double[N];
     for (int i=0; i<N; i++)
     data[i] = 0.0;
     // input arrays: arrays in chunk one contain values from 1..9,
     // in chunk two from 10..19, and so on
     double*** chunks = new double**[numChunks];
     for (int chunk=0; chunk<numChunks; chunk++) {
     chunks[chunk] = new double*[numArraysPerChunk];
     for (int array=0; array<numArraysPerChunk; array++) {
         chunks[chunk][array] = new double[N];
         for (int i=0; i<N; i++)
         chunks[chunk][array][i] = chunk*10.0 + array + 1;
 //        printf("%d: %.2f\n",array,chunks[chunk][array][88]);
     }
     }
     //
     // local storage for conversion
     //
     float* tmpfloat = new float[N];
     //
     // create necessary textures:
     // we need two textures to pingpong the iteration,
     // and one set of input textures (numArraysPerChunk).
     // Pingpong textures are zeroed here already, input textures
     // are just allocated.
     // We could allocate textures for all input arrays (numChunks times
     // numArraysPerChunk), but memory is a precious resource. Allocating
     // them increase memory requirements by factor of 1.5 (in addition to
     // the double precision arrays), since glTexImage() implies a
     // malloc() in driver controlled memory.
     //
     GLuint pingtex[2];
     glGenTextures(2,pingtex);
     // set texture parameters for pingpong textures and zero them
     copy(data,tmpfloat);
     for (int i=0; i<2; i++) {
     glBindTexture(GL_TEXTURE_RECTANGLE_ARB,pingtex[i]);
     glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
     glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
     glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_WRAP_S, GL_CLAMP);
     glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_WRAP_T, GL_CLAMP);
     glTexImage2D(GL_TEXTURE_RECTANGLE_ARB,0,GL_FLOAT_R32_NV,texSize,texSize,0,GL_LUMINANCE,GL_FLOAT,tmpfloat);
     }
     // create numArraysPerChunk input textures, set parameters and
     // allocate space. do not populate them!
     GLuint inputTextures[numArraysPerChunk];
     glGenTextures (numArraysPerChunk, inputTextures);
     for (int array=0; array<numArraysPerChunk; array++) {
     glBindTexture(GL_TEXTURE_RECTANGLE_ARB,inputTextures[array]);
     glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
     glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
     glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_WRAP_S, GL_CLAMP);
     glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_WRAP_T, GL_CLAMP);
     glTexImage2D(GL_TEXTURE_RECTANGLE_ARB,0,GL_FLOAT_R32_NV,texSize,texSize,0,GL_LUMINANCE,GL_FLOAT,NULL);
     }
 #ifdef DEBUGDEBUG
     checkErrors("create textures");
 #endif
 #ifdef USE_PBO
     //
     // create buffer objects (nine for download, one for readback)
     //
     GLuint ioBuf[10];
     glGenBuffers(10, ioBuf);
 #endif
     //
     // set up Cg runtime
     //
     CGcontext cgContext = cgCreateContext();
 #ifdef DEBUGDEBUG
     cgSetErrorCallback(cgErrorCallback);
 #endif
     CGprofile fragmentProfile = cgGLGetLatestProfile(CG_GL_FRAGMENT);
     cgGLSetOptimalOptions(fragmentProfile);
     //
     // create kernel program and load it
     //
     CGprogram kernel = cgCreateProgram (cgContext,CG_SOURCE,kernelSource,fragmentProfile,NULL,NULL);
     cgGLLoadProgram (kernel);
     CGparameter accumParam = cgGetNamedParameter (kernel,"accum");
     CGparameter *params = new CGparameter[numArraysPerChunk];
     params[0] = cgGetNamedParameter (kernel,"vec1");
     params[1] = cgGetNamedParameter (kernel,"vec2");
     params[2] = cgGetNamedParameter (kernel,"vec3");
     params[3] = cgGetNamedParameter (kernel,"vec4");
     params[4] = cgGetNamedParameter (kernel,"vec5");
     params[5] = cgGetNamedParameter (kernel,"vec6");
     params[6] = cgGetNamedParameter (kernel,"vec7");
     params[7] = cgGetNamedParameter (kernel,"vec8");
     params[8] = cgGetNamedParameter (kernel,"vec9");
     cgGLEnableProfile(fragmentProfile);
     cgGLBindProgram(kernel);
     //
     // attach ping pong textures to FBO,
     // init pingpong management variables
     //
     GLuint readTex = 0;
     GLuint writeTex = 1;
     GLenum attachmentpoints[] = { GL_COLOR_ATTACHMENT0_EXT, GL_COLOR_ATTACHMENT1_EXT };
     glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT, attachmentpoints[readTex], GL_TEXTURE_RECTANGLE_ARB,pingtex[readTex],0);
     glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT, attachmentpoints[writeTex], GL_TEXTURE_RECTANGLE_ARB,pingtex[writeTex],0);
 #ifdef DEBUGDEBUG
     checkErrors("attaching");
 #endif
     //
     // perform computation by looping over all chunks of input data,
     // streaming them in as we proceed.
     //
     printf("Starting calculation\n");
     glFinish();
     double start_compute = clock();
     for (int iter=0; iter<numIterations; iter++) {
     for (int chunk=0; chunk<numChunks; chunk++) {
         //
         // populate input textures for current chunk and transfer
         // data to driver-controlled memory.
         //
         for (int array=0; array<numArraysPerChunk; array++) {
         glBindTexture(GL_TEXTURE_RECTANGLE_ARB, inputTextures[array]);
 #ifdef USE_PBO
         //
         // bind buffer object
         //
         glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, ioBuf[array]);
         //
         // invalidate this buffer object by passing NULL as data
         //
         // Note: according to the spec at
         // http://www.opengl.org/registry/specs/ARB/vertex_buffer_object.txt
         // GL_STREAM_DRAW hints at the driver that the data store contents
         // will be specified once by the application, and used at most a
         // few times as the source of a GL (drawing) command.
         glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB,texSize*texSize*sizeof(float), NULL, GL_STREAM_DRAW);
         //
         // Acquire a pointer to the first data item in this buffer object
         // by mapping it to the so-called unpack buffer target. The unpack
         // buffer refers to a location in memory that gets "unpacked"
         // from CPU (main) memory into GPU memory, hence the somewhat
         // confusing language.
         // Important: This is a pointer to a chunk of memory in what I
         // like to call "driver-controlled memory". Depending on the
         // driver, this might be PCIe/AGP memory, or ideally onboard
         // memory.
         // GL_WRITE_ONLY tells the GL that while we have control of the
         // memory, we will access it write-only. This allows for internal
         // optimisations and increases our chances to get good performance.
         // If we mapped this buffer read/write, it would almost always be
         // located in system memory, from which reading is much faster.
         //
         void* ioMem = glMapBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, GL_WRITE_ONLY);
         assert(ioMem); // we are in trouble
         //
         // "memcpy" the double precision array into the driver memory,
         // doing the explicit conversion to single precision.
         //
         copy(chunks[chunk][array], ioMem);
         //
         // release memory, i.e. give control back to the driver
         //
         glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER_ARB);
         //
         // Since the buffer object is still bound, this call populates our
         // texture by sourcing the buffer object. In effect, this means
         // that ideally no actual memcpy takes place (if the driver
         // decided to map the buffer in onboard memory previously), or at
         // most one memcpy inside driver-controlled memory which is much
         // faster since the previous copy of our data into driver-
         // controlled memory already resulted in laying out the data for
         // optimal performance.
         // In other words: This call is at most a real DMA transfer,
         // without any (expensive) interference by the CPU.
         //
         glTexSubImage2D(GL_TEXTURE_RECTANGLE_ARB, 0, 0, 0, texSize, texSize, GL_LUMINANCE, GL_FLOAT, BUFFER_OFFSET(0));
         //
         // Unbind buffer object by binding it to zero.
         // This call is crucial, as doing the following computations while
         // the buffer object is still bound will result in all sorts of
         // weird side effects, eventually even delivering wrong results.
         // Refer to the specification to learn more about which
         // GL calls act differently while a buffer is bound.
         glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
 #else
         //
         // Do a conventional texture population.
         // Note that this version implies a series of non-DMA memory
         // operations:
         // - conversion from double to single precision is done
         //   explicitely by the CPU
         // - glTexSubImage() implies a memcpy into driver-controlled
         //   memory.
         // - glTexSubImage() implies rearranging the data as the driver
         //   deems necessary.
         //
         // The version using PBOs is hence faster since the CPU is only
         // involved once in the entire process, converting and
         // rearranging in one go.
         // We do definitely save on one memcpy, and eventually a second one
         // if the driver does not decide to DMA the data anyway after we
         // release the pointer / unmap the buffer.
         copy (chunks[chunk][array],tmpfloat);
         glTexSubImage2D(GL_TEXTURE_RECTANGLE_ARB, 0, 0, 0, texSize, texSize, GL_LUMINANCE, GL_FLOAT, tmpfloat);
 #endif
         //
         // assign texture as input to shader
         //
         cgGLSetTextureParameter(params[array], inputTextures[array]);
         cgGLEnableTextureParameter(params[array]);
         }
         for (int iter=0; iter<numKernelSteps; iter++) {
         //
         // assign accum texture (input pingpong) to kernel
         //
         cgGLSetTextureParameter(accumParam, pingtex[readTex]);
         cgGLEnableTextureParameter(accumParam);
         //
         // redirect output to write-only pingpong texture
         //
         glDrawBuffer(attachmentpoints[writeTex]);
         //
         // render viewport-sized quad to perform actual computation
         //
         glBegin(GL_QUADS);
         glTexCoord2f(0.0, 0.0); glVertex2f(0.0, 0.0);
         glTexCoord2f(texSize, 0.0); glVertex2f(texSize, 0.0);
         glTexCoord2f(texSize, texSize); glVertex2f(texSize,texSize);
         glTexCoord2f(0.0, texSize); glVertex2f(0.0, texSize);
         glEnd();
         //
         // swap pingpong input and output textures
         readTex = 1-readTex;
         writeTex = 1-writeTex;
         }
     }
     }
 #ifdef DEBUGDEBUG
     checkErrors("compute");
 #endif
     glFinish();
     double end_compute = clock();
     //
     // Read back results (into double precision arrays).
     //
     printf("Reading back results\n");
     glFinish();
     double start_readback = clock();
 #ifdef USE_PBO
     //
     // Readback using PBOs is performed in a similar way as download is.
     // First, we set one texture attached to our PBO as source for the
     // readback and bind the buffer to the pixel-pack target.
     // Note: PBOs 0--8 were used during download, hence #9
     //
     glReadBuffer(attachmentpoints[readTex]);
     glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, ioBuf[9]);
     //
     // invalidate this buffer by passing NULL
     //
     // Note: according to the spec at
     // http://www.opengl.org/registry/specs/ARB/vertex_buffer_object.txt
     // GL_STREAM_READ hints at the driver that the data store contents
     // will be specified once by the application, and used at most a few
     // times by the application, which is exactly what we will do.
     //
     glBufferData(GL_PIXEL_PACK_BUFFER_ARB,texSize*texSize*sizeof(float), NULL, GL_STREAM_READ);
     //
     // In contrast to conventional glReadPixels(), this call returns
     // immediately and just triggers a DMA transfer into the buffer object we
     // allocated preciously.
     //
     glReadPixels (0, 0, texSize, texSize, GL_LUMINANCE, GL_FLOAT, BUFFER_OFFSET(0));
     //
     // Readbacks are only asynchroneous on the CPU side (since the above call
     // returns immediately). To take advantage of this, a real application
     // would have to perform a lot of independent work on some other data
     // while the data is DMA'ed in the background.
     //
     // In our case, there is no work to be done, so we acquire a pointer to
     // the data by mapping the buffer to the pixel pack target. We again
     // indicate what we want to do with the pointer (accessing the data
     // read-only). Note that this call will block until the data is available.
     //
     void* mem = glMapBuffer(GL_PIXEL_PACK_BUFFER_ARB, GL_READ_ONLY);
     assert(mem);
     //
     // convert data back to double precision since the underlying "application"
     // will continue using the data in double precision.
     //
     copy(mem,data);
     //
     // Release pointer and buffer object back to driver control, and unbind the
     // buffer object
     //
     glUnmapBuffer(GL_PIXEL_PACK_BUFFER_ARB);
     glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, 0);
 #else
     //
     // The conventional glReadPixels() will block until all data has been
     // copied into the array we pass in as last argument.
     // This includes the transfer and the eventual rearrangement back to a
     // CPU-friendly (or better: non-GPU-optimised) layout in memory.
     //
     glReadBuffer(attachmentpoints[readTex]);
     glReadPixels(0, 0, texSize, texSize,GL_LUMINANCE,GL_FLOAT,tmpfloat);
     copy(tmpfloat, data);
 #endif
     glFinish();
     double end_readback = clock();
 #ifdef DEBUGDEBUG
     checkErrors("readback");
 #endif
     //
     // do result comparison, exemplarily for array entry 88
     // since I am too lazy to add a random number generator.
     //
     double res = 0;
     for (int iter=0; iter<numIterations; iter++) {
     for (int i=0; i<numChunks; i++)
         for (int j=0; j<numArraysPerChunk; j++)
         for (int k=0; k<numKernelSteps; k++)
             res += i*10.0 + j + 1;
     }
     printf("%d: should be: %.2f, is: %.2f\n",88, res, data[88]);
     //
     // print out timings
     //
     double time_compute  = (end_compute - start_compute)/CLOCKS_PER_SEC;
     double time_readback = (end_readback - start_readback)/CLOCKS_PER_SEC;
     double data_download = (double)N*numChunks*numArraysPerChunk*numIterations*sizeof(double) / (1024.0*1024.0);
     double data_readback = (double)N*sizeof(double) / (1024.0*1024.0);
     printf("DOWNLOAD\n");
     printf("Total amount of doubleprec data : %.2f MB\n", data_download);
     printf("Compute time                    : %.2f sec\n", time_compute);
     printf("Throughput:                     : %.2f MB/s\n", data_download / time_compute);
     printf("READBACK\n");
     printf("Total amount of doubleprec data : %.2f MB\n", data_readback);
     printf("Compute time                    : %.2f sec\n",time_readback);
     printf("Throughput:                     : %.2f MB/s\n", data_readback / time_readback);
     //
     // clean up
     //
     for (int chunk=0; chunk<numChunks; chunk++) {
         for (int array=0; array<numArraysPerChunk; array++)
             delete [] chunks[chunk][array];
         delete [] chunks[chunk];
     }
     delete [] tmpfloat;
     delete [] data;
     delete [] chunks;
     glDeleteFramebuffersEXT (1,&fb);
 #ifdef USE_PBO
     glDeleteBuffers(2, ioBuf);
 #endif
     glDeleteTextures(numArraysPerChunk,inputTextures);
     glDeleteTextures(2,pingtex);

     return 0;
 }
	/*
	*
	* Example code to demonstrate the benefits of PBOs for fast
	* transfers to the GPU.
	*
	* (c) 2006 Dominik Göddeke, University of Dortmund
	* dominik.goeddeke@math.uni-dortmund.de
	*
	* For details, please refer to the tutorial available at
	* www.mathematik.uni-dortmund.de/~goeddeke/gpgpu/tutorial3.html
	*
	*/

	#include <stdio.h>
	#include <stdlib.h>
	#include <GL/glew.h>
	#include <GL/glut.h>
	#include <Cg/cg.h>
	#include <Cg/cgGL.h>
	#include <time.h>
	#include <assert.h>




	/*********************************************************
	* macros, problem size definition etc. *
	*********************************************************/



	// PBO macro (see spec for details)
	#define BUFFER_OFFSET(i) ((char *)NULL + (i))


	// array size N is texsize*texsize
	static const int texSize = 1025;
	static const int N = texSize*texSize;

	// hard-coded for shader + Cg, do not change this
	static const int numArraysPerChunk = 9;
	// input data size, default setting results in approx 1.6 GB of phys mem.
	static const int numChunks = 20;
	// number of iterations per chunk to emulate kernel workload
	static const int numKernelSteps = 1;
	// number of iterations the test is repeated, to reduce timing noise.
	static const int numIterations = 10;


	// uncomment the following to check for GL and Cg errors
	// Note that enabling this creates an (artificial) sync point (by glGetError())
	//#define DEBUGDEBUG


	//
	// Enable this #define to enable PBOs for glTexSubImage() and glReadPixels().
	// Otherwise, PBOs are not used.
	//
	#define USE_PBO


	// my own memcheck macro, only works on linux. Use with care, this will
	// slow down the application significantly.
	#define MEMCHECK() system("top -b -n1 \| grep test \| awk '{print $5\" \"$6}'")


	/****************************************************************************/

	//
	// forward declaration of some functions
	//
	void cgErrorCallback(void);
	void checkErrors(const char *label);
	void copy(double* in, float* out);
	void copy(double* in, void* out);
	void copy(void* in, double* out);


	/*****************************************************/

	//
	// fragment program: out = accum + vec
	//
	const char* kernelSource = " float main( "\
	" uniform samplerRECT accum, " \
	" uniform samplerRECT vec1, " \
	" uniform samplerRECT vec2, " \
	" uniform samplerRECT vec3, " \
	" uniform samplerRECT vec4, " \
	" uniform samplerRECT vec5, " \
	" uniform samplerRECT vec6, " \
	" uniform samplerRECT vec7, " \
	" uniform samplerRECT vec8, " \
	" uniform samplerRECT vec9, " \
	" in float4 coords:TEXCOORD0) : COLOR { " \
	" return texRECT(accum,coords.xy) + " \
	" texRECT(vec1,coords.xy) + " \
	" texRECT(vec2,coords.xy) + " \
	" texRECT(vec3,coords.xy) + " \
	" texRECT(vec4,coords.xy) + " \
	" texRECT(vec5,coords.xy) + " \
	" texRECT(vec6,coords.xy) + " \
	" texRECT(vec7,coords.xy) + " \
	" texRECT(vec8,coords.xy) + " \
	" texRECT(vec9,coords.xy); } ";

	#ifdef DEBUGDEBUG
	//
	// Cg error callback
	//
	void cgErrorCallback(void) {
	CGerror lastError = cgGetError();
	if(lastError)
	printf(cgGetErrorString(lastError));
	exit(-1);
	}

	//
	// GL error checking
	//
	void checkErrors(const char *label) {
	GLenum errCode;
	const GLubyte *errStr;
	if ((errCode = glGetError()) != GL_NO_ERROR) {
	errStr = gluErrorString(errCode);
	printf("%s: OpenGL ERROR ",label);
	printf((char*)errStr);
	printf("\n");
	exit(-2);
	}
	}
	#endif


	//
	// Conversion functions from double to float, the versions
	// using the void pointers are used with PBOs.
	// For details, see below where PBOs are actually used.
	// Note that I rely on the compiler to memalign these properly.
	// Whoever knows a way to do this operation for variable N
	// (not compile-time constant) in the most efficient way,
	// please drop me a note.
	//
	void copy(double* in, float* out) {
	for (int i=0; i<N; i++)
	out[i] = (float)in[i];
	}
	void copy(double* in, void* out) {
	float* bla = (float*)out;
	for (int i=0; i<N; i++)
	bla[i] = (float)in[i];
	}
	void copy(void* in, double* out) {
	float* bla = (float*)in;
	for (int i=0; i<N; i++)
	out[i] = bla[i];
	}


	//
	// this does all the relevant things
	//
	int main(int argc, char **argv) {
	assert (numArraysPerChunk == 9); // because kernel is hard coded
	//
	// set up glut to get valid GL context and
	// get extension entry points
	//
	glutInit (&argc, argv);
	glutCreateWindow("STREAMING TUTORIAL");
	glewInit();
	//
	// viewport transform for 1:1 'pixel=texel=data' mapping
	//
	glMatrixMode(GL_PROJECTION);
	glLoadIdentity();
	gluOrtho2D(0.0,texSize,0.0,texSize);
	glMatrixMode(GL_MODELVIEW);
	glLoadIdentity();
	glViewport(0,0,texSize,texSize);
	//
	// create FBO and bind it
	//
	GLuint fb;
	glGenFramebuffersEXT(1,&fb);
	glBindFramebufferEXT(GL_FRAMEBUFFER_EXT,fb);
	//
	// create a whole bunch of double precision data
	//
	printf("---------------------------------------------\n");
	printf("Creating input data, if the following amount \n");
	printf("is greater than the physically available RAM,\n");
	printf("press CTRL-C immediately to avoid HD paging. \n");
	// double precision arrays
	double memrequirements = (double)(numChunksnumArraysPerChunk + numChunks)N*sizeof(double);
	// single precision textures
	memrequirements += (numArraysPerChunk + 2) * N * sizeof(float);
	// convert to megabytes
	memrequirements /= (1024.0 * 1024.0);
	printf("Allocating %.2f MB.\n",memrequirements);
	#ifdef USE_PBO
	printf("PBO transfers are enabled.\n");
	#else
	printf("PBO transfers are disabled.\n");
	#endif
	// array to store intial value and final result, initially zeroed
	double* data = new double[N];
	for (int i=0; i<N; i++)
	data[i] = 0.0;
	// input arrays: arrays in chunk one contain values from 1..9,
	// in chunk two from 10..19, and so on
	double* chunks = new double[numChunks];
	for (int chunk=0; chunk<numChunks; chunk++) {
	chunks[chunk] = new double*[numArraysPerChunk];
	for (int array=0; array<numArraysPerChunk; array++) {
	chunks[chunk][array] = new double[N];
	for (int i=0; i<N; i++)
	chunks[chunk][array][i] = chunk*10.0 + array + 1;
	// printf("%d: %.2f\n",array,chunks[chunk][array][88]);
	}
	}
	//
	// local storage for conversion
	//
	float* tmpfloat = new float[N];
	//
	// create necessary textures:
	// we need two textures to pingpong the iteration,
	// and one set of input textures (numArraysPerChunk).
	// Pingpong textures are zeroed here already, input textures
	// are just allocated.
	// We could allocate textures for all input arrays (numChunks times
	// numArraysPerChunk), but memory is a precious resource. Allocating
	// them increase memory requirements by factor of 1.5 (in addition to
	// the double precision arrays), since glTexImage() implies a
	// malloc() in driver controlled memory.
	//
	GLuint pingtex[2];
	glGenTextures(2,pingtex);
	// set texture parameters for pingpong textures and zero them
	copy(data,tmpfloat);
	for (int i=0; i<2; i++) {
	glBindTexture(GL_TEXTURE_RECTANGLE_ARB,pingtex[i]);
	glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
	glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
	glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_WRAP_S, GL_CLAMP);
	glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_WRAP_T, GL_CLAMP);
	glTexImage2D(GL_TEXTURE_RECTANGLE_ARB,0,GL_FLOAT_R32_NV,texSize,texSize,0,GL_LUMINANCE,GL_FLOAT,tmpfloat);
	}
	// create numArraysPerChunk input textures, set parameters and
	// allocate space. do not populate them!
	GLuint inputTextures[numArraysPerChunk];
	glGenTextures (numArraysPerChunk, inputTextures);
	for (int array=0; array<numArraysPerChunk; array++) {
	glBindTexture(GL_TEXTURE_RECTANGLE_ARB,inputTextures[array]);
	glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
	glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
	glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_WRAP_S, GL_CLAMP);
	glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_WRAP_T, GL_CLAMP);
	glTexImage2D(GL_TEXTURE_RECTANGLE_ARB,0,GL_FLOAT_R32_NV,texSize,texSize,0,GL_LUMINANCE,GL_FLOAT,NULL);
	}
	#ifdef DEBUGDEBUG
	checkErrors("create textures");
	#endif
	#ifdef USE_PBO
	//
	// create buffer objects (nine for download, one for readback)
	//
	GLuint ioBuf[10];
	glGenBuffers(10, ioBuf);
	#endif
	//
	// set up Cg runtime
	//
	CGcontext cgContext = cgCreateContext();
	#ifdef DEBUGDEBUG
	cgSetErrorCallback(cgErrorCallback);
	#endif
	CGprofile fragmentProfile = cgGLGetLatestProfile(CG_GL_FRAGMENT);
	cgGLSetOptimalOptions(fragmentProfile);
	//
	// create kernel program and load it
	//
	CGprogram kernel = cgCreateProgram (cgContext,CG_SOURCE,kernelSource,fragmentProfile,NULL,NULL);
	cgGLLoadProgram (kernel);
	CGparameter accumParam = cgGetNamedParameter (kernel,"accum");
	CGparameter *params = new CGparameter[numArraysPerChunk];
	params[0] = cgGetNamedParameter (kernel,"vec1");
	params[1] = cgGetNamedParameter (kernel,"vec2");
	params[2] = cgGetNamedParameter (kernel,"vec3");
	params[3] = cgGetNamedParameter (kernel,"vec4");
	params[4] = cgGetNamedParameter (kernel,"vec5");
	params[5] = cgGetNamedParameter (kernel,"vec6");
	params[6] = cgGetNamedParameter (kernel,"vec7");
	params[7] = cgGetNamedParameter (kernel,"vec8");
	params[8] = cgGetNamedParameter (kernel,"vec9");
	cgGLEnableProfile(fragmentProfile);
	cgGLBindProgram(kernel);
	//
	// attach ping pong textures to FBO,
	// init pingpong management variables
	//
	GLuint readTex = 0;
	GLuint writeTex = 1;
	GLenum attachmentpoints[] = { GL_COLOR_ATTACHMENT0_EXT, GL_COLOR_ATTACHMENT1_EXT };
	glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT, attachmentpoints[readTex], GL_TEXTURE_RECTANGLE_ARB,pingtex[readTex],0);
	glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT, attachmentpoints[writeTex], GL_TEXTURE_RECTANGLE_ARB,pingtex[writeTex],0);
	#ifdef DEBUGDEBUG
	checkErrors("attaching");
	#endif
	//
	// perform computation by looping over all chunks of input data,
	// streaming them in as we proceed.
	//
	printf("Starting calculation\n");
	glFinish();
	double start_compute = clock();
	for (int iter=0; iter<numIterations; iter++) {
	for (int chunk=0; chunk<numChunks; chunk++) {
	//
	// populate input textures for current chunk and transfer
	// data to driver-controlled memory.
	//
	for (int array=0; array<numArraysPerChunk; array++) {
	glBindTexture(GL_TEXTURE_RECTANGLE_ARB, inputTextures[array]);
	#ifdef USE_PBO
	//
	// bind buffer object
	//
	glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, ioBuf[array]);
	//
	// invalidate this buffer object by passing NULL as data
	//
	// Note: according to the spec at
	// http://www.opengl.org/registry/specs/ARB/vertex_buffer_object.txt
	// GL_STREAM_DRAW hints at the driver that the data store contents
	// will be specified once by the application, and used at most a
	// few times as the source of a GL (drawing) command.
	glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB,texSizetexSizesizeof(float), NULL, GL_STREAM_DRAW);
	//
	// Acquire a pointer to the first data item in this buffer object
	// by mapping it to the so-called unpack buffer target. The unpack
	// buffer refers to a location in memory that gets "unpacked"
	// from CPU (main) memory into GPU memory, hence the somewhat
	// confusing language.
	// Important: This is a pointer to a chunk of memory in what I
	// like to call "driver-controlled memory". Depending on the
	// driver, this might be PCIe/AGP memory, or ideally onboard
	// memory.
	// GL_WRITE_ONLY tells the GL that while we have control of the
	// memory, we will access it write-only. This allows for internal
	// optimisations and increases our chances to get good performance.
	// If we mapped this buffer read/write, it would almost always be
	// located in system memory, from which reading is much faster.
	//
	void* ioMem = glMapBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, GL_WRITE_ONLY);
	assert(ioMem); // we are in trouble
	//
	// "memcpy" the double precision array into the driver memory,
	// doing the explicit conversion to single precision.
	//
	copy(chunks[chunk][array], ioMem);
	//
	// release memory, i.e. give control back to the driver
	//
	glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER_ARB);
	//
	// Since the buffer object is still bound, this call populates our
	// texture by sourcing the buffer object. In effect, this means
	// that ideally no actual memcpy takes place (if the driver
	// decided to map the buffer in onboard memory previously), or at
	// most one memcpy inside driver-controlled memory which is much
	// faster since the previous copy of our data into driver-
	// controlled memory already resulted in laying out the data for
	// optimal performance.
	// In other words: This call is at most a real DMA transfer,
	// without any (expensive) interference by the CPU.
	//
	glTexSubImage2D(GL_TEXTURE_RECTANGLE_ARB, 0, 0, 0, texSize, texSize, GL_LUMINANCE, GL_FLOAT, BUFFER_OFFSET(0));
	//
	// Unbind buffer object by binding it to zero.
	// This call is crucial, as doing the following computations while
	// the buffer object is still bound will result in all sorts of
	// weird side effects, eventually even delivering wrong results.
	// Refer to the specification to learn more about which
	// GL calls act differently while a buffer is bound.
	glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
	#else
	//
	// Do a conventional texture population.
	// Note that this version implies a series of non-DMA memory
	// operations:
	// - conversion from double to single precision is done
	// explicitely by the CPU
	// - glTexSubImage() implies a memcpy into driver-controlled
	// memory.
	// - glTexSubImage() implies rearranging the data as the driver
	// deems necessary.
	//
	// The version using PBOs is hence faster since the CPU is only
	// involved once in the entire process, converting and
	// rearranging in one go.
	// We do definitely save on one memcpy, and eventually a second one
	// if the driver does not decide to DMA the data anyway after we
	// release the pointer / unmap the buffer.
	copy (chunks[chunk][array],tmpfloat);
	glTexSubImage2D(GL_TEXTURE_RECTANGLE_ARB, 0, 0, 0, texSize, texSize, GL_LUMINANCE, GL_FLOAT, tmpfloat);
	#endif
	//
	// assign texture as input to shader
	//
	cgGLSetTextureParameter(params[array], inputTextures[array]);
	cgGLEnableTextureParameter(params[array]);
	}
	for (int iter=0; iter<numKernelSteps; iter++) {
	//
	// assign accum texture (input pingpong) to kernel
	//
	cgGLSetTextureParameter(accumParam, pingtex[readTex]);
	cgGLEnableTextureParameter(accumParam);
	//
	// redirect output to write-only pingpong texture
	//
	glDrawBuffer(attachmentpoints[writeTex]);
	//
	// render viewport-sized quad to perform actual computation
	//
	glBegin(GL_QUADS);
	glTexCoord2f(0.0, 0.0); glVertex2f(0.0, 0.0);
	glTexCoord2f(texSize, 0.0); glVertex2f(texSize, 0.0);
	glTexCoord2f(texSize, texSize); glVertex2f(texSize,texSize);
	glTexCoord2f(0.0, texSize); glVertex2f(0.0, texSize);
	glEnd();
	//
	// swap pingpong input and output textures
	readTex = 1-readTex;
	writeTex = 1-writeTex;
	}
	}
	}
	#ifdef DEBUGDEBUG
	checkErrors("compute");
	#endif
	glFinish();
	double end_compute = clock();
	//
	// Read back results (into double precision arrays).
	//
	printf("Reading back results\n");
	glFinish();
	double start_readback = clock();
	#ifdef USE_PBO
	//
	// Readback using PBOs is performed in a similar way as download is.
	// First, we set one texture attached to our PBO as source for the
	// readback and bind the buffer to the pixel-pack target.
	// Note: PBOs 0--8 were used during download, hence #9
	//
	glReadBuffer(attachmentpoints[readTex]);
	glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, ioBuf[9]);
	//
	// invalidate this buffer by passing NULL
	//
	// Note: according to the spec at
	// http://www.opengl.org/registry/specs/ARB/vertex_buffer_object.txt
	// GL_STREAM_READ hints at the driver that the data store contents
	// will be specified once by the application, and used at most a few
	// times by the application, which is exactly what we will do.
	//
	glBufferData(GL_PIXEL_PACK_BUFFER_ARB,texSizetexSizesizeof(float), NULL, GL_STREAM_READ);
	//
	// In contrast to conventional glReadPixels(), this call returns
	// immediately and just triggers a DMA transfer into the buffer object we
	// allocated preciously.
	//
	glReadPixels (0, 0, texSize, texSize, GL_LUMINANCE, GL_FLOAT, BUFFER_OFFSET(0));
	//
	// Readbacks are only asynchroneous on the CPU side (since the above call
	// returns immediately). To take advantage of this, a real application
	// would have to perform a lot of independent work on some other data
	// while the data is DMA'ed in the background.
	//
	// In our case, there is no work to be done, so we acquire a pointer to
	// the data by mapping the buffer to the pixel pack target. We again
	// indicate what we want to do with the pointer (accessing the data
	// read-only). Note that this call will block until the data is available.
	//
	void* mem = glMapBuffer(GL_PIXEL_PACK_BUFFER_ARB, GL_READ_ONLY);
	assert(mem);
	//
	// convert data back to double precision since the underlying "application"
	// will continue using the data in double precision.
	//
	copy(mem,data);
	//
	// Release pointer and buffer object back to driver control, and unbind the
	// buffer object
	//
	glUnmapBuffer(GL_PIXEL_PACK_BUFFER_ARB);
	glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, 0);
	#else
	//
	// The conventional glReadPixels() will block until all data has been
	// copied into the array we pass in as last argument.
	// This includes the transfer and the eventual rearrangement back to a
	// CPU-friendly (or better: non-GPU-optimised) layout in memory.
	//
	glReadBuffer(attachmentpoints[readTex]);
	glReadPixels(0, 0, texSize, texSize,GL_LUMINANCE,GL_FLOAT,tmpfloat);
	copy(tmpfloat, data);
	#endif
	glFinish();
	double end_readback = clock();
	#ifdef DEBUGDEBUG
	checkErrors("readback");
	#endif
	//
	// do result comparison, exemplarily for array entry 88
	// since I am too lazy to add a random number generator.
	//
	double res = 0;
	for (int iter=0; iter<numIterations; iter++) {
	for (int i=0; i<numChunks; i++)
	for (int j=0; j<numArraysPerChunk; j++)
	for (int k=0; k<numKernelSteps; k++)
	res += i*10.0 + j + 1;
	}
	printf("%d: should be: %.2f, is: %.2f\n",88, res, data[88]);
	//
	// print out timings
	//
	double time_compute = (end_compute - start_compute)/CLOCKS_PER_SEC;
	double time_readback = (end_readback - start_readback)/CLOCKS_PER_SEC;
	double data_download = (double)NnumChunksnumArraysPerChunknumIterationssizeof(double) / (1024.0*1024.0);
	double data_readback = (double)Nsizeof(double) / (1024.01024.0);
	printf("DOWNLOAD\n");
	printf("Total amount of doubleprec data : %.2f MB\n", data_download);
	printf("Compute time : %.2f sec\n", time_compute);
	printf("Throughput: : %.2f MB/s\n", data_download / time_compute);
	printf("READBACK\n");
	printf("Total amount of doubleprec data : %.2f MB\n", data_readback);
	printf("Compute time : %.2f sec\n",time_readback);
	printf("Throughput: : %.2f MB/s\n", data_readback / time_readback);
	//
	// clean up
	//
	for (int chunk=0; chunk<numChunks; chunk++) {
	for (int array=0; array<numArraysPerChunk; array++)
	delete [] chunks[chunk][array];
	delete [] chunks[chunk];
	}
	delete [] tmpfloat;
	delete [] data;
	delete [] chunks;
	glDeleteFramebuffersEXT (1,&fb);
	#ifdef USE_PBO
	glDeleteBuffers(2, ioBuf);
	#endif
	glDeleteTextures(numArraysPerChunk,inputTextures);
	glDeleteTextures(2,pingtex);

	return 0;
	}