blob: bd5df77d3db436d162cda2825b37d41ce8d4a21a [file] [log] [blame]
/*
*
* Example code to demonstrate the benefits of PBOs for fast
* transfers to the GPU.
*
* (c) 2006 Dominik Göddeke, University of Dortmund
* dominik.goeddeke@math.uni-dortmund.de
*
* For details, please refer to the tutorial available at
* www.mathematik.uni-dortmund.de/~goeddeke/gpgpu/tutorial3.html
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <GL/glew.h>
#include <GL/glut.h>
#include <Cg/cg.h>
#include <Cg/cgGL.h>
#include <time.h>
#include <assert.h>
/*********************************************************
* macros, problem size definition etc. *
*********************************************************/
// PBO macro (see spec for details)
#define BUFFER_OFFSET(i) ((char *)NULL + (i))
// array size N is texsize*texsize
static const int texSize = 1025;
static const int N = texSize*texSize;
// hard-coded for shader + Cg, do not change this
static const int numArraysPerChunk = 9;
// input data size, default setting results in approx 1.6 GB of phys mem.
static const int numChunks = 20;
// number of iterations per chunk to emulate kernel workload
static const int numKernelSteps = 1;
// number of iterations the test is repeated, to reduce timing noise.
static const int numIterations = 10;
// uncomment the following to check for GL and Cg errors
// Note that enabling this creates an (artificial) sync point (by glGetError())
//#define DEBUGDEBUG
//
// Enable this #define to enable PBOs for glTexSubImage() and glReadPixels().
// Otherwise, PBOs are not used.
//
#define USE_PBO
// my own memcheck macro, only works on linux. Use with care, this will
// slow down the application significantly.
#define MEMCHECK() system("top -b -n1 | grep test | awk '{print $5\" \"$6}'")
/****************************************************************************/
//
// forward declaration of some functions
//
void cgErrorCallback(void);
void checkErrors(const char *label);
void copy(double* in, float* out);
void copy(double* in, void* out);
void copy(void* in, double* out);
/*****************************************************/
//
// fragment program: out = accum + vec
//
const char* kernelSource = " float main( "\
" uniform samplerRECT accum, " \
" uniform samplerRECT vec1, " \
" uniform samplerRECT vec2, " \
" uniform samplerRECT vec3, " \
" uniform samplerRECT vec4, " \
" uniform samplerRECT vec5, " \
" uniform samplerRECT vec6, " \
" uniform samplerRECT vec7, " \
" uniform samplerRECT vec8, " \
" uniform samplerRECT vec9, " \
" in float4 coords:TEXCOORD0) : COLOR { " \
" return texRECT(accum,coords.xy) + " \
" texRECT(vec1,coords.xy) + " \
" texRECT(vec2,coords.xy) + " \
" texRECT(vec3,coords.xy) + " \
" texRECT(vec4,coords.xy) + " \
" texRECT(vec5,coords.xy) + " \
" texRECT(vec6,coords.xy) + " \
" texRECT(vec7,coords.xy) + " \
" texRECT(vec8,coords.xy) + " \
" texRECT(vec9,coords.xy); } ";
#ifdef DEBUGDEBUG
//
// Cg error callback
//
void cgErrorCallback(void) {
CGerror lastError = cgGetError();
if(lastError)
printf(cgGetErrorString(lastError));
exit(-1);
}
//
// GL error checking
//
void checkErrors(const char *label) {
GLenum errCode;
const GLubyte *errStr;
if ((errCode = glGetError()) != GL_NO_ERROR) {
errStr = gluErrorString(errCode);
printf("%s: OpenGL ERROR ",label);
printf((char*)errStr);
printf("\n");
exit(-2);
}
}
#endif
//
// Conversion functions from double to float, the versions
// using the void pointers are used with PBOs.
// For details, see below where PBOs are actually used.
// Note that I rely on the compiler to memalign these properly.
// Whoever knows a way to do this operation for variable N
// (not compile-time constant) in the most efficient way,
// please drop me a note.
//
void copy(double* in, float* out) {
for (int i=0; i<N; i++)
out[i] = (float)in[i];
}
void copy(double* in, void* out) {
float* bla = (float*)out;
for (int i=0; i<N; i++)
bla[i] = (float)in[i];
}
void copy(void* in, double* out) {
float* bla = (float*)in;
for (int i=0; i<N; i++)
out[i] = bla[i];
}
//
// this does all the relevant things
//
int main(int argc, char **argv) {
assert (numArraysPerChunk == 9); // because kernel is hard coded
//
// set up glut to get valid GL context and
// get extension entry points
//
glutInit (&argc, argv);
glutCreateWindow("STREAMING TUTORIAL");
glewInit();
//
// viewport transform for 1:1 'pixel=texel=data' mapping
//
glMatrixMode(GL_PROJECTION);
glLoadIdentity();
gluOrtho2D(0.0,texSize,0.0,texSize);
glMatrixMode(GL_MODELVIEW);
glLoadIdentity();
glViewport(0,0,texSize,texSize);
//
// create FBO and bind it
//
GLuint fb;
glGenFramebuffersEXT(1,&fb);
glBindFramebufferEXT(GL_FRAMEBUFFER_EXT,fb);
//
// create a whole bunch of double precision data
//
printf("---------------------------------------------\n");
printf("Creating input data, if the following amount \n");
printf("is greater than the physically available RAM,\n");
printf("press CTRL-C immediately to avoid HD paging. \n");
// double precision arrays
double memrequirements = (double)(numChunks*numArraysPerChunk + numChunks)*N*sizeof(double);
// single precision textures
memrequirements += (numArraysPerChunk + 2) * N * sizeof(float);
// convert to megabytes
memrequirements /= (1024.0 * 1024.0);
printf("Allocating %.2f MB.\n",memrequirements);
#ifdef USE_PBO
printf("PBO transfers are enabled.\n");
#else
printf("PBO transfers are disabled.\n");
#endif
// array to store intial value and final result, initially zeroed
double* data = new double[N];
for (int i=0; i<N; i++)
data[i] = 0.0;
// input arrays: arrays in chunk one contain values from 1..9,
// in chunk two from 10..19, and so on
double*** chunks = new double**[numChunks];
for (int chunk=0; chunk<numChunks; chunk++) {
chunks[chunk] = new double*[numArraysPerChunk];
for (int array=0; array<numArraysPerChunk; array++) {
chunks[chunk][array] = new double[N];
for (int i=0; i<N; i++)
chunks[chunk][array][i] = chunk*10.0 + array + 1;
// printf("%d: %.2f\n",array,chunks[chunk][array][88]);
}
}
//
// local storage for conversion
//
float* tmpfloat = new float[N];
//
// create necessary textures:
// we need two textures to pingpong the iteration,
// and one set of input textures (numArraysPerChunk).
// Pingpong textures are zeroed here already, input textures
// are just allocated.
// We could allocate textures for all input arrays (numChunks times
// numArraysPerChunk), but memory is a precious resource. Allocating
// them increase memory requirements by factor of 1.5 (in addition to
// the double precision arrays), since glTexImage() implies a
// malloc() in driver controlled memory.
//
GLuint pingtex[2];
glGenTextures(2,pingtex);
// set texture parameters for pingpong textures and zero them
copy(data,tmpfloat);
for (int i=0; i<2; i++) {
glBindTexture(GL_TEXTURE_RECTANGLE_ARB,pingtex[i]);
glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_WRAP_S, GL_CLAMP);
glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_WRAP_T, GL_CLAMP);
glTexImage2D(GL_TEXTURE_RECTANGLE_ARB,0,GL_FLOAT_R32_NV,texSize,texSize,0,GL_LUMINANCE,GL_FLOAT,tmpfloat);
}
// create numArraysPerChunk input textures, set parameters and
// allocate space. do not populate them!
GLuint inputTextures[numArraysPerChunk];
glGenTextures (numArraysPerChunk, inputTextures);
for (int array=0; array<numArraysPerChunk; array++) {
glBindTexture(GL_TEXTURE_RECTANGLE_ARB,inputTextures[array]);
glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_WRAP_S, GL_CLAMP);
glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_WRAP_T, GL_CLAMP);
glTexImage2D(GL_TEXTURE_RECTANGLE_ARB,0,GL_FLOAT_R32_NV,texSize,texSize,0,GL_LUMINANCE,GL_FLOAT,NULL);
}
#ifdef DEBUGDEBUG
checkErrors("create textures");
#endif
#ifdef USE_PBO
//
// create buffer objects (nine for download, one for readback)
//
GLuint ioBuf[10];
glGenBuffers(10, ioBuf);
#endif
//
// set up Cg runtime
//
CGcontext cgContext = cgCreateContext();
#ifdef DEBUGDEBUG
cgSetErrorCallback(cgErrorCallback);
#endif
CGprofile fragmentProfile = cgGLGetLatestProfile(CG_GL_FRAGMENT);
cgGLSetOptimalOptions(fragmentProfile);
//
// create kernel program and load it
//
CGprogram kernel = cgCreateProgram (cgContext,CG_SOURCE,kernelSource,fragmentProfile,NULL,NULL);
cgGLLoadProgram (kernel);
CGparameter accumParam = cgGetNamedParameter (kernel,"accum");
CGparameter *params = new CGparameter[numArraysPerChunk];
params[0] = cgGetNamedParameter (kernel,"vec1");
params[1] = cgGetNamedParameter (kernel,"vec2");
params[2] = cgGetNamedParameter (kernel,"vec3");
params[3] = cgGetNamedParameter (kernel,"vec4");
params[4] = cgGetNamedParameter (kernel,"vec5");
params[5] = cgGetNamedParameter (kernel,"vec6");
params[6] = cgGetNamedParameter (kernel,"vec7");
params[7] = cgGetNamedParameter (kernel,"vec8");
params[8] = cgGetNamedParameter (kernel,"vec9");
cgGLEnableProfile(fragmentProfile);
cgGLBindProgram(kernel);
//
// attach ping pong textures to FBO,
// init pingpong management variables
//
GLuint readTex = 0;
GLuint writeTex = 1;
GLenum attachmentpoints[] = { GL_COLOR_ATTACHMENT0_EXT, GL_COLOR_ATTACHMENT1_EXT };
glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT, attachmentpoints[readTex], GL_TEXTURE_RECTANGLE_ARB,pingtex[readTex],0);
glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT, attachmentpoints[writeTex], GL_TEXTURE_RECTANGLE_ARB,pingtex[writeTex],0);
#ifdef DEBUGDEBUG
checkErrors("attaching");
#endif
//
// perform computation by looping over all chunks of input data,
// streaming them in as we proceed.
//
printf("Starting calculation\n");
glFinish();
double start_compute = clock();
for (int iter=0; iter<numIterations; iter++) {
for (int chunk=0; chunk<numChunks; chunk++) {
//
// populate input textures for current chunk and transfer
// data to driver-controlled memory.
//
for (int array=0; array<numArraysPerChunk; array++) {
glBindTexture(GL_TEXTURE_RECTANGLE_ARB, inputTextures[array]);
#ifdef USE_PBO
//
// bind buffer object
//
glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, ioBuf[array]);
//
// invalidate this buffer object by passing NULL as data
//
// Note: according to the spec at
// http://www.opengl.org/registry/specs/ARB/vertex_buffer_object.txt
// GL_STREAM_DRAW hints at the driver that the data store contents
// will be specified once by the application, and used at most a
// few times as the source of a GL (drawing) command.
glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB,texSize*texSize*sizeof(float), NULL, GL_STREAM_DRAW);
//
// Acquire a pointer to the first data item in this buffer object
// by mapping it to the so-called unpack buffer target. The unpack
// buffer refers to a location in memory that gets "unpacked"
// from CPU (main) memory into GPU memory, hence the somewhat
// confusing language.
// Important: This is a pointer to a chunk of memory in what I
// like to call "driver-controlled memory". Depending on the
// driver, this might be PCIe/AGP memory, or ideally onboard
// memory.
// GL_WRITE_ONLY tells the GL that while we have control of the
// memory, we will access it write-only. This allows for internal
// optimisations and increases our chances to get good performance.
// If we mapped this buffer read/write, it would almost always be
// located in system memory, from which reading is much faster.
//
void* ioMem = glMapBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, GL_WRITE_ONLY);
assert(ioMem); // we are in trouble
//
// "memcpy" the double precision array into the driver memory,
// doing the explicit conversion to single precision.
//
copy(chunks[chunk][array], ioMem);
//
// release memory, i.e. give control back to the driver
//
glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER_ARB);
//
// Since the buffer object is still bound, this call populates our
// texture by sourcing the buffer object. In effect, this means
// that ideally no actual memcpy takes place (if the driver
// decided to map the buffer in onboard memory previously), or at
// most one memcpy inside driver-controlled memory which is much
// faster since the previous copy of our data into driver-
// controlled memory already resulted in laying out the data for
// optimal performance.
// In other words: This call is at most a real DMA transfer,
// without any (expensive) interference by the CPU.
//
glTexSubImage2D(GL_TEXTURE_RECTANGLE_ARB, 0, 0, 0, texSize, texSize, GL_LUMINANCE, GL_FLOAT, BUFFER_OFFSET(0));
//
// Unbind buffer object by binding it to zero.
// This call is crucial, as doing the following computations while
// the buffer object is still bound will result in all sorts of
// weird side effects, eventually even delivering wrong results.
// Refer to the specification to learn more about which
// GL calls act differently while a buffer is bound.
glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
#else
//
// Do a conventional texture population.
// Note that this version implies a series of non-DMA memory
// operations:
// - conversion from double to single precision is done
// explicitely by the CPU
// - glTexSubImage() implies a memcpy into driver-controlled
// memory.
// - glTexSubImage() implies rearranging the data as the driver
// deems necessary.
//
// The version using PBOs is hence faster since the CPU is only
// involved once in the entire process, converting and
// rearranging in one go.
// We do definitely save on one memcpy, and eventually a second one
// if the driver does not decide to DMA the data anyway after we
// release the pointer / unmap the buffer.
copy (chunks[chunk][array],tmpfloat);
glTexSubImage2D(GL_TEXTURE_RECTANGLE_ARB, 0, 0, 0, texSize, texSize, GL_LUMINANCE, GL_FLOAT, tmpfloat);
#endif
//
// assign texture as input to shader
//
cgGLSetTextureParameter(params[array], inputTextures[array]);
cgGLEnableTextureParameter(params[array]);
}
for (int iter=0; iter<numKernelSteps; iter++) {
//
// assign accum texture (input pingpong) to kernel
//
cgGLSetTextureParameter(accumParam, pingtex[readTex]);
cgGLEnableTextureParameter(accumParam);
//
// redirect output to write-only pingpong texture
//
glDrawBuffer(attachmentpoints[writeTex]);
//
// render viewport-sized quad to perform actual computation
//
glBegin(GL_QUADS);
glTexCoord2f(0.0, 0.0); glVertex2f(0.0, 0.0);
glTexCoord2f(texSize, 0.0); glVertex2f(texSize, 0.0);
glTexCoord2f(texSize, texSize); glVertex2f(texSize,texSize);
glTexCoord2f(0.0, texSize); glVertex2f(0.0, texSize);
glEnd();
//
// swap pingpong input and output textures
readTex = 1-readTex;
writeTex = 1-writeTex;
}
}
}
#ifdef DEBUGDEBUG
checkErrors("compute");
#endif
glFinish();
double end_compute = clock();
//
// Read back results (into double precision arrays).
//
printf("Reading back results\n");
glFinish();
double start_readback = clock();
#ifdef USE_PBO
//
// Readback using PBOs is performed in a similar way as download is.
// First, we set one texture attached to our PBO as source for the
// readback and bind the buffer to the pixel-pack target.
// Note: PBOs 0--8 were used during download, hence #9
//
glReadBuffer(attachmentpoints[readTex]);
glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, ioBuf[9]);
//
// invalidate this buffer by passing NULL
//
// Note: according to the spec at
// http://www.opengl.org/registry/specs/ARB/vertex_buffer_object.txt
// GL_STREAM_READ hints at the driver that the data store contents
// will be specified once by the application, and used at most a few
// times by the application, which is exactly what we will do.
//
glBufferData(GL_PIXEL_PACK_BUFFER_ARB,texSize*texSize*sizeof(float), NULL, GL_STREAM_READ);
//
// In contrast to conventional glReadPixels(), this call returns
// immediately and just triggers a DMA transfer into the buffer object we
// allocated preciously.
//
glReadPixels (0, 0, texSize, texSize, GL_LUMINANCE, GL_FLOAT, BUFFER_OFFSET(0));
//
// Readbacks are only asynchroneous on the CPU side (since the above call
// returns immediately). To take advantage of this, a real application
// would have to perform a lot of independent work on some other data
// while the data is DMA'ed in the background.
//
// In our case, there is no work to be done, so we acquire a pointer to
// the data by mapping the buffer to the pixel pack target. We again
// indicate what we want to do with the pointer (accessing the data
// read-only). Note that this call will block until the data is available.
//
void* mem = glMapBuffer(GL_PIXEL_PACK_BUFFER_ARB, GL_READ_ONLY);
assert(mem);
//
// convert data back to double precision since the underlying "application"
// will continue using the data in double precision.
//
copy(mem,data);
//
// Release pointer and buffer object back to driver control, and unbind the
// buffer object
//
glUnmapBuffer(GL_PIXEL_PACK_BUFFER_ARB);
glBindBuffer(GL_PIXEL_PACK_BUFFER_ARB, 0);
#else
//
// The conventional glReadPixels() will block until all data has been
// copied into the array we pass in as last argument.
// This includes the transfer and the eventual rearrangement back to a
// CPU-friendly (or better: non-GPU-optimised) layout in memory.
//
glReadBuffer(attachmentpoints[readTex]);
glReadPixels(0, 0, texSize, texSize,GL_LUMINANCE,GL_FLOAT,tmpfloat);
copy(tmpfloat, data);
#endif
glFinish();
double end_readback = clock();
#ifdef DEBUGDEBUG
checkErrors("readback");
#endif
//
// do result comparison, exemplarily for array entry 88
// since I am too lazy to add a random number generator.
//
double res = 0;
for (int iter=0; iter<numIterations; iter++) {
for (int i=0; i<numChunks; i++)
for (int j=0; j<numArraysPerChunk; j++)
for (int k=0; k<numKernelSteps; k++)
res += i*10.0 + j + 1;
}
printf("%d: should be: %.2f, is: %.2f\n",88, res, data[88]);
//
// print out timings
//
double time_compute = (end_compute - start_compute)/CLOCKS_PER_SEC;
double time_readback = (end_readback - start_readback)/CLOCKS_PER_SEC;
double data_download = (double)N*numChunks*numArraysPerChunk*numIterations*sizeof(double) / (1024.0*1024.0);
double data_readback = (double)N*sizeof(double) / (1024.0*1024.0);
printf("DOWNLOAD\n");
printf("Total amount of doubleprec data : %.2f MB\n", data_download);
printf("Compute time : %.2f sec\n", time_compute);
printf("Throughput: : %.2f MB/s\n", data_download / time_compute);
printf("READBACK\n");
printf("Total amount of doubleprec data : %.2f MB\n", data_readback);
printf("Compute time : %.2f sec\n",time_readback);
printf("Throughput: : %.2f MB/s\n", data_readback / time_readback);
//
// clean up
//
for (int chunk=0; chunk<numChunks; chunk++) {
for (int array=0; array<numArraysPerChunk; array++)
delete [] chunks[chunk][array];
delete [] chunks[chunk];
}
delete [] tmpfloat;
delete [] data;
delete [] chunks;
glDeleteFramebuffersEXT (1,&fb);
#ifdef USE_PBO
glDeleteBuffers(2, ioBuf);
#endif
glDeleteTextures(numArraysPerChunk,inputTextures);
glDeleteTextures(2,pingtex);
return 0;
}