GPGPU - Universiteit Utrecht 4.pdf · Variable output Verlet fluid solver: Go over the cells, and create an array of all particle pairs. Process this array in a second pass (which

GPGPUIGAD – 2014/2015

Lecture 4

Jacco Bikker

Today:

Demo time!

Parallel scan

Parallel sort

Assignment

Demo Time

Parallel scan

What it is:

in:  1 1 6 2 7 3 2

out: 0 1 2 8 10 17 20

C++:

out[0] = 0;

for ( i = 1; i < n; i++ ) out[i] = in[i-1] + out[i-1];

Parallel scan

What it is good for:

Building block for many parallel algorithms:

Output to array of variable number of elements per thread

Summed area tables

Compaction

Variable output

Verlet fluid solver:

Go over the cells, and create an array of all particle pairs. Process this array in a

second pass (which will have full GPU utilization).

Each cell will emit 0..MAXPARTICLES-1 entries in the output array.

wi wi wi wi wi wi wi wi warp 0

0 2 4 0 1 5 0 1

0 0 2 6 6 7 7 7

Summed Area Tables

What it is:

A table containing, for each pixel P of an

image, the sum of all pixels between (0,0)

and P.

Using a SAT, we can calculate an arbitrary-width box filter in O(1):

Compaction

What it is:

When in a multi-pass algorithm not all data requires the same number

of passes, compaction ensures that subsequent passes have full warps.

What it is good for:

Whitted-style ray tracing. wi wi wi wi wi wi wi wi warp 0

Parallel scan: Algorithm

for ( d = 1; d <= log2(n); d++ )

for all k in parallel do

if k >= 2^(d-1)

x[k] += x[k - 2^(d-1)]

O(n log n)

Algorithm (2)

Up-sweep

Down-sweep

O(n)

Today:

Demo time!

Parallel scan

Parallel sort

Assignment

Parallel sort

Selection sort:

// Rank-based sort kernel (OpenCL C).
// Each work-item takes the element at its own global index, counts how many
// elements of `in` must precede it, and stores it at that rank in `out`.
// The element count is taken from the global work size.
__kernel void Sort( __global int* in, __global int* out )
{
    const int self = get_global_id( 0 );
    const int count = get_global_size( 0 );
    const int key = in[self];

    // rank = number of elements ordered before in[self]
    int rank = 0;
    for( int other = 0; other < count; other++ )
    {
        const int otherKey = in[other]; // every work-item reads the same address
        if (otherKey < key)
        {
            rank++;
        }
        else if (otherKey == key && other < self)
        {
            // equal keys: the lower input index goes first, so ranks stay unique
            rank++;
        }
    }
    out[rank] = key;
}

Parallel sort

Merge sort:

Parallel sort

Parallel merge sort:

Main operation:

merge

if (*a < *b) *d++ = *a++;

else *d++ = *b++;

Parallel sort

Parallel merge sort:

Main operation:

merge

while (a < a_end && b < b_end)

if (*a < *b) *d++ = *a++;

else *d++ = *b++;

while (a < a_end) *d++ = *a++;

while (b < b_end) *d++ = *b++;

Parallel merge

What it is:

Given two sorted sequences a and b, produce sorted sequence c:

Note: the position of a_i in c is i + f( b, a_i ), where f( b, x ) is the number of elements

in b smaller than x.

Since b is sorted, finding f( b, x ) can be done using a binary search.

a: 0 3 5 5 7 8 8 8

b: 1 2 3 4 4 5 8 9

c: 0 1 2 3 3 4 4 5 5 5 7 8 8 8 8 9

Sorting Networks

http://en.wikipedia.org/wiki/Sorting_network

http://en.wikipedia.org/wiki/Sorting_network

Sorting Networks

Bitonic Sort

http://en.wikipedia.org/wiki/Bitonic_sorter

http://en.wikipedia.org/wiki/Bitonic_sorter

Bitonic Sort

// One compare-and-swap pass of a bitonic sort (OpenCL C).
// Each work-item handles one pair of elements of `data`, in place.
//   stage, passOfStage : select the distance between the two elements of a pair
//   width              : not used by this kernel
//   direction          : base sort direction; flipped for work-items whose
//                        bit `stage` of the global id is set
__kernel void Sort( __global uint* data, const uint stage, const uint passOfStage,
                    const uint width, const uint direction )
{
    const uint gid = get_global_id( 0 );

    // the two elements of a pair are `stride` apart
    const uint stride = 1 << (stage - passOfStage);
    const uint lo = (gid / stride) * 2 * stride + (gid % stride);
    const uint hi = lo + stride;

    // effective direction for this work-item
    uint dir = direction;
    if (((gid >> stage) & 1) == 1)
    {
        dir = 1 - dir;
    }

    const uint first = data[lo];
    const uint second = data[hi];

    uint small = first;
    uint large = second;
    if (first > second)
    {
        small = second;
        large = first;
    }

    // nonzero dir: smaller value goes to the lower index; zero: the opposite
    if (dir)
    {
        data[lo] = small;
        data[hi] = large;
    }
    else
    {
        data[lo] = large;
        data[hi] = small;
    }
}

Today:

Demo time!

Parallel scan

Parallel sort

Updated Template

Assignment

Template v3

#define CHECKCL(r) CheckCL( r, __FILE__, __LINE__ )

float GetTime();void StartTimer();float GetDuation();

Template v3

static cl_int getPlatformID( cl_platform_id* platform ){

char chBuffer[1024];cl_uint num_platforms, devCount; cl_platform_id* clPlatformIDs;cl_int error;*platform = NULL;CHECKCL( error = clGetPlatformIDs( 0, NULL, &num_platforms ) );if (num_platforms == 0) CHECKCL( -1 );clPlatformIDs = (cl_platform_id*)malloc( num_platforms * sizeof( cl_platform_id ) );error = clGetPlatformIDs( num_platforms, clPlatformIDs, NULL );

#ifdef USE_CPU_DEVICEcl_uint deviceType[2] = { CL_DEVICE_TYPE_CPU, CL_DEVICE_TYPE_CPU };char* deviceOrder[2][3] = { { "", "", "" }, { "", "", "" } };

#elsecl_uint deviceType[2] = { CL_DEVICE_TYPE_GPU, CL_DEVICE_TYPE_CPU };char* deviceOrder[2][3] = { { "NVIDIA", "AMD", "" }, { "", "", "" } };

#endif...

Template v3

glTexImage2D( textureType, 0, GL_RGBA32F, width, height, 0, GL_RGB, GL_FLOAT, data );glTexParameteri( textureType, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE );glTexParameteri( textureType, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE );glTexParameteri( textureType, GL_TEXTURE_MIN_FILTER, GL_NEAREST );glTexParameteri( textureType, GL_TEXTURE_MAG_FILTER, GL_NEAREST );

glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE );glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE );glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST );glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST );glTexImage2D( GL_TEXTURE_2D, 0, GL_RGBA32F, width, height, 0, GL_RGB, GL_FLOAT, data );

Template v3

class Buffer{public:

enum{

DEFAULT = 0,TEXTURE

};// constructor / destructorBuffer() : hostBuffer( 0 ) {}Buffer( unsigned int N, unsigned int t = DEFAULT );~Buffer();cl_mem* GetDevicePtr() { return &deviceBuffer; }unsigned int* GetHostPtr() { return hostBuffer; }void CopyToDevice();void CopyFromDevice();void CopyTo( Buffer* buffer );cl_int ParallelScan();cl_int ParallelSort();...

Today:

Demo time!

Parallel scan

Parallel sort

Assignment

Assignment

Assignment

Some options:

Fluid simulation with surface reconstruction

Cloth simulation

Flocking / Boids

Library of sorting functions for varying data sets, with analysis

Ray traced shadows for rasterizer

Mesh compression / decompression

“The End” (for now)

Next week:

Development tools

Debugging

Random numbers

Bonus material

Merge

Sort in

OpenCL

// Per-work-group merge sort in local memory (OpenCL C).
// Each work-group sorts its own block of get_local_size(0) elements:
//   in  : input data; the group reads the block starting at group_id * wg
//   out : output data; the group writes the same block range
//   aux : local scratch buffer, one int per work-item of the group
// The work-group size is expected to be a power of two (the index masks
// below rely on it).
__kernel void Sort( __global const int* in, __global int* out, __local int* aux )
{
    int i = get_local_id(0);            // index in workgroup
    int wg = get_local_size(0);         // workgroup size = block size, power of 2
    int offset = get_group_id(0) * wg;
    in += offset; out += offset;        // move in, out to block start
    aux[i] = in[i];                     // load block in aux[wg]
    barrier(CLK_LOCAL_MEM_FENCE);       // make sure aux is entirely up to date

    // now we will merge sub-sequences of length 1,2,...,wg/2
    for( int length = 1; length < wg; length <<= 1 )
    {
        // keys are kept as int so that negative values compare correctly
        int iKey = aux[i];
        int ii = i & (length - 1);          // index in our sequence in 0..length-1
        int sibling = (i - ii) ^ length;    // beginning of the sibling sequence
        int pos = 0;
        for( int inc = length; inc > 0; inc >>= 1 ) // increment for dichotomic search
        {
            // Clamp the probe to the last element of the sibling sequence:
            // once pos == length the unclamped index would run past the
            // sibling (and past aux[wg-1] for the last sequence). The result
            // is unaffected because pos is clamped to length below anyway.
            int j = min( sibling + pos + inc, sibling + length ) - 1;
            int jKey = aux[j];
            // ties are broken by index so every element gets a unique slot
            bool smaller = (jKey < iKey) || ( jKey == iKey && j < i );
            pos += (smaller) ? inc : 0;
            pos = min( pos, length );
        }
        int bits = 2 * length - 1;                      // mask for destination
        int dest = ((ii + pos) & bits) | (i & ~bits);   // dest idx in merged sequence
        barrier(CLK_LOCAL_MEM_FENCE);   // all reads of aux done before anyone writes
        aux[dest] = iKey;
        barrier(CLK_LOCAL_MEM_FENCE);   // all writes visible before the next round
    }
    out[i] = aux[i];                    // write output
}

Documents

GPGPU - Universiteit Utrecht 4.pdf · Variable output Verlet fluid solver: Go over the cells, and create an array of all particle pairs. Process this array in a second pass (which