Upload
others
View
3
Download
0
Embed Size (px)
Citation preview
GPGPU
IGAD – 2014/2015
Lecture 4
Jacco Bikker
Today:
Demo time!
Parallel scan
Parallel sort
Assignment
Demo Time
Parallel scan
What it is:
in:  1 1 6 2 7 3 2
out: 0 1 2 8 10 17 20
C++:
out[0] = 0for ( i = 1; i < n; i++ ) out[i] = in[i-1] + out[i-1];
Parallel scan
What it is good for:
Building block for many parallel algorithms:
Output to array of variable number of elements per thread
Summed area tables
Compaction
Variable output
Verlet fluid solver:
Go over the cells, and create an array of all particle pairs. Process this array in a
second pass (which will have full GPU utilization).
Each cell will emit 0..MAXPARTICLES-1 entries in the output array.
wi wi wi wi wi wi wi wi warp 0
0 2 4 0 1 5 0 1
0 0 2 6 6 7 7 7
Summed Area Tables
What it is:
A table containing, for each pixel P of an
image, the sum of all pixels between (0,0)
and P.
Using a SAT, we can calculate an arbitrary-width box filter in O(1):
Compaction
What it is:
When in a multi-pass algorithm not all data requires the same number
of passes, compaction ensures that subsequent passes have full warps.
What it is good for:
Whitted-style ray tracing. wi wi wi wi wi wi wi wi warp 0
Parallel scan: Algorithm
for ( d = 1; d < log2(n); d++ )
    for all k in parallel do
        if k >= 2^d
            x[k] += x[k - 2^{d-1}]
O(n log n)
Algorithm (2)
Up-sweep
Down-sweep
O(n)
Today:
Demo time!
Parallel scan
Parallel sort
Assignment
Parallel sort
Selection sort:
// Each work-item ranks its own key against every key of the input and
// stores the key at that rank in the output.
__kernel void Sort( __global int* in, __global int* out )
{
    const int self = get_global_id( 0 );
    const int count = get_global_size( 0 );
    const int myKey = in[self];
    // rank = number of keys that have to end up in front of myKey
    int rank = 0;
    for( int other = 0; other < count; other++ )
    {
        const int otherKey = in[other]; // every work-item reads the same address
        // equal keys are ordered by their input index, so no two work-items share a rank
        if (otherKey < myKey || (otherKey == myKey && other < self)) rank++;
    }
    out[rank] = myKey;
}
Parallel sort
Merge sort:
Parallel sort
Parallel merge sort:
Main operation:
merge
if (*a < *b)*d++ = *a++;
else*d++ = *b++;
Parallel sort
Parallel merge sort:
Main operation:
merge
while (a < a_end && b < b_end)if (*a < *b)
*d++ = *a++;else
*d++ = *b++;while (a < a_end) *d++ = *a++;while (b < b_end) *d++ = *b++;
Parallel merge
What it is:
Given two sorted sequences a and b, produce sorted sequence c:
Note: position of a_i in c is i + f( b, a_i ) where f is the number of elements
in b smaller than a_i.
Since b is sorted, finding f( b, x ) can be done using a binary search.
a: 0 3 5 5 7 8 8 8
b: 1 2 3 4 4 5 8 9
c: 0 1 2 3 3 4 4 5 5 5 7 8 8 8 8 9
Sorting Networks
http://en.wikipedia.org/wiki/Sorting_network
Sorting Networks
Bitonic Sort
http://en.wikipedia.org/wiki/Bitonic_sorter
Bitonic Sort
// One compare-exchange pass of a bitonic sort: each work-item orders one pair
// of elements of data in place.
//   stage, passOfStage  select the distance between the two elements of a pair
//   width               not used by this kernel
//   direction           base ordering; non-zero puts the smaller value on the left
__kernel void Sort( __global uint* data, const uint stage, const uint passOfStage, const uint width, const uint direction )
{
    const uint gid = get_global_id( 0 );

    // locate the pair handled by this work-item
    const uint stride = 1 << (stage - passOfStage);
    const uint lo = (gid / stride) * 2 * stride + (gid % stride);
    const uint hi = lo + stride;

    const uint first = data[lo];
    const uint second = data[hi];

    // work-items whose index has bit 'stage' set sort in the opposite direction
    uint smallFirst = direction;
    if (((gid >> stage) & 1) == 1) smallFirst = 1 - smallFirst;

    const uint lesser = first > second ? second : first;
    const uint greater = first > second ? first : second;
    if (smallFirst)
    {
        data[lo] = lesser;
        data[hi] = greater;
    }
    else
    {
        data[lo] = greater;
        data[hi] = lesser;
    }
}
Today:
Demo time!
Parallel scan
Parallel sort
Updated Template
Assignment
Template v3
#define CHECKCL(r) CheckCL( r, __FILE__, __LINE__ )
float GetTime();void StartTimer();float GetDuation();
Template v3
static cl_int getPlatformID( cl_platform_id* platform ){
char chBuffer[1024];cl_uint num_platforms, devCount; cl_platform_id* clPlatformIDs;cl_int error;*platform = NULL;CHECKCL( error = clGetPlatformIDs( 0, NULL, &num_platforms ) );if (num_platforms == 0) CHECKCL( -1 );clPlatformIDs = (cl_platform_id*)malloc( num_platforms * sizeof( cl_platform_id ) );error = clGetPlatformIDs( num_platforms, clPlatformIDs, NULL );
#ifdef USE_CPU_DEVICEcl_uint deviceType[2] = { CL_DEVICE_TYPE_CPU, CL_DEVICE_TYPE_CPU };char* deviceOrder[2][3] = { { "", "", "" }, { "", "", "" } };
#elsecl_uint deviceType[2] = { CL_DEVICE_TYPE_GPU, CL_DEVICE_TYPE_CPU };char* deviceOrder[2][3] = { { "NVIDIA", "AMD", "" }, { "", "", "" } };
#endif...
Template v3
// Variant 1: upload the image to the target held in textureType, then set
// clamp-to-edge wrapping and nearest-neighbour filtering on that target.
// NOTE(review): internal format is GL_RGBA32F while the supplied pixel format is
// GL_RGB / GL_FLOAT - confirm that data holds three floats per texel.
glTexImage2D( textureType, 0, GL_RGBA32F, width, height, 0, GL_RGB, GL_FLOAT, data );
glTexParameteri( textureType, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE );
glTexParameteri( textureType, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE );
glTexParameteri( textureType, GL_TEXTURE_MIN_FILTER, GL_NEAREST );
glTexParameteri( textureType, GL_TEXTURE_MAG_FILTER, GL_NEAREST );
// Variant 2: the same parameters and upload, hard-wired to GL_TEXTURE_2D, with the
// parameters set before the image is uploaded.
glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE );
glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE );
glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST );
glTexParameteri( GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST );
glTexImage2D( GL_TEXTURE_2D, 0, GL_RGBA32F, width, height, 0, GL_RGB, GL_FLOAT, data );
Template v3
class Buffer{public:
enum{
DEFAULT = 0,TEXTURE
};// constructor / destructorBuffer() : hostBuffer( 0 ) {}Buffer( unsigned int N, unsigned int t = DEFAULT );~Buffer();cl_mem* GetDevicePtr() { return &deviceBuffer; }unsigned int* GetHostPtr() { return hostBuffer; }void CopyToDevice();void CopyFromDevice();void CopyTo( Buffer* buffer );cl_int ParallelScan();cl_int ParallelSort();...
Today:
Demo time!
Parallel scan
Parallel sort
Assignment
Assignment
Assignment
Some options:
Fluid simulation with surface reconstruction
Cloth simulation
Flocking / Boids
Library of sorting functions for varying data sets, with analysis
Ray traced shadows for rasterizer
Mesh compression / decompression
“The End” (for now)
Next week:
Development tools
Debugging
Random numbers
Bonus material
Merge
Sort in
OpenCL
// Sorts every work-group-sized block of the input with a merge sort that runs
// in local memory.
//   in   input values; work-group g reads the wg values starting at g * wg
//   out  receives the sorted block at the same offset
//   aux  local scratch, accessed at indices 0..wg-1
__kernel void Sort( __global const int* in, __global int* out, __local int* aux )
{
    int i = get_local_id(0); // index in workgroup
    int wg = get_local_size(0); // workgroup size = block size, power of 2
    int offset = get_group_id(0) * wg;
    in += offset; out += offset; // move in, out to block start
    aux[i] = in[i]; // load block in aux[wg]
    barrier(CLK_LOCAL_MEM_FENCE); // make sure AUX is entirely up to date
    // now we will merge sub-sequences of length 1,2,...,wg/2
    for( int length = 1; length < wg; length <<= 1 )
    {
        // NOTE: keys are compared as uint although the data is int, so negative
        // values are ordered after all non-negative ones
        uint iKey = aux[i];
        int ii = i & (length - 1); // index in our sequence in 0..length-1
        int sibling = (i - ii) ^ length; // beginning of the sibling sequence
        int pos = 0;
        // Dichotomic search for the number of sibling elements that precede iKey.
        // Once pos has reached length the whole sibling sequence precedes iKey and
        // the answer is final; stopping there avoids probing aux[] beyond the end
        // of the sibling sequence.
        for (int inc = length; inc > 0 && pos < length; inc >>= 1 )
        {
            int j = sibling + pos + inc - 1;
            uint jKey = aux[j];
            bool smaller = (jKey < iKey) || ( jKey == iKey && j < i );
            pos += (smaller) ? inc : 0;
            pos = min( pos, length );
        }
        int bits = 2 * length - 1; // mask for destination
        int dest = ((ii + pos) & bits) | (i & ~bits); // dest idx in merged sequence
        barrier(CLK_LOCAL_MEM_FENCE); // all reads of aux are done before anyone overwrites it
        aux[dest] = iKey;
        barrier(CLK_LOCAL_MEM_FENCE); // merged sequence is complete before the next round
    }
    out[i] = aux[i]; // write output
}