View
233
Download
4
Category
Preview:
Citation preview
Programming of multiple GPUs with CUDA and the Qt library
Alexey Abramov, abramov _at_ physik3.gwdg.de
Georg-August University, Bernstein Center for Computational Neuroscience,
III Physikalisches Institut, Göttingen, Germany
Lecture
Alexey Abramov (BCCN, Göttingen) 11-03-11 2/21
Multi-GPU programming
A host system can have multiple devices. Several host threads can execute device code
on the same device, but by design, a host thread can execute device code on only one
device at any given time. As a consequence, multiple host threads are required to
execute device code on multiple devices.
Alexey Abramov (BCCN, Göttingen)
Multi-GPU programming
In order to issue work to a GPU, a context is established between a CPU thread and the
GPU. Only one context can be active on a GPU at a time.
11-03-11 3/21
Alexey Abramov (BCCN, Göttingen)
Multi-GPU programming
Even though a GPU can execute calls from only one context at a time, it can belong to
multiple contexts. For example, it is possible for several CPU threads to establish
contexts with the same GPU.
11-03-11 4/21
Alexey Abramov (BCCN, Göttingen)
Multi-GPU programming
A host thread can execute device code on only one device at any given time.
(Using several devices from a single host thread will become possible in CUDA 4.0.)
11-03-11 5/21
Alexey Abramov (BCCN, Göttingen)
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <multithreading.h>
#include <cutil_inline.h>
#include <cuda_runtime_api.h>
#include "simpleMultiGPU.h"
// Per-GPU work description: one instance is filled in by main() for every
// device and handed to the host thread that drives that device.
typedef struct {
    // Device id
    int device;

    // Host-side input data
    int dataN;       // number of floats in this GPU's slice
    float *h_Data;   // first element of the slice inside the host input array

    // Partial sum for this GPU
    float *h_Sum;    // where the worker stores its result
} TGPUplan;
// 11-03-11 6/21
Alexey Abramov (BCCN, Göttingen)
// Data configuration
const int MAX_GPU_COUNT = 32;          // upper bound on the number of devices used
const int DATA_N = 1048576 * 32;       // total number of input floats
// Worker routine executed by each host thread; it is defined after main().
static CUT_THREADPROC solverThread(TGPUplan *plan);

// Sums DATA_N random floats on all available GPUs, using one host thread per
// device, and compares the result with a reference sum computed on the CPU.
// Returns 0 after a completed run and EXIT_FAILURE when no device or no host
// memory is available.
int main(int argc, char **argv){
    // Solver config
    TGPUplan plan[MAX_GPU_COUNT];

    // GPU reduction results (zeroed so that a failed worker contributes 0)
    float h_SumGPU[MAX_GPU_COUNT];
    memset(h_SumGPU, 0, sizeof(h_SumGPU));

    // OS thread ID
    CUTThread threadID[MAX_GPU_COUNT];

    // create a timer to measure runtime
    unsigned int hTimer;
    cutCreateTimer(&hTimer);
// 11-03-11 7/21
// Alexey Abramov (BCCN, Göttingen)
    // get number of available CUDA-capable devices
    int deviceCount = 0;
    cudaError_t err = cudaGetDeviceCount(&deviceCount);
    if(err != cudaSuccess){
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
        cutDeleteTimer(hTimer);
        return EXIT_FAILURE;
    }
    if(deviceCount > MAX_GPU_COUNT)
        deviceCount = MAX_GPU_COUNT;
    printf("CUDA-capable device count: %i\n", deviceCount);

    // the data split below divides by deviceCount, so it must be positive
    if(deviceCount < 1){
        fprintf(stderr, "No CUDA-capable device available\n");
        cutDeleteTimer(hTimer);
        return EXIT_FAILURE;
    }

    printf("Generating input data...\n\n");
    float *h_Data = (float *)malloc(DATA_N * sizeof(float));
    if(h_Data == NULL){
        fprintf(stderr, "Failed to allocate host input buffer\n");
        cutDeleteTimer(hTimer);
        return EXIT_FAILURE;
    }
    for(int i = 0; i < DATA_N; i++)
        h_Data[i] = (float)rand() / (float)RAND_MAX;

    // subdividing input data across GPUs
    // get data sizes for each GPU
    for(int i = 0; i < deviceCount; i++)
        plan[i].dataN = DATA_N / deviceCount;
// 11-03-11 8/21
// Alexey Abramov (BCCN, Göttingen)
    // take into account "odd" data sizes
    for(int i = 0; i < DATA_N % deviceCount; i++)
        plan[i].dataN++;

    // assign data ranges to GPUs
    int gpuBase = 0;
    for(int i = 0; i < deviceCount; i++){
        plan[i].device = i;
        plan[i].h_Data = h_Data + gpuBase;
        plan[i].h_Sum = h_SumGPU + i;
        gpuBase += plan[i].dataN;
    }

    // start timing and compute on GPU(s)
    printf("Computing with %d GPU's...\n", deviceCount);
    cutResetTimer(hTimer);
    cutStartTimer(hTimer);
// 11-03-11 9/21
// Alexey Abramov (BCCN, Göttingen)
    // create deviceCount threads
    for(int i = 0; i < deviceCount; i++)
        threadID[i] = cutStartThread((CUT_THREADROUTINE)solverThread, (void*)(plan + i));
    cutWaitForThreads(threadID, deviceCount);

    float sumGPU = 0;
    // get the final sum
    for(int i = 0; i < deviceCount; i++)
        sumGPU += h_SumGPU[i];

    cutStopTimer(hTimer);
    printf("GPU Processing time: %f (ms)\n\n", cutGetTimerValue(hTimer));
// 11-03-11 10/21
// Alexey Abramov (BCCN, Göttingen)
    // compute on Host CPU
    printf("Computing with Host CPU...\n\n");
    double sumCPU = 0;
    for(int i = 0; i < DATA_N; i++)
        sumCPU += h_Data[i];

    // compare GPU and CPU results
    printf("Comparing GPU and Host CPU results...\n");
    double diff = fabs(sumCPU - sumGPU) / fabs(sumCPU);
    printf(" GPU sum: %f\n CPU sum: %f\n", sumGPU, sumCPU);
    printf(" Relative difference: %E \n\n", diff);
    printf("%s", (diff < 1e-5) ? "PASSED\n\n" : "FAILED\n\n");

    // cleanup and shutdown
    printf("Shutting down...\n");
    cutDeleteTimer(hTimer);
    free(h_Data);
    cudaThreadExit();
    return 0;
}
11-03-11 11/21
Alexey Abramov (BCCN, Göttingen)
// Host thread routine: reduces one GPU's slice of the input to a single
// partial sum.
//
// plan - device id, input slice (h_Data, dataN) and result slot (h_Sum).
//
// On success the partial sum is written to *plan->h_Sum. If any CUDA call or
// the host allocation fails, a message is printed to stderr and *plan->h_Sum
// is left unchanged.
static CUT_THREADPROC solverThread(TGPUplan *plan){
    const int BLOCK_N = 32;
    const int THREAD_N = 256;
    // one partial result per launched GPU thread
    const int ACCUM_N = BLOCK_N * THREAD_N;

    float *d_Data = NULL, *d_Sum = NULL;
    float *h_Sum = NULL;
    float sum;
    int i;

    // set device
    cudaError_t err = cudaSetDevice(plan->device);

    // allocate memory
    if(err == cudaSuccess)
        err = cudaMalloc((void**)&d_Data, plan->dataN * sizeof(float));
    if(err == cudaSuccess)
        err = cudaMalloc((void**)&d_Sum, ACCUM_N * sizeof(float));
    h_Sum = (float *)malloc(ACCUM_N * sizeof(float));
// 11-03-11 12/21
// Alexey Abramov (BCCN, Göttingen)
    if(h_Sum == NULL)
        fprintf(stderr, "solverThread: host allocation failed (device %d)\n", plan->device);

    if(err == cudaSuccess && h_Sum != NULL){
        // copy input data from CPU
        err = cudaMemcpy(d_Data, plan->h_Data, plan->dataN * sizeof(float),
                         cudaMemcpyHostToDevice);

        // perform GPU computations
        if(err == cudaSuccess){
            launch_reduceKernel(d_Sum, d_Data, plan->dataN, BLOCK_N, THREAD_N);
            err = cudaGetLastError();
        }

        // read back GPU results
        if(err == cudaSuccess)
            err = cudaMemcpy(h_Sum, d_Sum, ACCUM_N * sizeof(float), cudaMemcpyDeviceToHost);

        // finalize the reduction on the host and publish the partial sum
        if(err == cudaSuccess){
            sum = 0;
            for(i = 0; i < ACCUM_N; i++)
                sum += h_Sum[i];
            *(plan->h_Sum) = (float)sum;
        }
    }

    if(err != cudaSuccess)
        fprintf(stderr, "solverThread: CUDA error on device %d: %s\n",
                plan->device, cudaGetErrorString(err));

    // shut down this GPU (the pointers are NULL if never allocated)
    free(h_Sum);
    cudaFree(d_Sum);
    cudaFree(d_Data);
    CUT_THREADEND;
}
11-03-11 13/21
Alexey Abramov (BCCN, Göttingen)
// Forward declaration: the kernel is defined below its first use.
__global__ static void reduceKernel(float *d_Result, float *d_Input, int N);

// Host wrapper: launches reduceKernel with BLOCK_N blocks of THREAD_N threads
// and blocks until the kernel has finished.
//
// d_Result - device buffer receiving one partial sum per launched thread
//            (BLOCK_N * THREAD_N floats)
// d_Input  - device buffer with N input floats
//
// Launch and execution errors are reported on stderr. The launch status is
// only peeked at, so the caller can still query it with cudaGetLastError().
void launch_reduceKernel(float *d_Result, float *d_Input, int N,
                         int BLOCK_N, int THREAD_N) {
    reduceKernel<<<BLOCK_N, THREAD_N>>>(d_Result, d_Input, N);

    cudaError_t err = cudaPeekAtLastError();
    if (err != cudaSuccess)
        fprintf(stderr, "reduceKernel launch failed: %s\n", cudaGetErrorString(err));

    err = cudaThreadSynchronize();
    if (err != cudaSuccess)
        fprintf(stderr, "reduceKernel execution failed: %s\n", cudaGetErrorString(err));
}
// Partial-sum kernel using a grid-stride loop: each thread adds up the input
// elements at its global index, index + total thread count, and so on, then
// stores its own partial sum.
//
// d_Result - output; every launched thread writes the element at its global
//            index, so it needs gridDim.x * blockDim.x entries
// d_Input  - input values
// N        - number of input values
__global__ static void reduceKernel(float *d_Result, float *d_Input, int N){
    const int stride = gridDim.x * blockDim.x;
    const int slot = blockIdx.x * blockDim.x + threadIdx.x;

    float acc = 0;
    int idx = slot;
    while (idx < N) {
        acc += d_Input[idx];
        idx += stride;
    }

    d_Result[slot] = acc;
}
// 11-03-11 14/21
Alexey Abramov (BCCN, Göttingen)
// NOTE(review): deriving from QThread needs its complete definition (normally
// from <QThread>) to be visible here; this forward declaration alone is not
// enough - confirm the real header includes it.
class QThread;

// class for Qt thread with a GPU context
class CDeviceThread: public QThread{
private:
    // Work description used by run(); set through Init() and never freed here.
    TGPUplan *m_pPlan;

protected:
    // Thread body: performs the reduction described by m_pPlan.
    void run();

public:
    // The plan pointer starts out null so a missing Init() call is detectable.
    CDeviceThread(): m_pPlan(0) {}
    ~CDeviceThread() {}

    // Stores the plan that run() will process.
    void Init(TGPUplan *plan){ m_pPlan = plan; }
};
The QThread class provides platform-independent threads.
QThread class for multi-GPU programming
11-03-11 15/21
Alexey Abramov (BCCN, Göttingen)
// Excerpt of the Qt-based main(): one CDeviceThread per device replaces the
// cutil threads. Parts elided on the slides are marked with "…".
int main(int argc, char **argv){
    CDeviceThread *pThreads[MAX_GPU_COUNT];
    // …
    // create deviceCount threads
    for(int i = 0; i < deviceCount; i++){
        CDeviceThread *pDevice = new CDeviceThread;
        pDevice->Init(plan+i);
        pThreads[i] = pDevice;
    }

    // start threads
    for(int i = 0; i < deviceCount; i++)
        pThreads[i]->start();

    // wait for threads
    for(int i = 0; i < deviceCount; i++)
        pThreads[i]->wait();
    // …
// 11-03-11 16/21
// Alexey Abramov (BCCN, Göttingen)
    // cleanup
    for(int i = 0; i < deviceCount; i++)
        delete pThreads[i];
    // …
}
// Thread body: reduces the input slice described by m_pPlan on the plan's
// device and stores the partial sum in *m_pPlan->h_Sum.
//
// If no plan was set, or any CUDA call or the host allocation fails, a message
// is written to std::cerr and *m_pPlan->h_Sum is left unchanged.
void CDeviceThread::run(){
    // m_pPlan is only set by Init(); without it there is nothing to do
    if (!m_pPlan) {
        std::cerr << "CDeviceThread: no plan set, call Init() first" << std::endl;
        return;
    }

    std::cout << "CDeviceThread thread ID = " << QThread::currentThreadId()
              << std::endl;
    std::cout << "Device = " << m_pPlan->device << std::endl;
    std::cout << "DataN = " << m_pPlan->dataN << std::endl;

    const int BLOCK_N = 32;
    const int THREAD_N = 256;
    // one partial result per launched GPU thread
    const int ACCUM_N = BLOCK_N * THREAD_N;

    float *d_Data = 0, *d_Sum = 0;
    float *h_Sum = 0;
    float sum;
// 11-03-11 17/21
// Alexey Abramov (BCCN, Göttingen)
    int i;

    // set device
    cudaError_t err = cudaSetDevice(m_pPlan->device);

    // allocate memory
    if (err == cudaSuccess)
        err = cudaMalloc((void**)&d_Data, m_pPlan->dataN * sizeof(float));
    if (err == cudaSuccess)
        err = cudaMalloc((void**)&d_Sum, ACCUM_N * sizeof(float));
    h_Sum = (float *)malloc(ACCUM_N * sizeof(float));
    if (!h_Sum)
        std::cerr << "CDeviceThread: host allocation failed (device "
                  << m_pPlan->device << ")" << std::endl;

    const bool ready = (err == cudaSuccess) && (h_Sum != 0);

    // copy input data from CPU
    if (ready)
        err = cudaMemcpy(d_Data, m_pPlan->h_Data, m_pPlan->dataN * sizeof(float),
                         cudaMemcpyHostToDevice);

    // perform GPU computations
    if (ready && err == cudaSuccess) {
        launch_reduceKernel(d_Sum, d_Data, m_pPlan->dataN, BLOCK_N, THREAD_N);
        err = cudaGetLastError();
    }
// 11-03-11 18/21
// Alexey Abramov (BCCN, Göttingen)
    // read back GPU results
    if (ready && err == cudaSuccess)
        err = cudaMemcpy(h_Sum, d_Sum, ACCUM_N * sizeof(float),
                         cudaMemcpyDeviceToHost);

    // finalize GPU reduction for current subvector
    if (ready && err == cudaSuccess) {
        sum = 0;
        for(i = 0; i < ACCUM_N; i++)
            sum += h_Sum[i];
        *(m_pPlan->h_Sum) = (float)sum;
    }

    if (err != cudaSuccess)
        std::cerr << "CDeviceThread: CUDA error on device " << m_pPlan->device
                  << ": " << cudaGetErrorString(err) << std::endl;

    // shut down this GPU (the pointers are null if never allocated)
    free(h_Sum);
    cudaFree(d_Sum);
    cudaFree(d_Data);
}
11-03-11 19/21
Alexey Abramov (BCCN, Göttingen)
Bibliography
NVIDIA CUDA Programming Guide
CUDA C Best Practices Guide
Qt documentation http://qt.nokia.com/
11-03-11 20/21
Recommended