Programming of multiple GPUs with CUDA and Qt library Alexey Abramov abramov _at_ physik3.gwdg.de...

Programming of multiple GPUs with CUDA and Qt library

Alexey Abramov abramov _at_ physik3.gwdg.de

Georg-August University, Bernstein Center for Computational Neuroscience,

III Physikalisches Institut, Göttingen, Germany

Lecture

Alexey Abramov (BCCN, Göttingen) 11-03-11 2/21

Multi-GPU programming

A host system can have multiple devices. Several host threads can execute device code

on the same device, but by design, a host thread can execute device code on only one

device at any given time. As a consequence, multiple host threads are required to

execute device code on multiple devices.

Alexey Abramov (BCCN, Göttingen)

In order to issue work to a GPU, a context is established between a CPU thread and the

GPU. Only one context can be active on a GPU at a time.

11-03-11 3/21

Even though a GPU can execute calls from one context at a time, it can belong to

multiple contexts. For example, it is possible for several CPU threads to establish

contexts with the same GPU.

11-03-11 4/21

A host thread can execute device code on only one device at any given time.

(it will be possible in CUDA 4.0)

11-03-11 5/21

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <iostream>

#include <cuda_runtime_api.h>
#include <cutil_inline.h>
#include <multithreading.h>

#include <QThread>

#include "simpleMultiGPU.h"

// Per-GPU work descriptor: one instance is handed to each worker thread.
typedef struct {
    // Device id
    int device;

    // Host-side input data
    int dataN;      // number of floats in this GPU's slice
    float *h_Data;  // first element of this GPU's slice of the input array

    // Partial sum for this GPU (the worker thread writes its result here)
    float *h_Sum;
} TGPUplan;
// 11-03-11 6/21

// Data configuration
const int MAX_GPU_COUNT = 32;     // upper bound on the number of GPUs used
const int DATA_N = 1048576 * 32;  // total number of input floats

// Forward declaration: the per-GPU worker routine is defined after main().
static CUT_THREADPROC solverThread(TGPUplan *plan);

/*
 * Sums DATA_N random floats by splitting the array across all available
 * CUDA devices (one host thread per device), then repeats the sum on the
 * host CPU and prints a comparison of the two results.
 *
 * Returns 0 after the comparison has been printed, or EXIT_FAILURE when no
 * usable device is found or the input buffer cannot be allocated.
 */
int main(int argc, char **argv){
    // Solver config
    TGPUplan plan[MAX_GPU_COUNT];

    // GPU reduction results (memset replaces the legacy, non-ISO bzero)
    float h_SumGPU[MAX_GPU_COUNT];
    memset(h_SumGPU, 0, MAX_GPU_COUNT * sizeof(float));

    // OS thread ID
    CUTThread threadID[MAX_GPU_COUNT];

    // create a timer to measure runtime
    unsigned int hTimer;
    cutCreateTimer(&hTimer);

    // 11-03-11 7/21

    // get number of available CUDA-capable devices
    int deviceCount = 0;
    cudaError_t err = cudaGetDeviceCount(&deviceCount);

    if(err != cudaSuccess){
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
        cutDeleteTimer(hTimer);
        return EXIT_FAILURE;
    }

    if(deviceCount > MAX_GPU_COUNT)
        deviceCount = MAX_GPU_COUNT;

    printf("CUDA-capable device count: %i\n", deviceCount);

    // without this guard DATA_N / deviceCount below would divide by zero
    if(deviceCount < 1){
        fprintf(stderr, "No CUDA-capable device found\n");
        cutDeleteTimer(hTimer);
        return EXIT_FAILURE;
    }

    printf("Generating input data...\n\n");

    float *h_Data = (float *)malloc(DATA_N * sizeof(float));

    if(h_Data == NULL){
        fprintf(stderr, "Failed to allocate %d input floats\n", DATA_N);
        cutDeleteTimer(hTimer);
        return EXIT_FAILURE;
    }

    for(int i = 0; i < DATA_N; i++)
        h_Data[i] = (float)rand() / (float)RAND_MAX;

    // subdividing input data across GPUs
    // get data sizes for each GPU
    for(int i = 0; i < deviceCount; i++)
        plan[i].dataN = DATA_N / deviceCount;

    // 11-03-11 8/21

    // take into account "odd" data sizes:
    // the first (DATA_N % deviceCount) GPUs get one extra element each
    for(int i = 0; i < DATA_N % deviceCount; i++)
        plan[i].dataN++;

    // assign data ranges to GPUs
    int gpuBase = 0;

    for(int i = 0; i < deviceCount; i++){
        plan[i].device = i;
        plan[i].h_Data = h_Data + gpuBase;
        plan[i].h_Sum = h_SumGPU + i;
        gpuBase += plan[i].dataN;
    }

    // start timing and compute on GPU(s)
    printf("Computing with %d GPU's...\n", deviceCount);

    cutResetTimer(hTimer);
    cutStartTimer(hTimer);

    // 11-03-11 9/21

    // create deviceCount threads
    for(int i = 0; i < deviceCount; i++)
        threadID[i] = cutStartThread((CUT_THREADROUTINE)solverThread,
                                     (void*)(plan + i));

    cutWaitForThreads(threadID, deviceCount);

    float sumGPU = 0;

    // get the final sum
    for(int i = 0; i < deviceCount; i++)
        sumGPU += h_SumGPU[i];

    cutStopTimer(hTimer);
    printf("GPU Processing time: %f (ms)\n\n", cutGetTimerValue(hTimer));

    // 11-03-11 10/21

    // compute on Host CPU
    printf("Computing with Host CPU...\n\n");
    double sumCPU = 0;

    for(int i = 0; i < DATA_N; i++)
        sumCPU += h_Data[i];

    // compare GPU and CPU results
    printf("Comparing GPU and Host CPU results...\n");
    double diff = fabs(sumCPU - sumGPU) / fabs(sumCPU);
    printf(" GPU sum: %f\n CPU sum: %f\n", sumGPU, sumCPU);
    printf(" Relative difference: %E \n\n", diff);
    // "%s" keeps the verdict text out of the format-string position
    printf("%s", (diff < 1e-5) ? "PASSED\n\n" : "FAILED\n\n");

    // cleanup and shutdown
    printf("Shutting down...\n");
    cutDeleteTimer(hTimer);
    free(h_Data);
    cudaThreadExit();

    return 0;
}

11-03-11 11/21

/*
 * Worker routine run by one host thread per GPU.
 *
 * Selects plan->device, copies plan->dataN floats from plan->h_Data to that
 * device, reduces them to ACCUM_N partial sums with launch_reduceKernel(),
 * adds the partial sums on the host and stores the total in *(plan->h_Sum).
 *
 * On any CUDA or allocation failure a message is written to stderr and
 * *(plan->h_Sum) is left untouched; all buffers are released either way.
 */
static CUT_THREADPROC solverThread(TGPUplan *plan){
    const int BLOCK_N = 32;
    const int THREAD_N = 256;
    const int ACCUM_N = BLOCK_N * THREAD_N;  // one partial sum per GPU thread

    float *d_Data = NULL, *d_Sum = NULL;
    float *h_Sum = NULL;
    float sum;

    int i;

    // set device
    cudaError_t err = cudaSetDevice(plan->device);

    // allocate memory
    if(err == cudaSuccess)
        err = cudaMalloc((void**)&d_Data, plan->dataN * sizeof(float));
    if(err == cudaSuccess)
        err = cudaMalloc((void**)&d_Sum, ACCUM_N * sizeof(float));
    h_Sum = (float *)malloc(ACCUM_N * sizeof(float));

    // 11-03-11 12/21

    if(err == cudaSuccess && h_Sum != NULL){
        // copy input data from CPU
        err = cudaMemcpy(d_Data, plan->h_Data, plan->dataN * sizeof(float),
                         cudaMemcpyHostToDevice);

        if(err == cudaSuccess){
            // perform GPU computations
            launch_reduceKernel(d_Sum, d_Data, plan->dataN, BLOCK_N, THREAD_N);

            // read back GPU results
            err = cudaMemcpy(h_Sum, d_Sum, ACCUM_N * sizeof(float),
                             cudaMemcpyDeviceToHost);
        }

        if(err == cudaSuccess){
            sum = 0;

            for(i = 0; i < ACCUM_N; i++)
                sum += h_Sum[i];

            *(plan->h_Sum) = (float)sum;
        }
    }

    if(err != cudaSuccess)
        fprintf(stderr, "GPU %d: CUDA error: %s\n", plan->device,
                cudaGetErrorString(err));
    else if(h_Sum == NULL)
        fprintf(stderr, "GPU %d: host allocation failed\n", plan->device);

    // shut down this GPU (free(NULL) and cudaFree(NULL) are no-ops)
    free(h_Sum);
    cudaFree(d_Sum);
    cudaFree(d_Data);
    CUT_THREADEND;
}

11-03-11 13/21

// Forward declaration: the kernel is defined after this launcher.
__global__ static void reduceKernel(float *d_Result, float *d_Input, int N);

/*
 * Launches reduceKernel with BLOCK_N blocks of THREAD_N threads and blocks
 * the calling host thread until the kernel has finished.
 *
 * d_Result must hold at least BLOCK_N * THREAD_N floats, because every
 * launched thread writes exactly one partial sum.
 * Launch and execution errors are reported on stderr.
 */
void launch_reduceKernel(float *d_Result, float *d_Input, int N,
                         int BLOCK_N, int THREAD_N) {
    reduceKernel<<<BLOCK_N, THREAD_N>>>(d_Result, d_Input, N);

    // a kernel launch returns nothing; configuration errors surface here
    cudaError_t err = cudaGetLastError();
    if(err != cudaSuccess)
        fprintf(stderr, "reduceKernel launch failed: %s\n",
                cudaGetErrorString(err));

    // wait for completion; errors raised while the kernel ran surface here
    err = cudaThreadSynchronize();
    if(err != cudaSuccess)
        fprintf(stderr, "reduceKernel execution failed: %s\n",
                cudaGetErrorString(err));
}

/*
 * Each thread adds up the elements of d_Input found at its global index
 * and every (gridDim.x * blockDim.x)-th element after it, below N, and
 * writes that partial sum to d_Result[global index].
 */
__global__ static void reduceKernel(float *d_Result, float *d_Input, int N){
    const int stride = gridDim.x * blockDim.x;
    const int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;

    float partial = 0;
    int pos = globalIdx;

    while(pos < N){
        partial += d_Input[pos];
        pos += stride;
    }

    d_Result[globalIdx] = partial;
}
// 11-03-11 14/21

class QThread;

// class for Qt thread with a GPU contextclass CDeviceThread: public QThread{

private: TGPUplan *m_pPlan;

protected: void run();

public: CDeviceThread(){}; ~CDeviceThread(){};

void Init(TGPUplan *plan){ m_pPlan = plan; };};

The QThread class provides platform-independent threads.

QThread class for multi-GPU programming

11-03-11 15/21

int main(int argc, char **argv){

CDeviceThread *pThreads[MAX_GPU_COUNT];

// create deviceCount threads for(int i = 0; i < deviceCount; i++){ CDeviceThread *pDevice = new CDeviceThread; pDevice->Init(plan+i); pThreads[i] = pDevice; }

// start threads for(int i = 0; i < deviceCount; i++) pThreads[i]->start();

// wait for threads for(int i = 0; i < deviceCount; i++) pThreads[i]->wait();

…11-03-11 16/21

// cleanup

for(int i = 0; i < deviceCount; i++)

delete pThreads[i]; …

void CDeviceThread::run(){

std::cout << "CDeviceThread thread ID = " << QThread::currentThreadId()

<< std::endl;

std::cout << "Device = " << m_pPlan->device << std::endl; std::cout << "DataN = " << m_pPlan->dataN << std::endl;

const int BLOCK_N = 32; const int THREAD_N = 256; const int ACCUM_N = BLOCK_N * THREAD_N;

float *d_Data,*d_Sum; float *h_Sum; float sum; 11-03-11 17/21

int i;

// set device

cudaSetDevice(m_pPlan->device);

// allocate memory

cudaMalloc((void**)&d_Data, m_pPlan->dataN * sizeof(float));

cudaMalloc((void**)&d_Sum, ACCUM_N * sizeof(float));

h_Sum = (float *)malloc(ACCUM_N * sizeof(float));

// copy input data from CPU

cudaMemcpy(d_Data, m_pPlan->h_Data, m_pPlan->dataN * sizeof(float),

cudaMemcpyHostToDevice);

// perform GPU computations

launch_reduceKernel(d_Sum, d_Data, m_pPlan->dataN, BLOCK_N, THREAD_N);

11-03-11 18/21

// read back GPU results

cudaMemcpy(h_Sum, d_Sum, ACCUM_N * sizeof(float),

cudaMemcpyDeviceToHost);

// finalize GPU reduction for current subvector

sum = 0;

for(i = 0; i < ACCUM_N; i++)

sum += h_Sum[i];

*(m_pPlan->h_Sum) = (float)sum;

// shut down this GPU

free(h_Sum);

cudaFree(d_Sum);

cudaFree(d_Data);

11-03-11 19/21

Bibliography

NVIDIA CUDA Programming Guide

CUDA C Best Practices Guide

Qt documentation http://qt.nokia.com/

11-03-11 20/21

Thank you for your attention !

QUESTIONS ?

Göttingen, 11.03.2011

Programming of multiple GPUs with CUDA and Qt library Alexey Abramov abramov _at_ physik3.gwdg.de...

Documents

St.-Petersburg State Polytechnic University Department of Aerodynamics, St.-Petersburg, Russia A. ABRAMOV, N. IVANOV & E. SMIRNOV Numerical analysis of

Physiological Mechanisms of Color Vision Israel Abramov

Bash Guide for Beginners · 2009. 1. 19. · Bash Guide for Beginners Machtelt Garrels Garrels BVBA Version 1.11 Last updated 20081227

Jason Barkes, Marcelo R. Barrios, Francis Cougard, Paul G ...gwdu05.gwdg.de/~applsw/Parallelrechner/sp_documentation/hw/sg... · Didac Marin, Hari Reddy, Theeraphong Thitayanun

"Single-Entry Window" as a Platform for an OpenCourseWare Repository Alexey ABRAMOV, Maria BULAKINA, Alexey SIGALOV, State Institute of Information Technologies

Alexey Abramov , Christopher Bayer , Claudio Heller

Lecture on the concept of heritability in Plant Breeding ( wlink@gwdg.de , March 2012)

Wolfgang Link wlink@gwdg.de 2021 itoC First pages is

What a contrastive analysis of copula sentences ... · What a contrastive analysis of copula sentences can tell us about the development of the Romance pronominal address systems SASCHA GAGLIA SGAGLIA@GWDG.DE

Research Paper Therapeutic Fluorescent Hybrid …Corresponding authors: Prof. Frauke Alves (falves@gwdg.de)Translational Molecular Imaging, Max-, Planck Institute for Experimental

Investigating behavior and ecology of …...Primate Center (DPZ), Leibniz Institute for Primate Research, Kellnerweg 4, 37077 Göttingen, Germany E-mail: mdammha@gwdg.de 2ETH Zürich,

Air Quality at Bus Stops - Computer Action Teamweb.cecs.pdx.edu/~maf/Journals/2012_Air_Quality _at_ Bus_Stops.pdfstop. Bus stop location is considered to be one of the most important

Enrico Schiattarella - polito.it · enrico _dot_ schiattarella _at_ gmail _dot_ com Thanks! ... In this thesis we present two novel switching architectures, ... 5.2.1 Monolithic DRRM

Metacomputation. A Gentle Introduction Sergei Abramov Ailamazyan Program Systems Institute of RAS Ilya Klyuchnikov Keldysh Institute of Applied Mathematics

LDAP Linux HOWTOldp.mirror.sdv.fr/HOWTO/pdf/LDAP-HOWTO.pdf · LDAP Linux HOWTO Luiz Ernesto Pinheiro Malère v1.10, 2007−03−18 Revision History Revision

Bash Guide for Beginners Shell... · 2017-01-16 · Bash Guide for Beginners Machtelt Garrels Garrels BVBA Version 1.11 Last updated

GWDG – Kurs Parallelrechner-Programmierung mit MPI MPI Eine Einführung Oswald Haan ohaan@gwdg.de

Program Systems Institute of the Russian Academy of Sciences Research Center for Multiprocessor Systems Doctor Sergei Abramov Supercomputers and multiprocessor

Abramov, O., and D.A. Kring, Impact-induced hydrothermal activity on early Mars, J. Geophys. Res

Gwdg.de Site Info - Ethiopian Review – Ethiopian News and ... is ranked number 108,783 in the world according to the Alexa Traffic Rank. Search Engine Marketing (SEM) Opportunities