OPERATING SYSTEMS AND ARCHITECTURES
CS-M98:
COURSEWORK SOLUTION
Benjamin Mora 1Swansea University
Dr. Benjamin Mora
MARKING RANGE
2Benjamin MoraSwansea University
Full understanding of problem and solution (>97): ready for employment in the HPC sector. None of you (some very close, though)!
Almost there with multithreading (70 to 97): just need to see and understand the solution. Most students are in this category.
Real issues with multithreading concepts, merging temporary results, and a few basic C errors (50 to 70): some hard work is really needed to understand the full solution.
<50: Issues with basic (C) programming and algorithmic concepts, including pointers and creating data structures. Catching up is crucial!
Q1
3Benjamin MoraSwansea University
Alignment of data.
Similar to lab exercise.
See CPU part 1.
35 marks.
Q1
4Benjamin MoraSwansea University
// Converts an interleaved RGB image (array of structures: R,G,B,R,G,B,...)
// into three planar channel buffers (structure of arrays).
//
// image: interleaved input, read as 3 floats per pixel for x*y pixels.
// x, y : image dimensions; x*y pixels are copied.
//
// Side effects: allocates the file-level buffers imageRed/imageGreen/imageBlue
// (x*y+PADDING floats each) and sets alignedRed/alignedGreen/alignedBlue to a
// 32-byte-aligned position inside each of them. The buffers are not freed here.
// NOTE(review): the aligned pointer can start up to 8 floats into the raw
// buffer, so PADDING (defined elsewhere) is assumed to be >= 8 -- confirm.
void AoS_to_SoA (float *image, int x, int y){
    const int pixelCount=x*y;

    imageRed=new float[pixelCount+PADDING];
    imageGreen=new float[pixelCount+PADDING];
    imageBlue=new float[pixelCount+PADDING];

    // Offset (in floats, 0..7) of each raw buffer past the previous 32-byte
    // boundary. The ADDRESS held by the pointer is what must be inspected;
    // dereferencing it would convert an uninitialised pixel value instead and
    // yield a meaningless alignment.
    unsigned long long alignR=(((unsigned long long) imageRed)&31)/4;
    unsigned long long alignG=(((unsigned long long) imageGreen)&31)/4;
    unsigned long long alignB=(((unsigned long long) imageBlue)&31)/4;

    // Advance to the next 32-byte boundary (a full 8 floats when the raw
    // buffer is already aligned, which is still a valid aligned address).
    alignedRed=imageRed+8-alignR;
    alignedGreen=imageGreen+8-alignG;
    alignedBlue=imageBlue+8-alignB;

    float *R=alignedRed;
    float *G=alignedGreen;
    float *B=alignedBlue;

    // De-interleave: pixel i lives at image[3*i .. 3*i+2].
    for (int i=0;i<pixelCount;i++) {
        R[i]=image[3*i];
        G[i]=image[3*i+1];
        B[i]=image[3*i+2];
    }
}
Q2 LOOP FOR K ITERATIONS
5Benjamin MoraSwansea University
for (int k=0;k<knnIterations;k++){//1.init seed sums to 0for (int seed=0;seed<N;seed++){seedSums[0][seed]=0;seedSums[1][seed]=0;seedSums[2][seed]=0;seedCounters[seed]=0;
} …
Q2 THEN
6Benjamin MoraSwansea University
…//2. Determine and compute average of closer seedsfor (int pixel=0;pixel<x*y*3;pixel+=3){float maxDistance=10;int found=-1;for (int seed=0;seed<N;seed++) //Loop to be optimized{float dx=image[pixel+0]-seeds[0][seed];float dy=image[pixel+1]-seeds[1][seed];float dz=image[pixel+2]-seeds[2][seed];float distanceSquare=dx*dx+dy*dy+dz*dz;if (distanceSquare<maxDistance) { //A closer seed has been foundmaxDistance=distanceSquare;found=seed;
}}
Q2 RECOMPUTE NEW SEEDS
7Benjamin MoraSwansea University
//Last step for the iteration: compute average and update the current seed listfor (int seed=0;seed<N;seed++){if (seedCounters[seed]>0.01){seeds[0][seed]=seedSums[0][seed]/seedCounters[seed];seeds[1][seed]=seedSums[1][seed]/seedCounters[seed];seeds[2][seed]=seedSums[2][seed]/seedCounters[seed];
}}…//End of iteration
Q2
8Benjamin MoraSwansea University
Optimizing the inner loop
Process 8 pixels at a time: compare 8 pixels against one seed! Some were confused and tried 8 pixels vs. 8 seeds.
Use cmplt and blend to replace the condition. Two blend instructions are needed! Some replicated the mask computations!
The part after the inner loop cannot be parallelized, though. Still a good speed-up using SIMD, especially when the number of seeds is > 32. Many ways to do it. Extra cast computations were done by all of you!
Q2
9Benjamin MoraSwansea University
Optimization comes from: processing 8 pixels at a time; removing the branch (no if/then).
Still tricky to get a good speed-up.
Going further: loop unrolling. Minimize the number of computations inside the inner loop. Put all constant operations, like set1, outside the loop.
Avoid shared cache lines when multithreading!
Q2 LOOP FOR K ITERATIONS
10Benjamin MoraSwansea University
float seedSums[3][N];float seedCounters[N];//Seed initialization;for(int j=0;j<3;j++)
for(int i=0;i<N;i++)seeds[j][i]=(rand()+0.5f)/(RAND_MAX+1.f);
for (int k=0;k<knnIterations;k++){
for (int seed=0;seed<N;seed++){
seedSums[0][seed]=0;seedSums[1][seed]=0;seedSums[2][seed]=0;seedCounters[seed]=0;
}
Q2 LOOP FOR K ITERATIONS
11Benjamin MoraSwansea University
float seedSums[3][N];float seedCounters[N];float8 seedId[N];for (int seed=0;seed<N;seed++) seedId[seed]=set1((float &) seed);
for(int j=0;j<3;j++)for(int i=0;i<N;i++)seeds[j][i]=(rand()+0.5f)/(RAND_MAX+1.f);
for (int k=0;k<knnIterations;k++){float8 seeds8[3][N];for (int seed=0;seed<N;seed++){seedSums[0][seed]=0;seedSums[1][seed]=0;seedSums[2][seed]=0;seedCounters[seed]=0;seeds8[0][seed]=set1(seeds[0][seed]);seeds8[1][seed]=set1(seeds[1][seed]);seeds8[2][seed]=set1(seeds[2][seed]);
}
Q2 THEN
12Benjamin MoraSwansea University
…//2. Determine and compute average of closer seedsfor (int pixel=0;pixel<x*y*3;pixel+=3){float maxDistance=10;int found=-1;for (int seed=0;seed<N;seed++) //Loop to be optimized{float dx=image[pixel+0]-seeds[0][seed];float dy=image[pixel+1]-seeds[1][seed];float dz=image[pixel+2]-seeds[2][seed];float distanceSquare=dx*dx+dy*dy+dz*dz;if (distanceSquare<maxDistance) { //A closer seed has been foundmaxDistance=distanceSquare;found=seed;
}}
Q2 THEN
13Benjamin MoraSwansea University
float8 *R=(float8 *) alignedRed;float8 *G=(float8 *) alignedGreen;float8 *B=(float8 *) alignedBlue;for (int pixel=0;pixel<x*y;pixel+=8){float8 maxDistance=set1(10);float8 found8=set1(-1.f); //Just for initializationfor (int seed=0;seed<N;seed++) //Loop to be optimized{float8 dx=sub8(R[0],seeds8[0][seed]);float8 dy=sub8(G[0],seeds8[1][seed]);float8 dz=sub8(B[0],seeds8[2][seed]);
float8 distanceSquare=add8(add8(mul8(dx,dx),mul8(dy,dy)),mul8(dz,dz));
float8 comparison=cmplt8(distanceSquare,maxDistance);maxDistance=blend8(maxDistance,distanceSquare,comparison);found8=blend8(found8,seedId[seed],comparison);
}
Q2 THEN
14Benjamin MoraSwansea University
//Sum the pixel values to the appropriate seedfor (int i=0;i<8;i++){int found=(int&) found8.m256_f32[i];seedCounters[found]+=1.;seedSums[0][found]+=((float *) R)[i];seedSums[1][found]+=((float *) G)[i];seedSums[2][found]+=((float *) B)[i];
}R++;G++;B++;
}…
Q2 RECOMPUTE NEW SEEDS
15Benjamin MoraSwansea University
Still the same!!!//Last step for the iteration: compute average and update the current seed listfor (int seed=0;seed<N;seed++){if (seedCounters[seed]>0.01){seeds[0][seed]=seedSums[0][seed]/seedCounters[seed];seeds[1][seed]=seedSums[1][seed]/seedCounters[seed];seeds[2][seed]=seedSums[2][seed]/seedCounters[seed];
}}…//End of iteration
Q3
16Benjamin MoraSwansea University
Most of you got the principles more or less right. Practical implementation was wrong! Barriers were sometimes at the wrong location. Most of you added extra, unneeded barriers. Mutexes have been accepted.
Putting a lock on every seed change is too much/not good!
Errors: only using results from one thread at each iteration.
Q3 IDEA
17Benjamin MoraSwansea University
Break down the image into 4 pieces. For each thread iteration:
Copy seeds into local variables (performance). Loop over the current chunk of pixels.
Compute seedSums and seedCounters the same way. Copy results into globally visible but separate variables. Barrier. One thread:
Adds results from the other threads to its own results, then computes the RGB average and updates the seeds.
Barrier.
Q3 CREATING THREADS
18Benjamin MoraSwansea University
// Runs the SIMD colour-compression pass on nbThreads POSIX threads.
//
// image: interleaved RGB input, handed to AoS_to_SoA for conversion.
// x, y : image dimensions.
//
// Each thread runs posixThread and receives its index (0..nbThreads-1) as the
// thread argument; threadJobSize is the number of pixels assigned per thread.
// NOTE(review): x*y/nbThreads truncates, so any remainder pixels are not
// assigned to a thread -- confirm whether callers guarantee divisibility.
void knnCompressionSIMDPosix(float *image, int x, int y){
    AoS_to_SoA(image,x,y);
    threadJobSize=x*y/nbThreads;

    pthread_t threads[nbThreads];
    pthread_barrier_init(&barrier, NULL, nbThreads);

    // Widen the index before the pointer cast so the integer-to-pointer
    // conversion is size-matched with the (long long) read in posixThread.
    for (int i=0;i<nbThreads;i++)
        pthread_create(&threads[i], NULL, posixThread, (void *)(long long) i);

    // Join in a separate loop so that all threads are running before the
    // first wait; joining inside the creation loop would serialise them.
    for (int i=0;i<nbThreads;i++)
        pthread_join(threads[i], NULL);

    // Release the barrier: it is initialised on every call, and
    // re-initialising a barrier that was never destroyed is undefined.
    pthread_barrier_destroy(&barrier);
}
Q3 THREAD’S JOB
19Benjamin MoraSwansea University
void * posixThread(void *arg){long long threadNumber=(long long) arg;int firstPixel=threadNumber*threadJobSize;int lastPixel=firstPixel+threadJobSize;float seedSums[3][N];float seedCounters[N];
//Seed initialization;float8 seedId[N];for (int seed=0;seed<N;seed++)seedId[seed]=set1((float &) seed);if (threadNumber==0)for(int j=0;j<3;j++)for(int i=0;i<N;i++)seeds[j][i]=(rand()+0.5f)/(RAND_MAX+1.f);
pthread_barrier_wait(&barrier);
Q3 THREAD’S JOB
20Benjamin MoraSwansea University
for (int k=0;k<knnIterations;k++){… Seed initalization is the samefloat8 *R=(float8 *) (alignedRed+firstPixel);float8 *G=(float8 *) (alignedGreen+firstPixel);float8 *B=(float8 *) (alignedBlue+firstPixel);for (int pixel=firstPixel;pixel<lastPixel;pixel+=8){… loop code does not change…R++;G++;B++;
}
Q3 MERGING RESULTS
21Benjamin MoraSwansea University
for (int seed=0;seed<N;seed++){temporaryResults[threadNumber][0][seed]=seedSums[0]
[seed];temporaryResults[threadNumber][1][seed]=seedSums[1]
[seed];temporaryResults[threadNumber][2][seed]=seedSums[2]
[seed];temporaryCounters[threadNumber][seed]=seedCounters[seed];
}pthread_barrier_wait(&barrier);
Q3 MERGING RESULTS
22Benjamin MoraSwansea University
if (threadNumber==0){for (int thread=1;thread<nbThreads;thread++)for (int seed=0;seed<N;seed++){
temporaryResults[0][0][seed]+=temporaryResults[thread][0][seed];temporaryResults[0][1][seed]+=temporaryResults[thread][1][seed];temporaryResults[0][2][seed]+=temporaryResults[thread][2][seed];temporaryCounters[0][seed]+=temporaryCounters[thread][seed];
}…
Q3 MERGING RESULTS
23Benjamin MoraSwansea University
for (int seed=0;seed<N;seed++){if (temporaryCounters[0][seed]>0.01){seeds[0][seed]=temporaryResults[0][0][seed]
/temporaryCounters[0][seed];seeds[1][seed]=temporaryResults[0][1]
[seed] /temporaryCounters[0]
[seed];seeds[2][seed]=temporaryResults[0][2]
[seed]/temporaryCounters[0][seed];
}}
} //end condition threadNumber==0pthread_barrier_wait(&barrier); //end of iteration, seeds have been updated!