• • •

Anuncio
Analysis and Design of Parallel Algorithms
M. en C. Sandra Luz Morales Güitrón.
Proyecto para Extraordinario
Nombre del alumno(a):_____________________________________________



Página 1|6
o
o
o
o
o
Página 2|6
#include <time.h>
#include <cuda.h>
#include <stdio.h>
#define STOP 0
#define START 1
#define BLOCK_X 16
#define BLOCK_Y 16
extern "C" void chrono (int kind, float *time);
/*
 * kconvol: 5-point averaging stencil on an n x n matrix of floats.
 *
 * Every output cell is the mean of the input cell and its left, right, upper
 * and lower neighbours; cells on the outer border of the matrix are copied
 * through unchanged.
 *
 * Launch layout: a 2D grid of 2D blocks. The shared tile is sized from the
 * BLOCK_X / BLOCK_Y macros and the halo tests compare against them, so the
 * kernel must be launched with blockDim == (BLOCK_X, BLOCK_Y).
 *
 * Shared memory: one static tile of (BLOCK_X+2)*(BLOCK_Y+2) floats.
 *
 * gpu_a : input matrix, rows separated by `pitch` elements
 * gpu_b : output matrix, indexed with the same `pitch`
 * pitch : row stride of both matrices, in floats (not bytes)
 * n     : number of rows and columns
 */
__global__ void kconvol (float *gpu_a, float *gpu_b, int pitch, int n) {
    // Width of one row of the shared tile: BLOCK_X interior cells plus one
    // halo cell on each side.
    const int tile_w = BLOCK_X + 2;
    __shared__ float la[(BLOCK_X+2)*(BLOCK_Y+2)];

    // A thread has two sets of coordinates:
    //   (ig, jg) in the global array
    //   (il, jl) in the shared tile, shifted by one to leave room for the halo
    const int ig = blockDim.x*blockIdx.x + threadIdx.x;
    const int jg = blockDim.y*blockIdx.y + threadIdx.y;
    const int il = threadIdx.x + 1;
    const int jl = threadIdx.y + 1;
    const int ll = il + jl*tile_w;

    // Offsets are computed in size_t so jg*pitch cannot overflow int.
    const size_t row = (size_t)pitch;
    const size_t lg  = (size_t)jg*row + (size_t)ig;

    // Threads of the last blocks may fall outside the matrix. They must not
    // touch global memory, but they still have to reach the barrier below.
    const bool inside = (ig < n) && (jg < n);

    if (inside) {
        // Each thread stages its own cell into the interior of the tile.
        la[ll] = gpu_a[lg];
        // Threads on an edge of the block additionally fetch the adjacent
        // halo cell, provided that neighbour exists in the global matrix.
        if ((il == 1) && (ig > 0))          // left halo
            la[ll-1] = gpu_a[lg-1];
        if ((jl == 1) && (jg > 0))          // upper halo
            la[ll-tile_w] = gpu_a[lg-row];
        if ((il == BLOCK_X) && (ig < n-1))  // right halo
            la[ll+1] = gpu_a[lg+1];
        if ((jl == BLOCK_Y) && (jg < n-1))  // lower halo
            la[ll+tile_w] = gpu_a[lg+row];
    }

    // All tile writes must be visible before any thread reads a neighbour.
    __syncthreads ();

    if (!inside) return;

    float out;
    if ((ig == 0) || (jg == 0) || (ig == n-1) || (jg == n-1)) {
        // Border of the matrix: copied through unchanged.
        out = la[ll];
    }
    else {
        out = (1.f/5.f)*(
              la[ll-tile_w]
            + la[ll-1]
            + la[ll]
            + la[ll+1]
            + la[ll+tile_w]);
    }
    gpu_b[lg] = out;
}
/* Prints `prefix` followed by the CUDA error string and exits if `err` is not cudaSuccess. */
static void gpu_convol_check (cudaError_t err, const char *prefix) {
    if (err != cudaSuccess) {
        printf ("%s%s\n", prefix, cudaGetErrorString (err));
        exit (1);
    }
}

/*
 * gpu_convol: runs kconvol on the device for an n x n host matrix.
 *
 * a : host input matrix, n*n floats, rows stored contiguously
 * b : host output matrix, n*n floats, rows stored contiguously
 * n : number of rows and columns; nothing is done when n <= 0
 *
 * Allocates two pitched device buffers, uploads `a`, launches kconvol with
 * BLOCK_X x BLOCK_Y blocks, prints the time reported by chrono() around the
 * launch and its synchronisation, downloads the result into `b`, and frees
 * the device buffers. Any CUDA error is printed and terminates the process
 * with exit(1).
 */
extern "C" void gpu_convol (float *a, float *b, int n) {
    float *gpu_a = NULL;
    float *gpu_b = NULL;
    size_t pitch_a = 0;
    size_t pitch_b = 0;
    float time;

    if (n <= 0) return;

    const size_t row_bytes = (size_t)n * sizeof(float);
    const size_t rows = (size_t)n;

    gpu_convol_check (cudaMallocPitch ((void **)&gpu_a, &pitch_a, row_bytes, rows),
                      "Error allocating gpu_a: ");
    gpu_convol_check (cudaMallocPitch ((void **)&gpu_b, &pitch_b, row_bytes, rows),
                      "Error allocating gpu_b: ");

    // kconvol takes a single row stride for both matrices, so the two
    // allocations must have been given the same pitch.
    if (pitch_a != pitch_b) {
        printf ("Error: gpu_a and gpu_b have different pitches (%zu vs %zu)\n",
                pitch_a, pitch_b);
        exit (1);
    }

    dim3 block (BLOCK_X, BLOCK_Y);
    dim3 grid;
    // Ceiling division: enough blocks to cover all n columns / rows.
    grid.x = (n-1)/BLOCK_X+1;
    grid.y = (n-1)/BLOCK_Y+1;

    gpu_convol_check (cudaMemcpy2D (gpu_a, pitch_a, a, row_bytes, row_bytes, rows,
                                    cudaMemcpyHostToDevice),
                      "Error copying a to gpu_a: ");

    chrono (START, &time);
    // The kernel expects the pitch in elements, not bytes.
    kconvol <<<grid, block>>> (gpu_a, gpu_b, (int)(pitch_a/sizeof(float)), n);
    cudaError_t launch_err = cudaGetLastError ();      // invalid launch configuration
    cudaError_t run_err    = cudaDeviceSynchronize (); // faults during execution
    chrono (STOP, &time);
    printf ("Convolution took %f sec. on GPU\n", time);

    // Check the kernel before copying back, so a failed run is reported as
    // such rather than handing unwritten data to the caller.
    gpu_convol_check (launch_err, "");
    gpu_convol_check (run_err, "");

    gpu_convol_check (cudaMemcpy2D (b, row_bytes, gpu_b, pitch_b, row_bytes, rows,
                                    cudaMemcpyDeviceToHost),
                      "Error copying gpu_b to b: ");

    gpu_convol_check (cudaFree (gpu_a), "Error freeing gpu_a: ");
    gpu_convol_check (cudaFree (gpu_b), "Error freeing gpu_b: ");
}
Página 4|6
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#define STOP 0
#define START 1
#define N 3000
/* Size of square matrix */
void gpu_convol (float *, float *, int);
/*
 * (Over-)simple chronometer based on clock().
 *
 * kind == START : zeroes *time and remembers the current clock() reading.
 * kind == STOP  : stores in *time the seconds elapsed since the last START,
 *                 i.e. the clock() difference divided by CLOCKS_PER_SEC.
 * Any other value of kind leaves *time untouched.
 *
 * The start reading lives in a single static variable, so only one
 * measurement can be in flight at a time.
 */
void chrono (int kind, float *time) {
    static clock_t started_at;

    switch (kind) {
    case START:
        *time = 0.0;
        started_at = clock();
        break;
    case STOP: {
        float ticks = (float)(clock() - started_at);
        *time = ticks / ((float)CLOCKS_PER_SEC);
        break;
    }
    default:
        break;
    }
}
/*
 * init: fills the n x n matrix m with values produced by drand48().
 *
 * m : destination buffer holding at least n*n floats
 * n : number of rows and columns; nothing is written when n <= 0
 */
void init (float *m, int n) {
    size_t i, count;

    if (n <= 0)
        return;

    /* Computed in size_t: n*n in int overflows once n exceeds 46340. */
    count = (size_t)n * (size_t)n;
    for (i = 0; i < count; i++)
        m[i] = drand48();
}
/*
 * compare: counts the elements of two n x n matrices that differ by more than
 * a fixed absolute tolerance and prints the verdict.
 *
 * a, b : matrices to compare, n*n floats each; neither is modified
 * n    : number of rows and columns; n <= 0 compares nothing
 *
 * Prints "PASSED" when no element differs by more than the tolerance,
 * otherwise "FAILED: <count> errors".
 */
void compare (float *a, float *b, int n) {
    /* Play with this value to get an idea of the accuracy of your platform */
    const double tolerance = 1e-7;
    /* Computed in size_t: n*n in int overflows once n exceeds 46340. */
    const size_t count = (n > 0) ? (size_t)n * (size_t)n : 0;
    size_t i;
    int errors = 0;

    for (i = 0; i < count; i++) {
        if (fabsf(a[i]-b[i]) > tolerance) {
            errors++;
        }
    }
    if (errors==0)
        printf ("PASSED\n");
    else
        printf ("FAILED: %d errors\n", errors);
}
/*
 * cpu_convol: CPU reference for the 5-point averaging stencil.
 *
 * a : input matrix, n*n floats, stored row by row (element (i,j) at i*n+j)
 * b : output matrix, n*n floats, same layout
 * n : number of rows and columns
 *
 * Cells on the outer border are copied from a to b unchanged. Every other
 * cell becomes one fifth of the sum of the input cell and its four direct
 * neighbours (previous row, left, right, next row).
 */
void cpu_convol (float *a, float *b, int n) {
    int i, j;
    size_t l;
    const size_t stride = (size_t)n;

    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            /* 1D index, computed in size_t so i*n cannot overflow int */
            l = (size_t)i*stride + (size_t)j;
            if ((i == 0) || (j == 0) || (i == n-1) || (j == n-1))
                b[l] = a[l];
            else {
                /* l-n / l+n are the same column in the previous / next row;
                   l-1 / l+1 are the left / right neighbours in this row. */
                b[l] = (1./5.)*(
                       a[l-stride]
                     + a[l-1]
                     + a[l]
                     + a[l+1]
                     + a[l+stride]);
            }
        }
    }
}
/*
 * Driver: builds an N x N input matrix, runs the convolution on the GPU and
 * on the CPU, prints the CPU time, and compares the two results.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE when the host matrices cannot be
 * allocated.
 */
int main (void) {
    float *a, *b, *c, time;
    int n = N;
    /* Byte count computed in size_t so n*n cannot overflow int. */
    const size_t bytes = (size_t)n * (size_t)n * sizeof(float);

    a = malloc (bytes);   /* input                */
    b = malloc (bytes);   /* CPU reference output */
    c = malloc (bytes);   /* GPU output           */
    if (a == NULL || b == NULL || c == NULL) {
        fprintf (stderr, "Error allocating host matrices (%zu bytes each)\n", bytes);
        free (a);
        free (b);
        free (c);
        return EXIT_FAILURE;
    }

    init (a, n);
    gpu_convol (a, c, n);

    chrono (START, &time);
    cpu_convol (a, b, n);
    chrono (STOP, &time);
    printf ("Convolution took %f sec. on CPU\n", time);

    compare (b, c, n);

    free (a);
    free (b);
    free (c);
    return EXIT_SUCCESS;
}
Página 6|6
Descargar