/*
 * Analysis and Design of Parallel Algorithms
 * M. en C. Sandra Luz Morales Güitrón
 * Make-up exam project ("Proyecto para Extraordinario")
 * Student name: _____________________________________________
 *
 * 5-point averaging convolution of an n x n float matrix, computed once on
 * the GPU with a shared-memory tiled kernel and once on the CPU, followed by
 * an element-wise comparison of both results.
 *
 * The listing has two parts:
 *   Part 1 - CUDA code  (kconvol, gpu_convol)            -> built with nvcc
 *   Part 2 - host driver (chrono, init, compare,
 *            cpu_convol, main)                           -> plain C
 * Part 2 only uses constructs valid in both C and C++, so the listing can be
 * kept as a single .cu file or split into a .cu file and a .c file.
 *
 * The placeholders left open in the handout (il, jl, the right/bottom halo
 * loads and the stencil neighbours) are filled in below.
 */

/* ======================================================================
 * Part 1 - CUDA code
 * ====================================================================== */

#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>

#define STOP 0
#define START 1

/* Thread-block dimensions; the shared tile adds a one-cell halo per side. */
#define BLOCK_X 16
#define BLOCK_Y 16

extern "C" void chrono (int kind, float *time);

/*
 * kconvol - 5-point averaging stencil on a pitched n x n float matrix.
 *
 * Launch layout : 2D grid of 2D blocks; each block must be exactly
 *                 BLOCK_X x BLOCK_Y threads (the shared tile is sized from
 *                 these macros), one thread per matrix element.
 * Shared memory : one static tile of (BLOCK_X+2)*(BLOCK_Y+2) floats.
 *
 * gpu_a : input matrix (device memory)
 * gpu_b : output matrix (device memory), same row pitch as gpu_a
 * pitch : row pitch of both matrices, in floats (not bytes)
 * n     : matrix dimension
 *
 * Border elements (first/last row or column) are copied unchanged; every
 * other element becomes the mean of itself and its four direct neighbours.
 */
__global__ void kconvol (float *gpu_a, float *gpu_b, int pitch, int n)
{
    __shared__ float la[(BLOCK_X+2)*(BLOCK_Y+2)];

    /* A thread has two sets of coordinates:
     *   (ig, jg) in the global array
     *   (il, jl) in the shared tile of size (BLOCK_X+2)*(BLOCK_Y+2) */
    const int ig = blockDim.x*blockIdx.x + threadIdx.x;
    const int jg = blockDim.y*blockIdx.y + threadIdx.y;
    /* size_t so that jg*pitch cannot overflow int for large matrices. */
    const size_t lg = (size_t)jg*(size_t)pitch + (size_t)ig;

    /* Shifted by one so that index 0 and index BLOCK_X+1 / BLOCK_Y+1 are
     * the halo cells of the tile. */
    const int il = (int)threadIdx.x + 1;
    const int jl = (int)threadIdx.y + 1;
    const int ll = il + jl*(BLOCK_X+2);

    /* Threads of a partial block that fall outside the matrix must not touch
     * global memory, but they still have to reach the barrier below. */
    const bool inside = (ig < n) && (jg < n);

    if (inside) {
        /* Every thread loads its own element into the tile interior. */
        la[ll] = gpu_a[lg];

        /* Threads on a tile edge additionally load the neighbouring halo
         * cell, provided that neighbour exists in the matrix. */
        if ((il == 1) && (ig > 0))              /* left halo   */
            la[ll-1] = gpu_a[lg-1];
        if ((jl == 1) && (jg > 0))              /* top halo    */
            la[ll-(BLOCK_X+2)] = gpu_a[lg-pitch];
        if ((il == BLOCK_X) && (ig < n-1))      /* right halo  */
            la[ll+1] = gpu_a[lg+1];
        if ((jl == BLOCK_Y) && (jg < n-1))      /* bottom halo */
            la[ll+(BLOCK_X+2)] = gpu_a[lg+pitch];
    }

    /* The whole tile must be written before any thread reads a neighbour. */
    __syncthreads ();

    if (!inside)
        return;

    float out;
    if ((ig == 0) || (jg == 0) || (ig == n-1) || (jg == n-1)) {
        /* Matrix border: copied through unchanged. */
        out = la[ll];
    } else {
        /* Summation order: top, left, centre, right, bottom. */
        out = (1.f/5.f)*(la[ll-(BLOCK_X+2)] +
                         la[ll-1] + la[ll] + la[ll+1] +
                         la[ll+(BLOCK_X+2)]);
    }
    gpu_b[lg] = out;
}

/* Prints "<what>: <CUDA error string>" and terminates if err is an error. */
static void check_cuda (cudaError_t err, const char *what)
{
    if (err != cudaSuccess) {
        fprintf (stderr, "%s: %s\n", what, cudaGetErrorString (err));
        exit (1);
    }
}

/*
 * gpu_convol - run the convolution of the n x n host matrix a on the GPU and
 * store the result in the n x n host matrix b.
 *
 * a, b : densely packed row-major host matrices of n*n floats
 * n    : matrix dimension; nothing is done when n <= 0
 *
 * Prints the kernel time measured with chrono(). Any CUDA error is reported
 * on stderr and terminates the process with exit status 1.
 */
extern "C" void gpu_convol (float *a, float *b, int n)
{
    float *gpu_a = NULL;
    float *gpu_b = NULL;
    size_t pitch_a = 0;
    size_t pitch_b = 0;
    float time;

    if (n <= 0)
        return;

    const size_t row_bytes = (size_t)n*sizeof(float);

    check_cuda (cudaMallocPitch ((void **)&gpu_a, &pitch_a, row_bytes, n),
                "Error allocating gpu_a");
    check_cuda (cudaMallocPitch ((void **)&gpu_b, &pitch_b, row_bytes, n),
                "Error allocating gpu_b");

    /* The kernel takes one pitch for both matrices, so they have to agree. */
    if (pitch_a != pitch_b) {
        fprintf (stderr, "Error: gpu_a and gpu_b were allocated with different pitches\n");
        exit (1);
    }

    dim3 block (BLOCK_X, BLOCK_Y);
    dim3 grid;
    /* Ceiling division: enough blocks to cover all n rows and columns. */
    grid.x = (n-1)/BLOCK_X+1;
    grid.y = (n-1)/BLOCK_Y+1;

    check_cuda (cudaMemcpy2D (gpu_a, pitch_a, a, row_bytes, row_bytes, n,
                              cudaMemcpyHostToDevice),
                "Error copying a to gpu_a");

    chrono (START, &time);
    /* The kernel expects the pitch in floats, cudaMallocPitch returns bytes. */
    kconvol <<<grid, block>>> (gpu_a, gpu_b, (int)(pitch_a/sizeof(float)), n);
    cudaError_t launch_err = cudaGetLastError ();
    /* The launch is asynchronous: wait for completion before stopping the clock. */
    cudaError_t sync_err = cudaDeviceSynchronize ();
    chrono (STOP, &time);

    check_cuda (launch_err, "Error launching kconvol");
    check_cuda (sync_err, "Error running kconvol");
    printf ("Convolution took %f sec. on GPU\n", time);

    check_cuda (cudaMemcpy2D (b, row_bytes, gpu_b, pitch_b, row_bytes, n,
                              cudaMemcpyDeviceToHost),
                "Error copying gpu_b to b");

    cudaFree (gpu_a);
    cudaFree (gpu_b);
}

/* ======================================================================
 * Part 2 - host driver
 * ====================================================================== */

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

#define STOP 0
#define START 1
#define N 3000 /* Size of square matrix */

/* Largest accepted |cpu - gpu| difference per element.
 * Play with this value to get an idea of the accuracy of your platform. */
#define TOLERANCE 1e-7

void gpu_convol (float *, float *, int);

/*
 * (Over-)simple chronometer function based on clock().
 *
 * kind == START : resets *time to 0 and records the current clock() value.
 * kind == STOP  : stores in *time the seconds elapsed since the last START.
 * Any other value of kind does nothing.
 */
void chrono (int kind, float *time)
{
    static clock_t counts;

    if (kind == START) {
        *time = 0.0;
        counts = clock();
        return;
    }
    if (kind == STOP) {
        *time = ((float)(clock()-counts))/((float)CLOCKS_PER_SEC);
    }
}

/* Fills the n x n matrix m with values returned by drand48(). */
void init (float *m, int n)
{
    size_t i;
    const size_t count = (size_t)n*(size_t)n;

    for (i = 0; i < count; i++)
        m[i] = drand48();
}

/*
 * Compares the n x n matrices a and b element by element and prints PASSED
 * when no pair differs by more than TOLERANCE, otherwise the number of
 * differing elements.
 */
void compare (float *a, float *b, int n)
{
    size_t i;
    const size_t count = (size_t)n*(size_t)n;
    int errors = 0;

    for (i = 0; i < count; i++) {
        if (fabsf(a[i]-b[i]) > TOLERANCE)
            errors++;
    }
    if (errors == 0)
        printf ("PASSED\n");
    else
        printf ("FAILED: %d errors\n", errors);
}

/*
 * CPU reference of the convolution: b = 5-point average of a.
 *
 * a, b : row-major n x n matrices; element (row i, column j) is at i*n+j.
 * Border elements are copied unchanged. The average is evaluated in double
 * precision (the 1./5. factor is a double) and rounded when stored in b.
 */
void cpu_convol (float *a, float *b, int n)
{
    int i, j;
    size_t l;

    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            l = (size_t)i*(size_t)n + (size_t)j; /* 1D index */
            if ((i == 0) || (j == 0) || (i == n-1) || (j == n-1)) {
                b[l] = a[l];
            } else {
                /* l-n / l+n: same column, previous / next row.
                 * l-1 / l+1: same row, previous / next column. */
                b[l] = (1./5.)*(a[l-n] +
                                a[l-1] + a[l] + a[l+1] +
                                a[l+n]);
            }
        }
    }
}

/*
 * Runs the convolution of a random N x N matrix on the GPU and on the CPU,
 * prints both timings and compares the two results.
 */
int main (void)
{
    float *a, *b, *c, time;
    int n = N;
    const size_t bytes = (size_t)n*(size_t)n*sizeof(float);

    /* Explicit casts keep this valid in both C and C++. */
    a = (float *)malloc (bytes);
    b = (float *)malloc (bytes);
    c = (float *)malloc (bytes);
    if ((a == NULL) || (b == NULL) || (c == NULL)) {
        fprintf (stderr, "Error allocating host matrices\n");
        free (a);
        free (b);
        free (c);
        return EXIT_FAILURE;
    }

    init (a, n);

    gpu_convol (a, c, n);

    chrono (START, &time);
    cpu_convol (a, b, n);
    chrono (STOP, &time);
    printf ("Convolution took %f sec. on CPU\n", time);

    compare (b, c, n);

    free (a);
    free (b);
    free (c);
    return EXIT_SUCCESS;
}