/* File: dot5.cu * Purpose: Implement dot product on a gpu using cuda. This version * uses a binary tree reduction in which we attempt to reduce * thread divergence. It uses shared memory to store * intermediate results and page-locked mapped memory to avoid * a copy back to the host. It also unwraps the loop in Dev_dot. * Assumes both threads_per_block and blocks_per_grid are powers * of 2. * * Compile: * Linux: nvcc -arch=sm_12 -o dot5 dot5.cu * -I /usr/local/NVIDIA_GPU_Computing_SDK/shared/inc * -I /usr/local/NVIDIA_GPU_Computing_SDK/C/common/inc * MacOS: nvcc -arch=sm_13 -o dot5 dot5.cu * -I /Developer/GPU\ Computing/shared/inc * -I /Developer/GPU\ Computing/C/common/inc * (This is for hrn53603 and hrn53604) * MacOS: nvcc -arch=sm_11 -o dot5 dot5.cu * -I /Developer/GPU\ Computing/shared/inc * -I /Developer/GPU\ Computing/C/common/inc * (This is for hrn53601 and hrn53602) * Run: ./dot5 * n is the vector length * * Input: None * Output: Result of dot product of a collection of random floats * * Notes: * 1. Use DEBUG compile flag for verbose output */ #include #include #include "cuPrintf.cu" #include #include "cutil_inline.h" #include "timer.h" #define MAX_BLOCK_SZ 512 /*------------------------------------------------------------------- * Function: Dev_dot (kernel) * Purpose: Implement a dot product of floating point vectors * using atomic operations for the global sum * In args: x, y, n * Out arg: z * */ __global__ void Dev_dot(float x[], float y[], float z[], int n) { /* Use tmp to store products of vector components in each block */ /* Can't use variable dimension here */ __shared__ float tmp[MAX_BLOCK_SZ]; int t = blockDim.x * blockIdx.x + threadIdx.x; int loc_t = threadIdx.x; int block_sz = blockDim.x; # ifdef DEBUG cuPrintf("t = %d, blockdim = %d, blockidx = %d, threadidx = %d\n", t, blockDim.x, blockIdx.x, threadIdx.x); # endif if (t < n) tmp[loc_t] = x[t]*y[t]; __syncthreads(); # ifdef DEBUG cuPrintf("After product tmp[%d] = %.1f\n", t, tmp[loc_t]); # endif if (block_sz >= 512) { if (loc_t < 256) tmp[loc_t] += tmp[loc_t + 256]; __syncthreads(); } if (block_sz >= 256) { if (loc_t < 128) tmp[loc_t] += tmp[loc_t + 128]; __syncthreads(); } if (block_sz >= 128) { if (loc_t < 64) tmp[loc_t] += tmp[loc_t + 64]; __syncthreads(); } /* From here on, only threads in the first warp are running */ /* So there's no more need for calls to __syncthreads */ if (loc_t < 32) { if (block_sz >= 64) tmp[loc_t] += tmp[loc_t + 32]; if (block_sz >= 32) tmp[loc_t] += tmp[loc_t + 16]; if (block_sz >= 16) tmp[loc_t] += tmp[loc_t + 8]; if (block_sz >= 8) tmp[loc_t] += tmp[loc_t + 4]; if (block_sz >= 4) tmp[loc_t] += tmp[loc_t + 2]; if (block_sz >= 2) tmp[loc_t] += tmp[loc_t + 1]; } # ifdef DEBUG cuPrintf("After tree tmp[%d] = %.1f\n", t, tmp[loc_t]); # endif /* Store the result from this cache block in z[blockIdx.x] */ if (threadIdx.x == 0) { z[blockIdx.x] = tmp[0]; } } /* Dev_dot */ /*------------------------------------------------------------------- * Host code */ void Get_args(int argc, char* argv[], int* n_p, int* threads_per_block_p, int* blocks_per_grid_p); void Setup(int n, int blocks, float** x_h_p, float** y_h_p, float** z_h_p, float** x_d_p, float** y_d_p, float** z_d_p); float Serial_dot(float x[], float y[], int n); void Free_mem(float* x_h, float* y_h, float* z_h); float Dot_wrapper(float x_d[], float y_d[], float z_d[], float z_h[], int n, int blocks, int threads); /*------------------------------------------------------------------- * main */ int main(int argc, char* argv[]) { int n, threads_per_block, blocks_per_grid; float *x_h, *y_h, *z_h, dot = 0; float *x_d, *y_d, *z_d; double start, finish; /* Only used on host */ cudaError_t rv; /* Our systems only have one gpu, hence device 0 */ cudaSetDevice(0); /* Map host memory for use by device. It looks like this */ /* needs to come before any other cuda calls */ rv = cudaSetDeviceFlags(cudaDeviceMapHost); if (rv != cudaSuccess) { fprintf(stderr, "Call to cudaSetDeviceFlags failed\n"); exit(-1); } # ifdef DEBUG else { fprintf(stderr, "Call to cudaSetDeviceFlags succeeded\n"); } printf("DEBUG is defined\n"); cudaPrintfInit(); # endif Get_args(argc, argv, &n, &threads_per_block, &blocks_per_grid); Setup(n, blocks_per_grid, &x_h, &y_h, &z_h, &x_d, &y_d, &z_d); GET_TIME(start); dot = Dot_wrapper(x_d, y_d, z_d, z_h, n, blocks_per_grid, threads_per_block); GET_TIME(finish); # ifdef DEBUG printf("Printing display . . .\n"); cudaPrintfDisplay(stdout, true); # endif printf("The dot product as computed by cuda is: %e\n", dot); printf("Elapsed time for cuda = %e seconds\n", finish-start); GET_TIME(start) dot = Serial_dot(x_h, y_h, n); GET_TIME(finish); printf("The dot product as computed by cpu is: %e\n", dot); printf("Elapsed time for cpu = %e seconds\n", finish-start); # ifdef DEBUG cudaPrintfEnd(); # endif Free_mem(x_h, y_h, z_h); return 0; } /* main */ /*------------------------------------------------------------------- * Function: Get_args * Purpose: Get and check command line args. If there's an error * quit. */ void Get_args(int argc, char* argv[], int* n_p, int* threads_per_block_p, int* blocks_per_grid_p) { if (argc != 4) { fprintf(stderr, "usage: %s \n", argv[0]); exit(0); } *n_p = strtol(argv[1], NULL, 10); *blocks_per_grid_p = strtol(argv[2], NULL, 10); *threads_per_block_p = strtol(argv[3], NULL, 10); } /* Get_args */ /*------------------------------------------------------------------- * Function: Setup * Purpose: Allocate and initialize host and device memory */ void Setup(int n, int blocks, float** x_h_p, float** y_h_p, float** z_h_p, float** x_d_p, float** y_d_p, float** z_d_p) { int i; cudaError_t rv; size_t size = n*sizeof(float); *x_h_p = (float*) malloc(size); *y_h_p = (float*) malloc(size); /* Allocate vector in host memory */ rv = cudaHostAlloc(z_h_p, blocks*sizeof(float), cudaHostAllocMapped); if (rv != cudaSuccess) { fprintf(stderr, "Call to cudaHostAlloc failed\n"); exit(-1); } # ifdef DEBUG else { fprintf(stderr, "Call to cudaHostAlloc succeeded\n"); printf("z_h = %p\n", *z_h_p); } # endif /* Initialize input vectors */ srandom(1); for (i = 0; i < n; i++) { # ifndef DEBUG (*x_h_p)[i] = random()/((double) RAND_MAX); (*y_h_p)[i] = random()/((double) RAND_MAX); # else (*x_h_p)[i] = i+1; (*y_h_p)[i] = n-i; # endif } # ifdef DEBUG for (i = 0; i < blocks; i++) (*z_h_p)[i] = -i; # endif /* Allocate device memory */ cudaMalloc(x_d_p, size); cudaMalloc(y_d_p, size); /* Get pointer valid on device */ cudaHostGetDevicePointer(z_d_p, *z_h_p, 0); cudaMemcpy(*x_d_p, *x_h_p, size, cudaMemcpyHostToDevice); cudaMemcpy(*y_d_p, *y_h_p, size, cudaMemcpyHostToDevice); } /* Setup */ /*------------------------------------------------------------------- * Function: Dot_wrapper * Purpose: CPU wrapper function for GPU dot product * Note: Assumes x_d, y_d have already been * allocated and initialized on device. Also * assumes z_d has been allocated. */ float Dot_wrapper(float x_d[], float y_d[], float z_d[], float z_h[], int n, int blocks, int threads) { int i; float dot = 0.0; # ifdef DEBUG printf("Before kernel z_h = \n"); for (i = 0; i < blocks; i++) printf("%.1f ", z_h[i]); printf("\n"); # endif /* Invoke kernel */ Dev_dot<<>>(x_d, y_d, z_d, n); cudaThreadSynchronize(); # ifdef DEBUG printf("z_h = "); for (i = 0; i < blocks; i++) printf("%.1f ", z_h[i]); printf("\n"); # endif /* Note that we don't need to copy z_d back to host */ for (i = 0; i < blocks; i++) dot += z_h[i]; return dot; } /* Dot_wrapper */ /*------------------------------------------------------------------- * Function: Serial_dot * Purpose: Compute a dot product on the cpu */ float Serial_dot(float x[], float y[], int n) { int i; float dot = 0; for (i = 0; i < n; i++) dot += x[i]*y[i]; return dot; } /* Serial_dot */ /*------------------------------------------------------------------- * Function: Free_mem * Purpose: Free host and device memory */ void Free_mem(float* x_h, float* y_h, float* z_h) { /* Free host memory */ free(x_h); free(y_h); cudaFreeHost(z_h); } /* Free_mem */