/* File:     dot0.cu
 * Purpose:  Implement dot product on a gpu using cuda.  This version
 *           uses an implementation of atomicAdd based on the one in the
 *           CUDA C Programming Guide (p. 116)
 *
 * Compile:
 *    Linux: nvcc -arch=sm_12 -o dot0 dot0.cu
 *              -I /usr/local/NVIDIA_GPU_Computing_SDK/shared/inc
 *              -I /usr/local/NVIDIA_GPU_Computing_SDK/C/common/inc
 *    MacOS: nvcc -arch=sm_13 -o dot0 dot0.cu
 *              -I /Developer/GPU\ Computing/shared/inc
 *              -I /Developer/GPU\ Computing/C/common/inc
 *           (This is for hrn53603 and hrn53604)
 *    MacOS: nvcc -arch=sm_11 -o dot0 dot0.cu
 *              -I /Developer/GPU\ Computing/shared/inc
 *              -I /Developer/GPU\ Computing/C/common/inc
 *           (This is for hrn53601 and hrn53602)
 * Run:      ./dot0 <n> <threads_per_block> <blocks_per_grid>
 *              n is the vector length
 *              threads_per_block and blocks_per_grid give the kernel
 *              launch configuration
 *
 * Input:    None
 * Output:   Result of dot product of (1, 2, . . . , n) with
 *           (n, n-1, . . . , 2, 1) and correct solution, which
 *           is n(n+1)(n+2)/6
 *
 */
#include <stdio.h>
#include <stdlib.h>
#include "cuPrintf.cu"
/* NOTE(review): the name of this header was lost from the file.
 * shrUtils.h is presumed from the "shared/inc" include path in the
 * compile lines above -- confirm against the original. */
#include <shrUtils.h>
#include "cutil_inline.h"

/* Evaluate a call that returns cudaError_t; on failure print the file,
 * line and CUDA error string to stderr and terminate the program. */
#define CUDA_CHECK(call)                                              \
   do {                                                               \
      cudaError_t err_ = (call);                                      \
      if (err_ != cudaSuccess) {                                      \
         fprintf(stderr, "CUDA error at %s:%d: %s\n",                 \
               __FILE__, __LINE__, cudaGetErrorString(err_));         \
         exit(EXIT_FAILURE);                                          \
      }                                                               \
   } while (0)

/*-------------------------------------------------------------------
 * Function:    atomicAddf  (device code)
 * Purpose:     Atomically add the second argument to the first
 * In arg:      val
 * In/out arg:  address
 * Return val:  The value stored at address before the addition
 *
 * Note:        The compare-and-swap loop compares the integer bit
 *              patterns rather than the float values.  A float
 *              comparison would never terminate once the value at
 *              address is NaN, since NaN != NaN.
 */
__device__ float atomicAddf(float* address, float val) {
   unsigned int* address_as_ui = (unsigned int*) address;
   unsigned int old = *address_as_ui;
   unsigned int assumed;

   do {
      assumed = old;
      /* The swap only succeeds if nobody changed *address since it  */
      /* was read into assumed; otherwise retry with the new value   */
      old = atomicCAS(address_as_ui, assumed,
            (unsigned int) __float_as_int(
                  val + __int_as_float((int) assumed)));
   } while (assumed != old);

   return __int_as_float((int) old);
}  /* atomicAddf */


/*-------------------------------------------------------------------
 * Function:    Dot  (kernel)
 * Purpose:     Implement a dot product of floating point vectors
 *              using atomic operations for the global sum
 * In args:     x, y, n
 * In/out arg:  dot_p
 *
 * Notes:
 * 1.  *dot_p should be initialized to 0 by the calling function
 * 2.  The kernel uses only the x-dimension of the grid and blocks.
 *     Each thread starts at its global index and advances by the
 *     total number of threads in the grid, so every element is
 *     included even when the grid has fewer than n threads.
 * 3.  The cuPrintf calls are debugging output.  They read *dot_p
 *     without any synchronization with the other threads, so the
 *     values printed are only snapshots of the running sum.
 */
__global__ void Dot(float x[], float y[], int n, float* dot_p) {
   /* Initialized so that threads with no elements print 0 rather   */
   /* than an indeterminate value                                   */
   float tmp = 0.0f;
   int i = blockDim.x * blockIdx.x + threadIdx.x;
   int stride = gridDim.x * blockDim.x;
   int j;

   cuPrintf("before dot = %f\n", *dot_p);

   if (i < n) {
      /* Accumulate this thread's share locally, then do a single   */
      /* atomic update of the global sum                            */
      for (j = i; j < n; j += stride)
         tmp += x[j]*y[j];
      atomicAddf(dot_p, tmp);
   }

   cuPrintf("after dot = %f\n", *dot_p);
   cuPrintf("tmp = %f\n", tmp);
}  /* Dot */


/*-------------------------------------------------------------------
 * Host code
 *
 * Command line:  <n> <threads_per_block> <blocks_per_grid>, all of
 *                which must be positive integers
 * Exit status:   0 on success, EXIT_FAILURE on a usage error, an
 *                allocation failure, or a CUDA error
 */
int main(int argc, char* argv[]) {
   int n, i;
   float *x_h, *y_h, dot = 0;
   float *x_d, *y_d, *dot_d;
   int threads_per_block;
   int blocks_per_grid;
   size_t size;
   long long nn;

   if (argc != 4) {
      fprintf(stderr,
            "usage: %s <n> <threads_per_block> <blocks_per_grid>\n",
            argv[0]);
      exit(EXIT_FAILURE);
   }
   n = strtol(argv[1], NULL, 10);
   threads_per_block = strtol(argv[2], NULL, 10);
   blocks_per_grid = strtol(argv[3], NULL, 10);
   if (n <= 0 || threads_per_block <= 0 || blocks_per_grid <= 0) {
      fprintf(stderr,
            "%s: n, threads_per_block and blocks_per_grid must be positive\n",
            argv[0]);
      exit(EXIT_FAILURE);
   }
   size = (size_t) n * sizeof(float);

   CUDA_CHECK(cudaPrintfInit());

   /* Allocate input vectors in host memory */
   x_h = (float*) malloc(size);
   y_h = (float*) malloc(size);
   if (x_h == NULL || y_h == NULL) {
      fprintf(stderr, "%s: can't allocate host vectors\n", argv[0]);
      free(x_h);
      free(y_h);
      exit(EXIT_FAILURE);
   }

   /* Initialize input vectors */
   for (i = 0; i < n; i++) {
      x_h[i] = i+1;
      y_h[i] = n-i;
   }

   /* Allocate vectors in device memory */
   CUDA_CHECK(cudaMalloc(&x_d, size));
   CUDA_CHECK(cudaMalloc(&y_d, size));
   CUDA_CHECK(cudaMalloc(&dot_d, sizeof(float)));

   /* Copy vectors from host memory to device memory */
   CUDA_CHECK(cudaMemcpy(x_d, x_h, size, cudaMemcpyHostToDevice));
   CUDA_CHECK(cudaMemcpy(y_d, y_h, size, cudaMemcpyHostToDevice));
   /* dot is 0 here:  this is the initialization the kernel requires */
   CUDA_CHECK(cudaMemcpy(dot_d, &dot, sizeof(float),
         cudaMemcpyHostToDevice));

   /* Invoke kernel */
   Dot<<<blocks_per_grid, threads_per_block>>>(x_d, y_d, n, dot_d);
   /* A bad launch configuration is only reported by cudaGetLastError */
   CUDA_CHECK(cudaGetLastError());
   /* Wait for the kernel to finish; errors during execution surface */
   /* here.  Newer toolkits name this call cudaDeviceSynchronize.    */
   CUDA_CHECK(cudaThreadSynchronize());
   CUDA_CHECK(cudaPrintfDisplay(stdout, true));

   CUDA_CHECK(cudaMemcpy(&dot, dot_d, sizeof(float),
         cudaMemcpyDeviceToHost));

   printf("The dot product as computed by cuda is: %f\n", dot);

   /* Use 64-bit arithmetic:  the product n(n+1)(n+2) overflows a    */
   /* 32-bit int for n of about 1290 or more                         */
   nn = n;
   printf("The dot product as computed by the formula is: %lld\n",
         nn*(nn+1)*(nn+2)/6);

   cudaPrintfEnd();

   /* Free device memory */
   CUDA_CHECK(cudaFree(x_d));
   CUDA_CHECK(cudaFree(y_d));
   CUDA_CHECK(cudaFree(dot_d));

   /* Free host memory */
   free(x_h);
   free(y_h);

   return 0;
}  /* main */