2

I want to calculate the sum of all elements of an array in CUDA. I came up with this code. It compiles without any error. But the result is always zero. I've got the invalid device symbol from cudaMemcpyFromSymbol. I cannot use any libraries like Thrust or Cublas.

#define TRIALS_PER_THREAD 4096
#define NUM_BLOCKS 256
#define NUM_THREADS 256
double *dev;
__device__ volatile double pi_gpu = 0;

__global__ void ArraySum(double *array)

{
unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;
pi_gpu = pi_gpu + array[tid];
__syncthreads();
}

int main (int argc, char *argv[]) {
cudaMalloc((void **) &dev, NUM_BLOCKS * NUM_THREADS * sizeof(double));
    double pi_gpu_h;

ArraySum<<<NUM_BLOCKS, NUM_THREADS>>>(dev);
cudaDeviceSynchronize();
cudaError err = cudaMemcpyFromSymbol(&pi_gpu_h, &pi_gpu, sizeof(double), cudaMemcpyDeviceToHost);
if( cudaSuccess != err )
{
    fprintf( stderr, "cudaMemcpyFromSymbolfailed : %s\n", cudaGetErrorString( err ) );
    exit( -1 );
}
return pi_gpu_h; // this is always zero!!!
}
talonmies
  • 70,661
  • 34
  • 192
  • 269
Hamid_UMB
  • 317
  • 4
  • 16

1 Answers1

5

The symbol argument in the copy from symbol call is incorrect. It should look like this:

cudaMemcpyFromSymbol(&pi_gpu_h, pi_gpu, sizeof(double), 0, cudaMemcpyDeviceToHost)
Robert Crovella
  • 143,785
  • 11
  • 213
  • 257
talonmies
  • 70,661
  • 34
  • 192
  • 269