#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <curand.h>

#include <cuda_runtime.h>
//#include <device_functions.h>

/*
 * Compatibility layer between the legacy "cutil" SDK helpers and the
 * helper_cuda.h / helper_functions.h headers.
 *
 * CUDA >= 7: the cut* / cutil* names used by the host code are mapped onto
 *            their sdk* / cuda* counterparts.
 * older    : the original cutil_inline.h is included and supplies them.
 *
 * In the CUDA >= 7 branch:
 *   - cutilSafeCall(call) evaluates a call returning cudaError_t and, on
 *     failure, prints file/line plus the CUDA error string and exits.
 *     (It used to expand to a bare `call`, silently discarding every error.)
 *     It must be used as a statement, not inside an expression.
 *   - cutilCheckError(call) is still a plain pass-through; the result of the
 *     wrapped timer helper is not inspected.
 *   - cutResetTimer/cutStartTimer/cutStopTimer/cutGetTimerValue take the timer
 *     handle by value and pass its address to the sdk* function, while
 *     cutCreateTimer forwards its argument unchanged.
 */
#if ( __CUDACC_VER_MAJOR__ >=7 )
    #include <helper_cuda.h>
    #include <helper_functions.h>
//    #define cutilCheckError(call) checkCudaErrors(call)
    #define cudaStopWatchInterface StopWatchInterface *
    #define cutilCheckError(call) call
    #define cutCreateTimer(x) sdkCreateTimer(x)
    #define cutResetTimer(x) sdkResetTimer(&x)
    #define cutStartTimer(x) sdkStartTimer(&x)
    #define cutStopTimer(x) sdkStopTimer(&x)
    #define cutGetTimerValue(x) sdkGetTimerValue(&x)
    #define cutilDeviceSynchronize cudaDeviceSynchronize
    #define cudaThreadExit  cudaDeviceReset
    /* Checking wrapper; kept under its original name for any existing users. */
    #define cutiliTESTSafeCall(call) \
    do { \
        cudaError_t cutilSafeCallErr_ = (call); \
        if (cudaSuccess != cutilSafeCallErr_) { \
           fprintf (stderr, "Cuda error in file '%s' in line %i : %s.\n",  \
                 __FILE__, __LINE__, cudaGetErrorString(cutilSafeCallErr_) ); \
           exit(EXIT_FAILURE); \
       } \
    } while (0)
    /* Route the name the host code actually uses through the checker. */
    #define cutilSafeCall(call) cutiliTESTSafeCall(call)
#else
    #include <cutil_inline.h>
    #include <sm_11_atomic_functions.h>
    #define cudaStopWatchInterface uint
#endif
// Short aliases for unsigned types. `uint` is also what the
// cudaStopWatchInterface macro expands to in the pre-CUDA-7 branch above.
typedef unsigned char uchar;
typedef unsigned int  uint;

// CPU reference routine, declared with C linkage; main() calls it on the host
// copy of the data and times it against the GPU kernels.
// NOTE(review): presumably defined in glMax_gold.cpp -- confirm.
extern "C"
unsigned int glMaxCPU(unsigned int *x, unsigned int N);
// The kernel and reference sources are textually included, so everything is
// compiled as a single translation unit.
// NOTE(review): glMaxGPU0 / glMaxGPU1 (launched from main) are presumably
// defined in glMax_kernel.cu -- confirm.
#include "glMax_kernel.cu"
#include "glMax_gold.cpp"

/*
 * Aborts the program with a diagnostic when a cuRAND call did not succeed.
 *   status - value returned by the cuRAND call
 *   what   - short description of the call, used in the error message
 */
static void checkCurandStatus(curandStatus_t status, const char *what)
{
	if (status != CURAND_STATUS_SUCCESS)
	{
		fprintf(stderr, "cuRAND error %d in %s\n", (int)status, what);
		exit(EXIT_FAILURE);
	}
}

/*
 * Benchmark driver.
 *
 * Usage: <program> <N> <BLOCK_DIM>
 *   N         - number of unsigned int elements generated on the device
 *   BLOCK_DIM - threads per block used for both kernel launches
 *
 * Steps:
 *   1. Fill a device buffer with N pseudo-random 32-bit values (cuRAND, fixed
 *      seed) and copy it to the host.
 *   2. Run glMaxGPU0 and glMaxGPU1 on the device buffer, timing each one and
 *      reading back the single unsigned int each kernel leaves in d_glmx.
 *   3. Run glMaxCPU on the host copy, time it, and print results and the
 *      CPU/GPU time ratios.
 * NOTE(review): going by the names, all three routines presumably compute the
 * global maximum of the array -- confirm against the kernel/gold sources.
 *
 * Returns 0 on success; exits with EXIT_FAILURE on invalid arguments or on a
 * failed host allocation / cuRAND call.
 */
int main(int argc, char *argv[])
{
	unsigned int *h_x, *d_x, *d_glmx;
	curandGenerator_t gen;
	int N, BLOCK_DIM, GRID_SZ;
	size_t DATA_SZ;
	unsigned int glmxCPU;
	unsigned int glmx0, glmx1;

	if (argc < 3)
	{
		printf("Usage: %s <N> <BLOCK_DIM> \n", argv[0]);
		exit(0);
	}
	N = atoi(argv[1]);
	BLOCK_DIM = atoi(argv[2]);

	// atoi() yields 0 for non-numeric input; a zero/negative value would give a
	// division by zero below or an invalid launch configuration.
	if (N <= 0 || BLOCK_DIM <= 0)
	{
		fprintf(stderr, "Error: <N> and <BLOCK_DIM> must be positive integers.\n");
		exit(EXIT_FAILURE);
	}

	// Integer ceil-division: exact for every N (float division loses precision
	// above 2^24) and cannot overflow because N >= 1.
	GRID_SZ = (N - 1) / BLOCK_DIM + 1;
	// The buffers hold unsigned ints; size_t avoids int overflow for large N.
	DATA_SZ = (size_t)N * sizeof(unsigned int);
	cudaStopWatchInterface hTimer = 0;

	// Memory allocation on CPU
	h_x = (unsigned int *)malloc(DATA_SZ);
	if (h_x == NULL)
	{
		fprintf(stderr, "Error: host allocation of %lu bytes failed.\n",
		        (unsigned long)DATA_SZ);
		exit(EXIT_FAILURE);
	}

	// Memory allocation on GPU
	cutilSafeCall( cudaMalloc((void **)&d_x, DATA_SZ) );
	cutilSafeCall( cudaMalloc((void **)&d_glmx, sizeof(unsigned int)) );

	// Create pseudo-random number generator
	checkCurandStatus( curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT),
	                   "curandCreateGenerator" );
	// Set seed (fixed, so runs are reproducible)
	checkCurandStatus( curandSetPseudoRandomGeneratorSeed(gen, 1234ULL),
	                   "curandSetPseudoRandomGeneratorSeed" );

	// Generate x on device
	checkCurandStatus( curandGenerate(gen, d_x, N), "curandGenerate" );

	// Copy data from device to host
	cutilSafeCall( cudaMemcpy(h_x, d_x, DATA_SZ, cudaMemcpyDeviceToHost) );
/* DEBUG
for (int i=0; i<N; ++i) printf("%u ", h_x[i]);
printf("\n");
*/

	cutilCheckError( cutCreateTimer(&hTimer) );

	// Launch Kernel 0
	cutilSafeCall( cudaMemset(d_glmx, 0, sizeof(unsigned int)) );
	// Make sure the memset is finished so it is not part of the measurement.
	cutilSafeCall( cutilDeviceSynchronize() );
	cutilCheckError( cutResetTimer(hTimer) );
	cutilCheckError( cutStartTimer(hTimer) );
	glMaxGPU0<<<GRID_SZ, BLOCK_DIM, BLOCK_DIM*sizeof(unsigned int)>>>(d_x, N, d_glmx);
	// Launch-configuration errors are only reported through cudaGetLastError().
	cutilSafeCall( cudaGetLastError() );
	// Kernel launches are asynchronous: wait for completion before stopping the
	// timer, otherwise only the launch overhead is measured.
	cutilSafeCall( cutilDeviceSynchronize() );
	cutilCheckError( cutStopTimer(hTimer) );
	float timer0 = cutGetTimerValue(hTimer);
	cutilSafeCall( cudaMemcpy(&glmx0, d_glmx, sizeof(unsigned int), cudaMemcpyDeviceToHost) );

	// Launch Kernel 1
	cutilSafeCall( cudaMemset(d_glmx, 0, sizeof(unsigned int)) );
	cutilSafeCall( cutilDeviceSynchronize() );
	cutilCheckError( cutResetTimer(hTimer) );
	cutilCheckError( cutStartTimer(hTimer) );
	glMaxGPU1<<<GRID_SZ, BLOCK_DIM, BLOCK_DIM*sizeof(unsigned int)>>>(d_x, N, BLOCK_DIM, d_glmx);
	cutilSafeCall( cudaGetLastError() );
	cutilSafeCall( cutilDeviceSynchronize() );
	cutilCheckError( cutStopTimer(hTimer) );
	float timer1 = cutGetTimerValue(hTimer);
	cutilSafeCall( cudaMemcpy(&glmx1, d_glmx, sizeof(unsigned int), cudaMemcpyDeviceToHost) );

	// print info msgs
	printf("GRID_SZ= %d BLOCK_DIM= %d\n", GRID_SZ, BLOCK_DIM);
	printf("Kernel-0 GPU time: %f msecs.\n", timer0);
	printf("Kernel-0 GPU Result = %u\n",glmx0);
	printf("Kernel-1 GPU time: %f msecs.\n", timer1);
	printf("Kernel-1 GPU Result = %u\n",glmx1);

	// Launch on CPU
	cutilCheckError( cutResetTimer(hTimer) );
	cutilCheckError( cutStartTimer(hTimer) );
	glmxCPU = glMaxCPU(h_x, N);
	cutilCheckError( cutStopTimer(hTimer) );
	float timerCPU = cutGetTimerValue(hTimer);

	// print info msgs
	printf("CPU time: %f msecs.\n", timerCPU);
	printf("CPU Result = %u\n",glmxCPU);
	printf("GPU/CPU speedup (kernel0) = %f\n", timerCPU/timer0);
	printf("GPU/CPU speedup (kernel1) = %f\n", timerCPU/timer1);

	// Cleanup
	checkCurandStatus( curandDestroyGenerator(gen), "curandDestroyGenerator" );
	cutilSafeCall( cudaFree(d_x) );
	cutilSafeCall( cudaFree(d_glmx) );   // was leaked before
	free(h_x);
	cudaThreadExit();
	return 0;
}
