#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil_inline.h>
#include <device_functions.h>
#include <sm_11_atomic_functions.h>

// Shorthand type names used for the input bytes and the histogram counters.
typedef unsigned char uchar;
typedef unsigned int  uint;
// Number of histogram bins; sizes both the per-block shared array in the
// kernel and the host/device histogram buffers.
#define HISTOGRAM_BIN_COUNT 256
// Number of input bytes that main() generates and feeds to the kernel.
#define N 1024
/**
 * Accumulates a HISTOGRAM_BIN_COUNT-bin histogram of the bytes in `color`.
 *
 * Each block first builds a private histogram in shared memory using
 * shared-memory atomics, then merges its non-empty bins into `histogram`
 * with global atomics.
 *
 * @param histogram  Device array of HISTOGRAM_BIN_COUNT counters. Counts are
 *                   ADDED to the existing contents, so the caller must zero
 *                   it beforehand if a fresh histogram is wanted.
 * @param color      Device array of at least `size` input bytes; each byte
 *                   value is used directly as a bin index.
 * @param size       Number of bytes to process. Values <= 0 are treated as
 *                   an empty input.
 *
 * Launch layout: 1D grid of 1D blocks. All loops are strided by the block
 * or grid width, so any block/grid size produces the same result.
 * Shared memory: HISTOGRAM_BIN_COUNT * sizeof(uint) bytes, statically sized.
 */
__global__ void histogram3(uint* histogram, uchar* color, int size)
{
    __shared__ uint data[HISTOGRAM_BIN_COUNT];

    // Clear the block-private histogram. The loop is strided by blockDim.x so
    // every bin is zeroed even when the block has fewer threads than bins.
    for (uint bin = threadIdx.x; bin < HISTOGRAM_BIN_COUNT; bin += blockDim.x)
        data[bin] = 0;
    __syncthreads();

    // Clamp a negative size to zero. Comparing an unsigned index against a
    // negative int would otherwise convert it to a huge bound and read far
    // past the end of `color`.
    const size_t count = (size > 0) ? (size_t)size : 0;

    // Use wide index/stride arithmetic so neither blockDim.x * gridDim.x nor
    // the running index can wrap around on large launches or inputs.
    const size_t gridStride = (size_t)blockDim.x * gridDim.x;
    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         i < count; i += gridStride)
        atomicAdd(&data[color[i]], 1u);

    // Every thread reaches this barrier (no early exits above), so the
    // private histogram is complete before any thread reads it.
    __syncthreads();

    // Merge the private histogram into the global one. Bins this block never
    // touched are skipped to avoid needless contention on global atomics.
    for (uint bin = threadIdx.x; bin < HISTOGRAM_BIN_COUNT; bin += blockDim.x)
    {
        const uint partial = data[bin];
        if (partial != 0)
            atomicAdd(&histogram[bin], partial);
    }
}

// Prints a diagnostic naming the failed step and terminates the program when
// a CUDA runtime call did not return cudaSuccess.
static void checkCudaOrDie(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n",
                what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/**
 * Host driver: fills N bytes with pseudo-random values (fixed seed 2017),
 * runs histogram3 on the device, and prints the HISTOGRAM_BIN_COUNT bin
 * counts on a single line separated by spaces.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if a host allocation or any
 * CUDA call fails.
 */
int main() {
    const size_t colorBytes     = N * sizeof(uchar);
    const size_t histogramBytes = HISTOGRAM_BIN_COUNT * sizeof(uint);

    // Host buffers.
    uchar* hColor      = (uchar*)malloc(colorBytes);
    uint*  hHistogram3 = (uint*)malloc(histogramBytes);
    if (hColor == NULL || hHistogram3 == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        free(hColor);
        free(hHistogram3);
        return EXIT_FAILURE;
    }

    // Device buffers.
    uchar* dColor     = NULL;
    uint*  dHistogram = NULL;
    checkCudaOrDie(cudaMalloc((void**)&dHistogram, histogramBytes),
                   "cudaMalloc(dHistogram)");
    checkCudaOrDie(cudaMalloc((void**)&dColor, colorBytes),
                   "cudaMalloc(dColor)");

    // Deterministic input so runs are reproducible.
    srand(2017);
    for (uint i = 0; i < N; ++i)
        hColor[i] = (uchar)(rand() % 256);

    checkCudaOrDie(cudaMemcpy(dColor, hColor, colorBytes,
                              cudaMemcpyHostToDevice),
                   "cudaMemcpy(host -> device)");
    // The kernel accumulates into the histogram, so it must start at zero.
    checkCudaOrDie(cudaMemset(dHistogram, 0, histogramBytes),
                   "cudaMemset(dHistogram)");

    // One thread per input byte, rounded up to whole blocks.
    dim3 block, grid;
    block.x = 64;
    grid.x = (N + block.x - 1) / block.x;
    histogram3<<<grid, block>>>(dHistogram, dColor, N);
    // A kernel launch returns nothing; configuration errors only show up here.
    checkCudaOrDie(cudaGetLastError(), "histogram3 launch");

    // Blocking copy: also waits for the kernel and reports execution errors.
    checkCudaOrDie(cudaMemcpy(hHistogram3, dHistogram, histogramBytes,
                              cudaMemcpyDeviceToHost),
                   "cudaMemcpy(device -> host)");

    // Counters are unsigned, so print them with %u.
    for (int i = 0; i < HISTOGRAM_BIN_COUNT; ++i)
        printf("%u ", hHistogram3[i]);
    printf("\n");

    checkCudaOrDie(cudaFree(dColor), "cudaFree(dColor)");
    checkCudaOrDie(cudaFree(dHistogram), "cudaFree(dHistogram)");
    free(hHistogram3);
    free(hColor);
    return 0;
}
