diff --git a/parallel/para_gibbs.cu b/parallel/para_gibbs.cu index ac50c70..d8d04d5 100644 --- a/parallel/para_gibbs.cu +++ b/parallel/para_gibbs.cu @@ -87,7 +87,9 @@ __global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandStat __device__ void sample_theta_seq(float *theta, float *log_theta, int *y, float *n, float a, float b, int K, curandState *state); -__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int *tDot); +__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int *tDot,curandState *state, int nBlocks); + +__global__ void sampleTdot(int trials, int *tDot,curandState *state); int main(int argc, char **argv){ @@ -100,10 +102,11 @@ int main(int argc, char **argv){ nBlocks = atoi(argv[3]); nThreads = atoi(argv[4]); } - else if(argc > 2) + else if(argc > 2){ trials = atoi(argv[2]); nBlocks = 64; nThreads = 1; + } load_data(argc, argv, &K, &y, &n); @@ -143,7 +146,15 @@ int main(int argc, char **argv){ seqMetroProcess<<>>(K,nBlocks,dev_y,dev_n,devStates,dev_theta,dev_log_theta,a,b,dev_a_out,dev_b_out,trials); int *tDot; - mergePosterior<<<1,1>>>(trials,dev_a_out,dev_b_out,tDot); + CUDA_CALL(cudaMalloc((void **)&tDot,trials*sizeof(int))); + + float *h; + CUDA_CALL(cudaMalloc((void **)&h,trials*sizeof(float))); + + sampleTdot<<>>(trials, tDot,devStates); + + + mergePosterior<<<1,1>>>(trials,dev_a_out,dev_b_out,tDot,devStates,nBlocks); /*------ Free Memory -------------------------------------------*/ free(y); @@ -158,7 +169,16 @@ int main(int argc, char **argv){ return EXIT_SUCCESS; } -__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int *tDot){ +__global__ void sampleTdot(int trials, int *tDot,curandState *state){ + int id = threadIdx.x + blockIdx.x * blockDim.x; + int u = (curand(&state[id]) % (trials-1)) + 1; + //printf("thread %d sample: %d",id,u); + tDot[id] = u; +} + + +__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int *tDot, curandState *state,int nBlocks){ + /* printf("\n all blocks finished\n"); for(int j = 0; j < trials ; j++) { printf(" %f ", *dev_a_out); @@ -168,6 +188,15 @@ __global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int } */ + printf("\n all blocks finished\n"); + for(int j = 0; j < nBlocks*2 ; j++) { + // printf(" %f ", *dev_a_out); + printf(" %d \n",*tDot); + tDot++; + //dev_b_out++; + } + + }