diff --git a/parallel/para_gibbs.cu b/parallel/para_gibbs.cu index f174a02..c7f08bc 100644 --- a/parallel/para_gibbs.cu +++ b/parallel/para_gibbs.cu @@ -82,16 +82,17 @@ __global__ void setup_kernel(curandState *state, unsigned int seed, int); __global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandState *state, float *theta, float *log_theta, - float a, float b, int trials); + float a, float b,float *dev_a_out,float *dev_b_out, + int trials); __device__ void sample_theta_seq(float *theta, float *log_theta, int *y, float *n, float a, float b, int K, curandState *state); -__global__ void mergePosterior(); +__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out); int main(int argc, char **argv){ curandState *devStates; - float a, b, *n, *dev_n, *dev_theta, *dev_log_theta; + float a, b, *n, *dev_n, *dev_theta, *dev_log_theta, *dev_a_out, *dev_b_out; int K, *y, *dev_y, nBlocks,nThreads, trials = 1000; if(argc>4){ @@ -126,20 +127,23 @@ int main(int argc, char **argv){ /* Allocate space for random states on device */ CUDA_CALL(cudaMalloc((void **)&devStates, K * sizeof(curandState))); - + /////////////////////////allocate space on dev memory to save a and b + CUDA_CALL(cudaMalloc((void **)&dev_a_out,nBlocks*nThreads*trials*sizeof(float))); + CUDA_CALL(cudaMalloc((void **)&dev_b_out,nBlocks*nThreads*trials*sizeof(float))); /*------ Setup random number generators (one per thread) ---------*/ //nBlocks = (K + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; //nBlocks = 1; setup_kernel<<>>(devStates, 0, K); - rnorm(); - + +//printf("%f",&dev_a_out); printf("alpha, beta\n"); //seqMetroProcess<<>>(K,nBlocks,dev_y,dev_n,devStates,dev_theta,dev_log_theta,a,b,trials); - seqMetroProcess<<>>(K,nBlocks,dev_y,dev_n,devStates,dev_theta,dev_log_theta,a,b,trials); + seqMetroProcess<<>>(K,nBlocks,dev_y,dev_n,devStates,dev_theta,dev_log_theta,a,b,dev_a_out,dev_b_out,trials); + - //mergePosterior<<>>(); + mergePosterior<<<1,1>>>(trials,dev_a_out,dev_b_out); /*------ Free Memory -------------------------------------------*/ free(y); @@ -154,8 +158,16 @@ int main(int argc, char **argv){ return EXIT_SUCCESS; } -__global__ void mergePosterior(){ - printf("\n all blocks finished\n"); +__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out){ +/* printf("\n all blocks finished\n"); + for(int j = 0; j < trials ; j++) { + printf(" %f ", *dev_a_out); + printf(" %f \n",*dev_b_out); + dev_a_out++; + dev_b_out++; + } +*/ + } @@ -179,7 +191,7 @@ __device__ void sample_theta_seq(float *theta, float *log_theta, int *y, float * __global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandState *state, float *theta, float *log_theta, - float a, float b, int trials){ + float a, float b,float *dev_a_out,float *dev_b_out, int trials){ /*------ MCMC ----------------------------------------------------*/ int i; //printf("K: %d \n ",K); @@ -213,9 +225,15 @@ __global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandStat /* Steps of MCMC */ - + int rstart = id*trials/nTasks; + int rlenPerTask = trials; + float *subA; + subA = dev_a_out+rstart; + float *subB; + subB = dev_b_out+rstart; - for(i = 0; i < trials; i++){ + + for(i = 0; i < rlenPerTask; i++){ //sample_theta<<>>(devStates, dev_theta, dev_log_theta, dev_y, dev_n, a, b, K); sample_theta_seq(sTheta, sLogTheta, yy, nn, a, b, lengthPerBlock, state); @@ -236,11 +254,17 @@ __global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandStat //printf("sum of logtheta is %f\n",sum_logs); /* Sample hyperparameters. */ - a = sample_a(state,threadIdx.x, a, b, lengthPerBlock, sum_logs); - b = sample_b(state,threadIdx.x, a, lengthPerBlock, flat_sum); + a = sample_a(state,id, a, b, lengthPerBlock, sum_logs); + b = sample_b(state,id, a, lengthPerBlock, flat_sum); /* print hyperparameters. */ - printf("%f, %f\n", a, b); + //printf("%f, %f\n", a, b); + ////////////save new output to the global array + *subA = a; + subA++; + *subB = b; + subB++; + } }