|
@@ -82,16 +82,17 @@ __global__ void setup_kernel(curandState *state, unsigned int seed, int); |
|
|
|
|
|
__global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandState *state, |
|
|
float *theta, float *log_theta, |
|
|
float a, float b, int trials); |
|
|
float a, float b,float *dev_a_out,float *dev_b_out, |
|
|
int trials); |
|
|
__device__ void sample_theta_seq(float *theta, float *log_theta, int *y, float *n, |
|
|
float a, float b, int K, curandState *state); |
|
|
|
|
|
__global__ void mergePosterior(); |
|
|
__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out); |
|
|
|
|
|
int main(int argc, char **argv){ |
|
|
|
|
|
curandState *devStates; |
|
|
float a, b, *n, *dev_n, *dev_theta, *dev_log_theta; |
|
|
float a, b, *n, *dev_n, *dev_theta, *dev_log_theta, *dev_a_out, *dev_b_out; |
|
|
int K, *y, *dev_y, nBlocks,nThreads, trials = 1000; |
|
|
|
|
|
if(argc>4){ |
|
@@ -126,20 +127,23 @@ int main(int argc, char **argv){ |
|
|
|
|
|
/* Allocate space for random states on device */ |
|
|
CUDA_CALL(cudaMalloc((void **)&devStates, K * sizeof(curandState))); |
|
|
|
|
|
/////////////////////////allocate space on dev memory to save a and b |
|
|
CUDA_CALL(cudaMalloc((void **)&dev_a_out,nBlocks*nThreads*trials*sizeof(float))); |
|
|
CUDA_CALL(cudaMalloc((void **)&dev_b_out,nBlocks*nThreads*trials*sizeof(float))); |
|
|
|
|
|
/*------ Setup random number generators (one per thread) ---------*/ |
|
|
|
|
|
//nBlocks = (K + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; |
|
|
//nBlocks = 1; |
|
|
setup_kernel<<<nBlocks, THREADS_PER_BLOCK>>>(devStates, 0, K); |
|
|
rnorm(); |
|
|
|
|
|
|
|
|
//printf("%f",&dev_a_out); |
|
|
printf("alpha, beta\n"); |
|
|
//seqMetroProcess<<<nBlocks,1>>>(K,nBlocks,dev_y,dev_n,devStates,dev_theta,dev_log_theta,a,b,trials); |
|
|
seqMetroProcess<<<nBlocks,nThreads>>>(K,nBlocks,dev_y,dev_n,devStates,dev_theta,dev_log_theta,a,b,trials); |
|
|
seqMetroProcess<<<nBlocks,nThreads>>>(K,nBlocks,dev_y,dev_n,devStates,dev_theta,dev_log_theta,a,b,dev_a_out,dev_b_out,trials); |
|
|
|
|
|
|
|
|
//mergePosterior<<<nBlocks,1>>>(); |
|
|
mergePosterior<<<1,1>>>(trials,dev_a_out,dev_b_out); |
|
|
/*------ Free Memory -------------------------------------------*/ |
|
|
|
|
|
free(y); |
|
@@ -154,8 +158,16 @@ int main(int argc, char **argv){ |
|
|
return EXIT_SUCCESS; |
|
|
} |
|
|
|
|
|
__global__ void mergePosterior(){ |
|
|
printf("\n all blocks finished\n"); |
|
|
__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out){ |
|
|
/* printf("\n all blocks finished\n"); |
|
|
for(int j = 0; j < trials ; j++) { |
|
|
printf(" %f ", *dev_a_out); |
|
|
printf(" %f \n",*dev_b_out); |
|
|
dev_a_out++; |
|
|
dev_b_out++; |
|
|
} |
|
|
*/ |
|
|
|
|
|
} |
|
|
|
|
|
|
|
@@ -179,7 +191,7 @@ __device__ void sample_theta_seq(float *theta, float *log_theta, int *y, float * |
|
|
|
|
|
__global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandState *state, |
|
|
float *theta, float *log_theta, |
|
|
float a, float b, int trials){ |
|
|
float a, float b,float *dev_a_out,float *dev_b_out, int trials){ |
|
|
/*------ MCMC ----------------------------------------------------*/ |
|
|
int i; |
|
|
//printf("K: %d \n ",K); |
|
@@ -213,9 +225,15 @@ __global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandStat |
|
|
|
|
|
|
|
|
/* Steps of MCMC */ |
|
|
|
|
|
int rstart = id*trials/nTasks; |
|
|
int rlenPerTask = trials; |
|
|
float *subA; |
|
|
subA = dev_a_out+rstart; |
|
|
float *subB; |
|
|
subB = dev_b_out+rstart; |
|
|
|
|
|
for(i = 0; i < trials; i++){ |
|
|
|
|
|
for(i = 0; i < rlenPerTask; i++){ |
|
|
//sample_theta<<<nBlocks, THREADS_PER_BLOCK>>>(devStates, dev_theta, dev_log_theta, dev_y, dev_n, a, b, K); |
|
|
sample_theta_seq(sTheta, sLogTheta, yy, nn, a, b, lengthPerBlock, state); |
|
|
|
|
@@ -236,11 +254,17 @@ __global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandStat |
|
|
//printf("sum of logtheta is %f\n",sum_logs); |
|
|
|
|
|
/* Sample hyperparameters. */ |
|
|
a = sample_a(state,threadIdx.x, a, b, lengthPerBlock, sum_logs); |
|
|
b = sample_b(state,threadIdx.x, a, lengthPerBlock, flat_sum); |
|
|
a = sample_a(state,id, a, b, lengthPerBlock, sum_logs); |
|
|
b = sample_b(state,id, a, lengthPerBlock, flat_sum); |
|
|
|
|
|
/* print hyperparameters. */ |
|
|
printf("%f, %f\n", a, b); |
|
|
//printf("%f, %f\n", a, b); |
|
|
////////////save new output to the global array |
|
|
*subA = a; |
|
|
subA++; |
|
|
*subB = b; |
|
|
subB++; |
|
|
|
|
|
} |
|
|
} |
|
|
|
|
|