Skip to content
Permalink
Browse files

Added computeW, general_normal functions. mergePosterior first draft …

…is done, and ready to be tested.
  • Loading branch information
rjm11010 committed Apr 29, 2017
1 parent d270e97 commit e4f18f074a14115815317482582a8d68ecd1f288
Showing with 46 additions and 22 deletions.
  1. +46 −22 parallel/para_gibbs.cu
@@ -87,20 +87,22 @@ __global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandStat
__device__ void sample_theta_seq(float *theta, float *log_theta, int *y, float *n, __device__ void sample_theta_seq(float *theta, float *log_theta, int *y, float *n,
float a, float b, int K, curandState *state); float a, float b, int K, curandState *state);


__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int *tDot,curandState *state, int nBlocks, int nThreads); __global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int *tDot,curandState *state, int nBlocks, int nThreads, float *h, float *results);


__global__ void sampleTdot(int trials, int *tDot,curandState *state); __global__ void sampleTdot(int trials, int *tDot,curandState *state);


__device__ float posteriorMean(int *tDot, float *dev_x_out, int M, int nThreads); __device__ float posteriorMean(int *tDot, float *dev_x_out, int M, int nThreads);


__device__ float computeW(int *tDot, float *dev_x_out, int M, int nThreads, float h); __device__ float computeW(int *tDot, float *dev_x_out, float *dev_y_out, int M, int nThreads, float h);


__device__ float normPDF(float x, float mean, float sigma); __device__ float normPDF(float x, float mean, float sigma);


__device__ float general_normoral(curandState *state, float mean, float variance);

int main(int argc, char **argv){ int main(int argc, char **argv){


curandState *devStates; curandState *devStates;
float a, b, *n, *dev_n, *dev_theta, *dev_log_theta, *dev_a_out, *dev_b_out; float a, b, *n, *dev_n, *dev_theta, *dev_log_theta, *dev_a_out, *dev_b_out, *results;
int K, *y, *dev_y, nBlocks,nThreads, trials = 1000; int K, *y, *dev_y, nBlocks,nThreads, trials = 1000;


if(argc>4){ if(argc>4){
@@ -137,8 +139,9 @@ int main(int argc, char **argv){
/* Allocate space for random states on device */ /* Allocate space for random states on device */
CUDA_CALL(cudaMalloc((void **)&devStates, K * sizeof(curandState))); CUDA_CALL(cudaMalloc((void **)&devStates, K * sizeof(curandState)));
/////////////////////////allocate space on dev memory to save a and b /////////////////////////allocate space on dev memory to save a and b
CUDA_CALL(cudaMalloc((void **)&dev_a_out,nBlocks*nThreads*trials*sizeof(float))); CUDA_CALL(cudaMalloc((void **)&dev_a_out, nBlocks*nThreads*trials*sizeof(float)));
CUDA_CALL(cudaMalloc((void **)&dev_b_out,nBlocks*nThreads*trials*sizeof(float))); CUDA_CALL(cudaMalloc((void **)&dev_b_out, nBlocks*nThreads*trials*sizeof(float)));
CUDA_CALL(cudaMalloc((void **)&results, nThreads*sizeof(float)));


/*------ Setup random number generators (one per thread) ---------*/ /*------ Setup random number generators (one per thread) ---------*/


@@ -161,8 +164,12 @@ int main(int argc, char **argv){


int numThreads = trials / nBlocks; // TODO: This should be a whole number int numThreads = trials / nBlocks; // TODO: This should be a whole number
mergePosterior<<<1,1>>>(trials,dev_a_out,dev_b_out,tDot,devStates, nBlocks, nThreads); mergePosterior<<<1,1>>>(trials,dev_a_out,dev_b_out,tDot,devStates, nBlocks, nThreads);


/*------ Free Memory -------------------------------------------*/ /*------ Free Memory -------------------------------------------*/




free(y); free(y);
free(n); free(n);


@@ -183,7 +190,7 @@ __global__ void sampleTdot(int trials, int *tDot,curandState *state){
} }




__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int *tDot,curandState *state, int nBlocks, int nThreads){ __global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int *tDot,curandState *state, int nBlocks, int nThreads, float *h, float *results){


// printf("\n all blocks finished\n"); // printf("\n all blocks finished\n");
// for(int j = 0; j < nBlocks*2 ; j++) { // for(int j = 0; j < nBlocks*2 ; j++) {
@@ -197,17 +204,31 @@ __global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int
int id = threadIdx.x + blockIdx.x * blockDim.x; int id = threadIdx.x + blockIdx.x * blockDim.x;
float h = id^(-1/(4+2)); float h = id^(-1/(4+2));


for (int i=0; i < M; i++) { for (int i=0; i < nThreads; i++) {
int *cDot; for (int m=0; m < M; m++) {
CUDA_CALL(cudaMemcpy(cDot, tDot, sizeof(int) * trials, cudaMemcpyDeviceToDevice)); int *cDot;
int cm = (curand(&state[id]) % (trials-1)) + 1; CUDA_CALL(cudaMemcpy(cDot, tDot, sizeof(int) * trials, cudaMemcpyDeviceToDevice));
int u = curand_uniform(&state[id]); cDot[m] = (curand(&state[id]) % (trials-1)) + 1;
// TODO: Compute wcDot int u = curand_uniform(&state[id]);
// TODO: Compute WtDot float wcDot = computeW(cDot, dev_a_out, dev_b_out, M, nThreads, h[i]);
if () float wtDot = computeW(tDot, dev_a_out, dev_b_out, M, nThreads, h[i]);
if (u < (wcDot/ wtDot) )
CUDA_CALL(cudaMemcpy(tDot, cDot, sizeof(int) * trials, cudaMemcpyDeviceToDevice));
}
// TODO: Draw from Multivariate Normal, and Save into the results
float posterior_mean_a = posteriorMean(tDot, dev_a_out, M, nThreads);
float posterior_mean_b = posteriorMean(tDot, dev_b_out, M, nThreads);
float variance = (h[i]^2)/M;

printf('%f %f' , norm(state, posterior_mean_a, variance), norm(state, posterior_mean_b, variance));
} }
} }



__device__ float general_normoral(curandState *state, float mean, float variance) {
return curand_normal(&state[id]) * variance + mean;
}

__device__ float posteriorMean(int *tDot, float *dev_x_out, int M, int nThreads) { __device__ float posteriorMean(int *tDot, float *dev_x_out, int M, int nThreads) {
float sum = 0; float sum = 0;
for (int i=0; i < M; i++) { for (int i=0; i < M; i++) {
@@ -218,22 +239,25 @@ __device__ float posteriorMean(int *tDot, float *dev_x_out, int M, int nThreads)
return mean; return mean;
} }


__device__ float computeW(int *tDot, float *dev_x_out, int M, int nThreads, float h) { __device__ float computeW(int *tDot, float *dev_x_out, float *dev_y_out, int M, int nThreads, float h) {
float posterior_mean = posteriorMean(tDot, dev_x_out, M, nThreads); float posterior_mean_x = posteriorMean(tDot, dev_x_out, M, nThreads);
float posterior_mean_y = posteriorMean(tDot, dev_y_out, M, nThreads);
float product = 1; float product = 1;
float posterior_m_tm; float posterior_m_tm_x;
float posterior_m_tm_y;


for (int i=0; i < M; i++) { for (int i=0; i < M; i++) {
int index = tDot[i] + i * nThreads; // trial m of posterior m (note: i = blockId) int index = tDot[i] + i * nThreads; // trial m of posterior m (note: i = blockId)
posterior_m_tm = dev_x_out[index]; // posterior_m_tm posterior_m_tm_x = dev_x_out[index]; // posterior_m_tm of x
product *= normPDF(posterior_m_tm, posterior_mean, h^2); posterior_m_tm_y = dev_y_out[index]; // posterio_m_tm_ of y
product *= normPDF(posterior_m_tm_x, posterior_mean_x, h^2) * normPDF(posterior_m_tm_y, posterior_mean_y, h^2);
} }
return product; return product;
} }


__device__ float normPDF(float x, float mean, float sigma) { __device__ float normPDF(float x, float mean, float variance) {
float denominator = sqrtf(2*PI*(sigma^2)); float denominator = sqrtf(2*PI*(variance^2));
float numerator = expf( -1 * (x-mean)^2 / (2*sigma^2) ); float numerator = expf( -1 * (x-mean)^2 / (2*variance^2) );
return numerator/denominator; return numerator/denominator;
} }


0 comments on commit e4f18f0

Please sign in to comment.
You can’t perform that action at this time.