From e4f18f074a14115815317482582a8d68ecd1f288 Mon Sep 17 00:00:00 2001 From: Reynaldo Morillo Date: Fri, 28 Apr 2017 20:31:16 -0400 Subject: [PATCH] Added computeW, general_normal functions. mergePosterior first draft is done, and ready to be tested. --- parallel/para_gibbs.cu | 68 ++++++++++++++++++++++++++++-------------- 1 file changed, 46 insertions(+), 22 deletions(-) diff --git a/parallel/para_gibbs.cu b/parallel/para_gibbs.cu index f344987..f6407f2 100644 --- a/parallel/para_gibbs.cu +++ b/parallel/para_gibbs.cu @@ -87,20 +87,22 @@ __global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandStat __device__ void sample_theta_seq(float *theta, float *log_theta, int *y, float *n, float a, float b, int K, curandState *state); -__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int *tDot,curandState *state, int nBlocks, int nThreads); +__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int *tDot,curandState *state, int nBlocks, int nThreads, float *h, float *results); __global__ void sampleTdot(int trials, int *tDot,curandState *state); __device__ float posteriorMean(int *tDot, float *dev_x_out, int M, int nThreads); -__device__ float computeW(int *tDot, float *dev_x_out, int M, int nThreads, float h); +__device__ float computeW(int *tDot, float *dev_x_out, float *dev_y_out, int M, int nThreads, float h); __device__ float normPDF(float x, float mean, float sigma); +__device__ float general_normoral(curandState *state, float mean, float variance); + int main(int argc, char **argv){ curandState *devStates; - float a, b, *n, *dev_n, *dev_theta, *dev_log_theta, *dev_a_out, *dev_b_out; + float a, b, *n, *dev_n, *dev_theta, *dev_log_theta, *dev_a_out, *dev_b_out, *results; int K, *y, *dev_y, nBlocks,nThreads, trials = 1000; if(argc>4){ @@ -137,8 +139,9 @@ int main(int argc, char **argv){ /* Allocate space for random states on device */ CUDA_CALL(cudaMalloc((void **)&devStates, K * sizeof(curandState))); /////////////////////////allocate space on dev memory to save a and b - CUDA_CALL(cudaMalloc((void **)&dev_a_out,nBlocks*nThreads*trials*sizeof(float))); - CUDA_CALL(cudaMalloc((void **)&dev_b_out,nBlocks*nThreads*trials*sizeof(float))); + CUDA_CALL(cudaMalloc((void **)&dev_a_out, nBlocks*nThreads*trials*sizeof(float))); + CUDA_CALL(cudaMalloc((void **)&dev_b_out, nBlocks*nThreads*trials*sizeof(float))); + CUDA_CALL(cudaMalloc((void **)&results, nThreads*sizeof(float))); /*------ Setup random number generators (one per thread) ---------*/ @@ -161,8 +164,12 @@ int main(int argc, char **argv){ int numThreads = trials / nBlocks; // TODO: This should be a whole number mergePosterior<<<1,1>>>(trials,dev_a_out,dev_b_out,tDot,devStates, nBlocks, nThreads); + + /*------ Free Memory -------------------------------------------*/ + + free(y); free(n); @@ -183,7 +190,7 @@ __global__ void sampleTdot(int trials, int *tDot,curandState *state){ } -__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int *tDot,curandState *state, int nBlocks, int nThreads){ +__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int *tDot,curandState *state, int nBlocks, int nThreads, float *h, float *results){ // printf("\n all blocks finished\n"); // for(int j = 0; j < nBlocks*2 ; j++) { @@ -197,17 +204,31 @@ __global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int int id = threadIdx.x + blockIdx.x * blockDim.x; float h = id^(-1/(4+2)); - for (int i=0; i < M; i++) { - int *cDot; - CUDA_CALL(cudaMemcpy(cDot, tDot, sizeof(int) * trials, cudaMemcpyDeviceToDevice)); - int cm = (curand(&state[id]) % (trials-1)) + 1; - int u = curand_uniform(&state[id]); - // TODO: Compute wcDot - // TODO: Compute WtDot - if () + for (int i=0; i < nThreads; i++) { + for (int m=0; m < M; m++) { + int *cDot; + CUDA_CALL(cudaMemcpy(cDot, tDot, sizeof(int) * trials, cudaMemcpyDeviceToDevice)); + cDot[m] = (curand(&state[id]) % (trials-1)) + 1; + int u = curand_uniform(&state[id]); + float wcDot = computeW(cDot, dev_a_out, dev_b_out, M, nThreads, h[i]); + float wtDot = computeW(tDot, dev_a_out, dev_b_out, M, nThreads, h[i]); + if (u < (wcDot/ wtDot) ) + CUDA_CALL(cudaMemcpy(tDot, cDot, sizeof(int) * trials, cudaMemcpyDeviceToDevice)); + } + // TODO: Draw from Multivariate Normal, and Save into the results + float posterior_mean_a = posteriorMean(tDot, dev_a_out, M, nThreads); + float posterior_mean_b = posteriorMean(tDot, dev_b_out, M, nThreads); + float variance = (h[i]^2)/M; + + printf('%f %f' , norm(state, posterior_mean_a, variance), norm(state, posterior_mean_b, variance)); } } + +__device__ float general_normoral(curandState *state, float mean, float variance) { + return curand_normal(&state[id]) * variance + mean; +} + __device__ float posteriorMean(int *tDot, float *dev_x_out, int M, int nThreads) { float sum = 0; for (int i=0; i < M; i++) { @@ -218,22 +239,25 @@ __device__ float posteriorMean(int *tDot, float *dev_x_out, int M, int nThreads) return mean; } -__device__ float computeW(int *tDot, float *dev_x_out, int M, int nThreads, float h) { - float posterior_mean = posteriorMean(tDot, dev_x_out, M, nThreads); +__device__ float computeW(int *tDot, float *dev_x_out, float *dev_y_out, int M, int nThreads, float h) { + float posterior_mean_x = posteriorMean(tDot, dev_x_out, M, nThreads); + float posterior_mean_y = posteriorMean(tDot, dev_y_out, M, nThreads); float product = 1; - float posterior_m_tm; + float posterior_m_tm_x; + float posterior_m_tm_y; for (int i=0; i < M; i++) { int index = tDot[i] + i * nThreads; // trial m of posterior m (note: i = blockId) - posterior_m_tm = dev_x_out[index]; // posterior_m_tm - product *= normPDF(posterior_m_tm, posterior_mean, h^2); + posterior_m_tm_x = dev_x_out[index]; // posterior_m_tm of x + posterior_m_tm_y = dev_y_out[index]; // posterio_m_tm_ of y + product *= normPDF(posterior_m_tm_x, posterior_mean_x, h^2) * normPDF(posterior_m_tm_y, posterior_mean_y, h^2); } return product; } -__device__ float normPDF(float x, float mean, float sigma) { - float denominator = sqrtf(2*PI*(sigma^2)); - float numerator = expf( -1 * (x-mean)^2 / (2*sigma^2) ); +__device__ float normPDF(float x, float mean, float variance) { + float denominator = sqrtf(2*PI*(variance^2)); + float numerator = expf( -1 * (x-mean)^2 / (2*variance^2) ); return numerator/denominator; }