From e4f18f074a14115815317482582a8d68ecd1f288 Mon Sep 17 00:00:00 2001
From: Reynaldo Morillo <reynaldo.morillo@uconn.edu>
Date: Fri, 28 Apr 2017 20:31:16 -0400
Subject: [PATCH] Added computeW, general_normal functions. mergePosterior
 first draft is done, and ready to be tested.

---
 parallel/para_gibbs.cu | 68 ++++++++++++++++++++++++++++--------------
 1 file changed, 46 insertions(+), 22 deletions(-)

diff --git a/parallel/para_gibbs.cu b/parallel/para_gibbs.cu
index f344987..f6407f2 100644
--- a/parallel/para_gibbs.cu
+++ b/parallel/para_gibbs.cu
@@ -87,20 +87,22 @@ __global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandStat
 __device__ void sample_theta_seq(float *theta, float *log_theta, int *y, float *n,
                              float a, float b, int K, curandState *state);
 
-__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int *tDot,curandState *state, int nBlocks, int nThreads);
+__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int *tDot,curandState *state, int nBlocks, int nThreads, float *h, float *results);
 
 __global__ void sampleTdot(int trials, int *tDot,curandState *state);
 
 __device__ float posteriorMean(int *tDot, float *dev_x_out, int M, int nThreads);
 
-__device__ float computeW(int *tDot, float *dev_x_out, int M, int nThreads, float h);
+__device__ float computeW(int *tDot, float *dev_x_out, float *dev_y_out, int M, int nThreads, float h);
 
 __device__ float normPDF(float x, float mean, float sigma);
 
+__device__ float general_normoral(curandState *state, float mean, float variance);
+
 int main(int argc, char **argv){
 
   curandState *devStates;
-  float a, b, *n, *dev_n, *dev_theta, *dev_log_theta, *dev_a_out, *dev_b_out;
+  float a, b, *n, *dev_n, *dev_theta, *dev_log_theta, *dev_a_out, *dev_b_out, *results;
   int  K, *y, *dev_y, nBlocks,nThreads, trials = 1000;
 
   if(argc>4){
@@ -137,8 +139,9 @@ int main(int argc, char **argv){
   /* Allocate space for random states on device */
   CUDA_CALL(cudaMalloc((void **)&devStates, K * sizeof(curandState)));
   /////////////////////////allocate space on dev memory to save a and b
-  CUDA_CALL(cudaMalloc((void **)&dev_a_out,nBlocks*nThreads*trials*sizeof(float)));
-  CUDA_CALL(cudaMalloc((void **)&dev_b_out,nBlocks*nThreads*trials*sizeof(float)));
+  CUDA_CALL(cudaMalloc((void **)&dev_a_out, nBlocks*nThreads*trials*sizeof(float)));
+  CUDA_CALL(cudaMalloc((void **)&dev_b_out, nBlocks*nThreads*trials*sizeof(float)));
+  CUDA_CALL(cudaMalloc((void **)&results, nThreads*sizeof(float)));
 
   /*------ Setup random number generators (one per thread) ---------*/
 
@@ -161,8 +164,12 @@ int main(int argc, char **argv){
 
   int numThreads = trials / nBlocks;  // TODO: This should be a whole number
   mergePosterior<<<1,1>>>(trials,dev_a_out,dev_b_out,tDot,devStates, nBlocks, nThreads);
+
+
  /*------ Free Memory -------------------------------------------*/
 
+
+
   free(y);
   free(n);
 
@@ -183,7 +190,7 @@ __global__ void sampleTdot(int trials, int *tDot,curandState *state){
 }
 
 
-__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int *tDot,curandState *state, int nBlocks, int nThreads){
+__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int *tDot,curandState *state, int nBlocks, int nThreads, float *h, float *results){
 
  // printf("\n all blocks finished\n");
  //   for(int j = 0; j < nBlocks*2 ; j++) {
@@ -197,17 +204,31 @@ __global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int
   int id = threadIdx.x + blockIdx.x * blockDim.x;
   float h = id^(-1/(4+2));
 
-  for (int i=0; i < M; i++) {
-    int *cDot;
-    CUDA_CALL(cudaMemcpy(cDot, tDot, sizeof(int) * trials, cudaMemcpyDeviceToDevice));
-    int cm = (curand(&state[id]) % (trials-1)) + 1;
-    int u = curand_uniform(&state[id]);
-    // TODO: Compute wcDot
-    // TODO: Compute WtDot
-    if ()
+  for (int i=0; i < nThreads; i++) {
+    for (int m=0; m < M; m++) {
+      int *cDot;
+      CUDA_CALL(cudaMemcpy(cDot, tDot, sizeof(int) * trials, cudaMemcpyDeviceToDevice));
+      cDot[m] = (curand(&state[id]) % (trials-1)) + 1;
+      int u = curand_uniform(&state[id]);
+      float wcDot = computeW(cDot, dev_a_out, dev_b_out, M, nThreads, h[i]);
+      float wtDot = computeW(tDot, dev_a_out, dev_b_out, M, nThreads, h[i]);
+      if (u < (wcDot/ wtDot) )
+        CUDA_CALL(cudaMemcpy(tDot, cDot, sizeof(int) * trials, cudaMemcpyDeviceToDevice));
+    }
+    // TODO: Draw from Multivariate Normal, and Save into the results
+    float posterior_mean_a = posteriorMean(tDot, dev_a_out, M, nThreads);
+    float posterior_mean_b = posteriorMean(tDot, dev_b_out, M, nThreads);
+    float variance = (h[i]^2)/M;
+
+    printf('%f %f' , norm(state, posterior_mean_a, variance), norm(state, posterior_mean_b, variance));
   }
 }
 
+
+__device__ float general_normoral(curandState *state, float mean, float variance) {
+    return curand_normal(&state[id]) * variance + mean;
+}
+
 __device__ float posteriorMean(int *tDot, float *dev_x_out, int M, int nThreads) {
   float sum = 0;
   for (int i=0; i < M; i++) {
@@ -218,22 +239,25 @@ __device__ float posteriorMean(int *tDot, float *dev_x_out, int M, int nThreads)
   return mean;
 }
 
-__device__ float computeW(int *tDot, float *dev_x_out, int M, int nThreads, float h) {
-  float posterior_mean = posteriorMean(tDot, dev_x_out, M, nThreads);
+__device__ float computeW(int *tDot, float *dev_x_out, float *dev_y_out, int M, int nThreads, float h) {
+  float posterior_mean_x = posteriorMean(tDot, dev_x_out, M, nThreads);
+  float posterior_mean_y = posteriorMean(tDot, dev_y_out, M, nThreads);
   float product = 1;
-  float posterior_m_tm;
+  float posterior_m_tm_x;
+  float posterior_m_tm_y;
 
   for (int i=0; i < M; i++) {
     int index =  tDot[i] + i * nThreads;  // trial m of posterior m (note: i = blockId)
-    posterior_m_tm = dev_x_out[index];  // posterior_m_tm
-    product *= normPDF(posterior_m_tm, posterior_mean, h^2);
+    posterior_m_tm_x = dev_x_out[index];  // posterior_m_tm of x
+    posterior_m_tm_y = dev_y_out[index];  // posterio_m_tm_ of y
+    product *= normPDF(posterior_m_tm_x, posterior_mean_x, h^2) * normPDF(posterior_m_tm_y, posterior_mean_y, h^2);
   }
   return product;
 }
 
-__device__ float normPDF(float x, float mean, float sigma) {
-  float denominator = sqrtf(2*PI*(sigma^2));
-  float numerator = expf( -1 * (x-mean)^2 / (2*sigma^2) );
+__device__ float normPDF(float x, float mean, float variance) {
+  float denominator = sqrtf(2*PI*(variance^2));
+  float numerator = expf( -1 * (x-mean)^2 / (2*variance^2) );
   return numerator/denominator;
 }