From be06d94a15928d77d660e6bf6667b6fdd2f2dda5 Mon Sep 17 00:00:00 2001
From: Reynaldo Morillo <reynaldo.morillo@uconn.edu>
Date: Mon, 24 Apr 2017 22:34:47 -0400
Subject: [PATCH] Created posteriorMean function.

---
 parallel/para_gibbs.cu | 185 ++++++++++++++++++++++-------------------
 1 file changed, 100 insertions(+), 85 deletions(-)

diff --git a/parallel/para_gibbs.cu b/parallel/para_gibbs.cu
index d8d04d5..4b9a7af 100644
--- a/parallel/para_gibbs.cu
+++ b/parallel/para_gibbs.cu
@@ -13,7 +13,7 @@ model:
 y_k     ~ Poisson(n_k * theta_k)     k = 1, ..., K
 theta_k ~ Gamma(a, b)
 a       ~ Unif(0, a0)
-b       ~ Unif(0, b0) 
+b       ~ Unif(0, b0)
 
 We let a0 and b0 be arbitrarily large.
 
@@ -39,7 +39,7 @@ $ nvcc gibbs_metropolis.cu -o gibbs
 $ ./gibbs mydata.txt 2500 > output.csv
 $
 
-This code borrows from the nVidia developer zone documentation, 
+This code borrows from the nVidia developer zone documentation,
 specifically http://docs.nvidia.com/cuda/curand/index.html#topic_1_2_1
 */
 
@@ -58,7 +58,7 @@ specifically http://docs.nvidia.com/cuda/curand/index.html#topic_1_2_1
 #define CUDA_CALL(x) {if((x) != cudaSuccess){ \
   printf("CUDA error at %s:%d\n",__FILE__,__LINE__); \
   printf("  %s\n", cudaGetErrorString(cudaGetLastError())); \
-  exit(EXIT_FAILURE);}} 
+  exit(EXIT_FAILURE);}}
 
 #define CURAND_CALL(x) {if((x) != CURAND_STATUS_SUCCESS) { \
   printf("Error at %s:%d\n",__FILE__,__LINE__); \
@@ -75,22 +75,24 @@ __host__ float rgamma(float a, float b);
 
 __device__ float rgamma(curandState *state, int id, float a, float b);
 
-__global__ void sample_theta(curandState *state, float *theta, float *log_theta, 
+__global__ void sample_theta(curandState *state, float *theta, float *log_theta,
                              int *y, float *n, float a, float b, int K);
 __global__ void setup_kernel(curandState *state, unsigned int seed, int);
 
 
-__global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandState *state, 
-                                float *theta, float *log_theta, 
+__global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandState *state,
+                                float *theta, float *log_theta,
                                 float a, float b,float *dev_a_out,float *dev_b_out,
                                 int trials);
 __device__ void sample_theta_seq(float *theta, float *log_theta, int *y, float *n,
                              float a, float b, int K, curandState *state);
 
-__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int *tDot,curandState *state, int nBlocks);
+__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int *tDot,curandState *state, int nBlocks, int nThreads);
 
 __global__ void sampleTdot(int trials, int *tDot,curandState *state);
 
+__device__ float posteriorMean(*tDot, float *dev_x_out, int M, int nThreads);
+
 int main(int argc, char **argv){
 
   curandState *devStates;
@@ -111,17 +113,17 @@ int main(int argc, char **argv){
   load_data(argc, argv, &K, &y, &n);
 
   /* starting values of hyperparameters */
-  a = 20; 
-  b = 1;   
+  a = 20;
+  b = 1;
 
   /*------ Allocate memory -----------------------------------------*/
 
   CUDA_CALL(cudaMalloc((void **)&dev_y, K * sizeof(int)));
-  CUDA_CALL(cudaMemcpy(dev_y, y, K * sizeof(int), 
+  CUDA_CALL(cudaMemcpy(dev_y, y, K * sizeof(int),
             cudaMemcpyHostToDevice));
 
   CUDA_CALL(cudaMalloc((void **)&dev_n, K * sizeof(float)));
-  CUDA_CALL(cudaMemcpy(dev_n, n, K * sizeof(float), 
+  CUDA_CALL(cudaMemcpy(dev_n, n, K * sizeof(float),
             cudaMemcpyHostToDevice));
 
   /* Allocate space for theta and log_theta on device and host */
@@ -139,12 +141,12 @@ int main(int argc, char **argv){
   //nBlocks = (K + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
   //nBlocks = 1;
   setup_kernel<<<nBlocks, THREADS_PER_BLOCK>>>(devStates, 0, K);
-  
+
 //printf("%f",&dev_a_out);
   printf("alpha, beta\n");
   //seqMetroProcess<<<nBlocks,1>>>(K,nBlocks,dev_y,dev_n,devStates,dev_theta,dev_log_theta,a,b,trials);
   seqMetroProcess<<<nBlocks,nThreads>>>(K,nBlocks,dev_y,dev_n,devStates,dev_theta,dev_log_theta,a,b,dev_a_out,dev_b_out,trials);
-  
+
   int *tDot;
   CUDA_CALL(cudaMalloc((void **)&tDot,trials*sizeof(int)));
 
@@ -153,8 +155,8 @@ int main(int argc, char **argv){
 
   sampleTdot<<<nBlocks,nThreads>>>(trials, tDot,devStates);
 
-
-  mergePosterior<<<1,1>>>(trials,dev_a_out,dev_b_out,tDot,devStates,nBlocks);
+  int numThreads = trials / nBlocks;  // TODO: This should be a whole number
+  mergePosterior<<<1,1>>>(trials,dev_a_out,dev_b_out,tDot,devStates, nBlocks, nThreads);
  /*------ Free Memory -------------------------------------------*/
 
   free(y);
@@ -173,30 +175,43 @@ __global__ void sampleTdot(int trials, int *tDot,curandState *state){
    int id = threadIdx.x + blockIdx.x * blockDim.x;
    int u = (curand(&state[id]) % (trials-1)) + 1;
    //printf("thread %d sample: %d",id,u);
-   tDot[id] = u; 
+   tDot[id] = u;
 }
 
 
-__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int *tDot, curandState *state,int nBlocks){
-   
-/*   printf("\n all blocks finished\n");
-   for(int j = 0; j < trials ; j++) {
-        printf(" %f ", *dev_a_out);
-        printf(" %f \n",*dev_b_out);
-        dev_a_out++;
-        dev_b_out++;
-   }
-*/
+__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int *tDot,curandState *state, int nBlocks, int nThreads){
 
- printf("\n all blocks finished\n");
-   for(int j = 0; j < nBlocks*2 ; j++) {
-       // printf(" %f ", *dev_a_out);
-        printf(" %d \n",*tDot);
-        tDot++;
-        //dev_b_out++;
-   }
+ // printf("\n all blocks finished\n");
+ //   for(int j = 0; j < nBlocks*2 ; j++) {
+ //       // printf(" %f ", *dev_a_out);
+ //        printf(" %d \n",*tDot);
+ //        tDot++;
+ //        //dev_b_out++;
+ //   }
 
+  int M = nBlocks * nThreads;
+  int id = threadIdx.x + blockIdx.x * blockDim.x;
+  float h = id^(-1/(4+2));
+
+  for (int i=0; i < M; i++) {
+    int *cDot;
+    CUDA_CALL(cudaMemcpy(cDot, tDot, sizeof(int) * trials, cudaMemcpyDeviceToDevice));
+    int cm = (curand(&state[id]) % (trials-1)) + 1;
+    int u = curand_uniform(&state[id]);
+    // TODO: Compute wcDot
+    // TODO: Compute WtDot
+    if ()
+  }
+}
 
+__device__ float posteriorMean(int *tDot, float *dev_x_out, int M, int nThreads) {
+  float sum;
+  for (int i=0; i < M; i++) {
+    int index =  tDot[i] + i * nThreads;  // trial m of posterior m (note: i = blockId)
+    sum += dev_x_out[index];  // posterior_m_tm
+  }
+  float mean = sum/M;
+  return mean;
 }
 
 
@@ -218,29 +233,29 @@ __device__ void sample_theta_seq(float *theta, float *log_theta, int *y, float *
   }
 }
 
-__global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandState *state, 
-                             float *theta, float *log_theta, 
+__global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandState *state,
+                             float *theta, float *log_theta,
                              float a, float b,float *dev_a_out,float *dev_b_out, int trials){
   /*------ MCMC ----------------------------------------------------*/
   int i;
   //printf("K: %d \n ",K);
   int id = threadIdx.x + blockIdx.x * blockDim.x;
-  
+
   int nTasks = nBlocks*blockDim.x;
   //int start = blockIdx.x * K/nBlocks;
   int start = id * K/nTasks;
-  int lengthPerBlock = K/nTasks;   
+  int lengthPerBlock = K/nTasks;
   //partition the data
   int *yy;
   yy = y+start;
   float *nn;
   nn = n+start;
-  
+
   float *sTheta;
   sTheta = theta+start;
   float *sLogTheta;
-  sLogTheta = log_theta+start; 
- /* 
+  sLogTheta = log_theta+start;
+ /*
   printf("length per block:%d\n",lengthPerBlock);
   printf("start is:%d\n",start);
   printf("partial data under block id: %d \n ",blockIdx.x);
@@ -252,7 +267,7 @@ __global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandStat
 
 
 
-    
+
   /* Steps of MCMC */
   int rstart = id*trials/nTasks;
   int rlenPerTask = trials;
@@ -260,12 +275,12 @@ __global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandStat
   subA = dev_a_out+rstart;
   float *subB;
   subB = dev_b_out+rstart;
-  
-  
-  for(i = 0; i < rlenPerTask; i++){    
+
+
+  for(i = 0; i < rlenPerTask; i++){
     //sample_theta<<<nBlocks, THREADS_PER_BLOCK>>>(devStates, dev_theta, dev_log_theta, dev_y, dev_n, a, b, K);
       sample_theta_seq(sTheta, sLogTheta, yy, nn, a, b, lengthPerBlock, state);
-    
+
       float flat_sum=0;
       for(int ii=0;ii<lengthPerBlock;ii++){
          //*ptr refers to the value at address
@@ -281,14 +296,14 @@ __global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandStat
       }
 //printf("sum of theta is %f\n",flat_sum);
 //printf("sum of logtheta is %f\n",sum_logs);
-      
+
     /* Sample hyperparameters. */
       a = sample_a(state,id, a, b, lengthPerBlock, sum_logs);
       b = sample_b(state,id, a, lengthPerBlock, flat_sum);
-    
+
     /* print hyperparameters. */
     //printf("%f, %f\n", a, b);
-    ////////////save new output to the global array 
+    ////////////save new output to the global array
     *subA = a;
     subA++;
     *subB = b;
@@ -296,7 +311,7 @@ __global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandStat
 
   }
 }
- 
+
 
 /*
  *  Read in data.
@@ -305,8 +320,8 @@ __global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandStat
 __host__ void load_data(int argc, char **argv, int *K, int **y, float **n){
   int k;
   char line[128];
-  FILE *fp;    
-    
+  FILE *fp;
+
   if(argc > 1){
     fp = fopen(argv[1], "r");
   } else {
@@ -321,33 +336,33 @@ __host__ void load_data(int argc, char **argv, int *K, int **y, float **n){
 
   *K = 0;
   while( fgets (line, sizeof line, fp) != NULL )
-    (*K)++; 
+    (*K)++;
 
   rewind(fp);
 
   *y = (int*) malloc((*K) * sizeof(int));
-  *n = (float*) malloc((*K) * sizeof(float)); 
-  
+  *n = (float*) malloc((*K) * sizeof(float));
+
   for(k = 0; k < *K; k++)
-    fscanf(fp, "%d %f", *y + k, *n + k);    
- 
+    fscanf(fp, "%d %f", *y + k, *n + k);
+
   fclose(fp);
 }
 
 
 /*
- *  Metropolis algorithm for producing random a values. 
+ *  Metropolis algorithm for producing random a values.
  *  The proposal distribution in normal with a variance that
  *  is adjusted at each step.
  */
- 
+
 __device__ float sample_a(curandState *state,int id, float a, float b, int K, float sum_logs){
 
   float sigma = 2.0;
   float norm = curand_normal(&state[id]);
   float U, log_acceptance_ratio, proposal = norm * sigma + a;
   //printf("rnorm result:%f \n",norm*sigma);
-  if(proposal <= 0) 
+  if(proposal <= 0)
     return a;
 
   log_acceptance_ratio = (proposal - a) * sum_logs +
@@ -380,7 +395,7 @@ __device__ float sample_b(curandState *state,int id, float a, int K, float flat_
 }
 
 
-/* 
+/*
  *  Box-Muller Transformation: Generate one standard normal variable.
  *
  *  This algorithm can be more efficiently used by producing two
@@ -403,7 +418,7 @@ printf("host RANDMAX result:%f \n",RAND_MAX);
   return V1;
 }
 
- 
+
 __device__ float rnorm(curandState *state,int id){
 
   float U1 = curand_uniform(&state[id]);
@@ -423,7 +438,7 @@ printf("RANDMAX result:%f \n",RAND_MAX);
  *  See device rgamma function. This is probably not the
  *   fastest way to generate random gamma variables on a CPU.
  */
- 
+
 __host__ float rgamma(float a, float b){
 
   float d = a - 1.0 / 3;
@@ -433,26 +448,26 @@ __host__ float rgamma(float a, float b){
     Y = rnorm();
     v = pow((1 + Y / sqrt(9 * d)), 3);
 
-    // Necessary to avoid taking the log of a negative number later. 
-    if(v <= 0) 
+    // Necessary to avoid taking the log of a negative number later.
+    if(v <= 0)
       continue;
-        
+
     U = rand() / float(RAND_MAX);
 
-    // Accept the sample under the following condition. 
-    //   Otherwise repeat loop. 
+    // Accept the sample under the following condition.
+    //   Otherwise repeat loop.
     if(log(U) < 0.5 * pow(Y,2) + d * (1 - v + log(v)))
             return d * v / b;
   }
 }
 
-/* 
- *  Generate a single Gamma distributed random variable by the Marsoglia 
+/*
+ *  Generate a single Gamma distributed random variable by the Marsoglia
  *  algorithm (George Marsaglia, Wai Wan Tsang; 2001).
  *
  *  Zeb chose this algorithm because it has a very high acceptance rate (>96%),
- *  so this while loop will usually only need to run a few times. Many other 
- *  algorithms, while perhaps faster on a CPU, have acceptance rates on the 
+ *  so this while loop will usually only need to run a few times. Many other
+ *  algorithms, while perhaps faster on a CPU, have acceptance rates on the
  *  order of 50% (very bad in a massively parallel context).
  */
 
@@ -461,17 +476,17 @@ __device__ float rgamma(curandState *state, int id, float a, float b){
   float d = a - 1.0 / 3;
   float Y, U, v;
 
-  while(1){   
+  while(1){
     Y = curand_normal(&state[id]);
     v = pow((1 + Y / sqrt(9 * d)), 3);
 
     /* Necessary to avoid taking the log of a negative number later. */
-    if(v <= 0) 
+    if(v <= 0)
       continue;
-        
+
     U = curand_uniform(&state[id]);
 
-    /* Accept the sample under the following condition. 
+    /* Accept the sample under the following condition.
        Otherwise repeat loop. */
     if(log(U) < 0.5 * pow(Y,2) + d * (1 - v + log(v)))
       return d * v / b;
@@ -482,14 +497,14 @@ __device__ float rgamma(curandState *state, int id, float a, float b){
 /*
  *  Sample each theta from the appropriate gamma distribution
  */
- 
-__global__ void sample_theta(curandState *state, 
-                             float *theta, float *log_theta, int *y, float *n, 
+
+__global__ void sample_theta(curandState *state,
+                             float *theta, float *log_theta, int *y, float *n,
                              float a, float b, int K){
-                             
+
   int id = threadIdx.x + blockIdx.x * blockDim.x;
   float hyperA, hyperB;
-    
+
   if(id < K){
     hyperA = a + y[id];
     hyperB = b + n[id];
@@ -499,14 +514,14 @@ __global__ void sample_theta(curandState *state,
 }
 
 
-/* 
- *  Initialize GPU random number generators 
+/*
+ *  Initialize GPU random number generators
  */
- 
+
 __global__ void setup_kernel(curandState *state, unsigned int seed, int K){
 
   int id = threadIdx.x + blockIdx.x * blockDim.x;
-  
+
   if(id < K)
     curand_init(seed, id, 0, &state[id]);
-} 
+}