From 3e8a6419373ca515df0d132a48423089834df64a Mon Sep 17 00:00:00 2001
From: Reynaldo Morillo <reynaldo.morillo@uconn.edu>
Date: Sun, 30 Apr 2017 13:49:24 -0400
Subject: [PATCH] Made a performance enhancement to para_gibbs by removing the
 copying of tDot to make cDot. Instead, just replace the values that differ
 between them when necessary.

---
 parallel/para_gibbs.cu | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/parallel/para_gibbs.cu b/parallel/para_gibbs.cu
index f9a1f5b..3d5e144 100644
--- a/parallel/para_gibbs.cu
+++ b/parallel/para_gibbs.cu
@@ -176,7 +176,7 @@ int main(int argc, char **argv){
 
   free(y);
   free(n);
-  
+
   CUDA_CALL(cudaFree(devStates));
   CUDA_CALL(cudaFree(dev_theta));
   CUDA_CALL(cudaFree(dev_log_theta));
@@ -216,21 +216,27 @@ __global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int
   int M = nBlocks * nThreads;
   int id = threadIdx.x + blockIdx.x * blockDim.x;
 
-  for (int i=0; i < trials; i++) { 
+  for (int i=0; i < trials; i++) {
     h[i] = powf(i,(-1/(4+2)));
     for (int m=0; m < M; m++) {
       int *cDot = (int*) malloc(M*sizeof(int));
-  //printf("%d\n",m);
- 
-      memcpy(cDot, tDot, sizeof(int) * M);
-      cDot[m] = (curand(&state[id]) % (trials-1)) + 1;
+
+      // Performance modification made:
+      // We are not going to make a copy of tDot to make cDot; rather, we
+      // just replace the one value that differs between them when necessary
+
+      int tDot_val_at_m = tDot[m];
+      int cDot_val_at_m = (curand(&state[id]) % (trials-1)) + 1;
+      tDot[m] =  cDot_val_at_m;  // At this point tDot is actually cDot
       int u = curand_uniform(&state[id]);
-      float wcDot = computeW(cDot, dev_a_out, dev_b_out, M, trials, h[i]);
-      float wtDot = computeW(tDot, dev_a_out, dev_b_out, M, trials, h[i]);
+      float wcDot = computeW(tDot, dev_a_out, dev_b_out, M, trials, h[i]);  // Note: tDot is actually cDot
+      // Switch back to tDot
+      tDot[m] = tDot_val_at_m;
+      float wtDot = computeW(tDot, dev_a_out, dev_b_out, M, trials, h[i]);  // This is using tDot
       if (u < (wcDot/ wtDot) ){
-        memcpy(tDot, cDot, sizeof(int) * trials);
+        // Return tDot to the cDot configuration
+        tDot[m] = cDot_val_at_m;
       }
-     free(cDot);
     }
     // TODO: Draw from Multivariate Normal, and Save into the results
     float posterior_mean_a = posteriorMean(tDot, dev_a_out, M, trials);
@@ -281,7 +287,7 @@ __device__ float computeW(int *tDot, float *dev_x_out, float *dev_y_out, int M,
 }
 
 __device__ float normPDF(float x, float mean, float variance) {
-  
+
   float denominator = sqrtf(2*PI*(variance*variance));
   //printf("the denominator is %f\n",PI);
   float numerator = expf( -1 * (x-mean)*(x-mean) / (2*variance*variance) );