A version that could run! lol

cudahpcproj · Apr 20, 2017 · fd678a1 · fd678a1
1 parent 81d0a83
commit fd678a1
Showing 1 changed file with 49 additions and 26 deletions.
diff --git a/parallel/para_gibbs.cu b/parallel/para_gibbs.cu
@@ -67,10 +67,10 @@ specifically http://docs.nvidia.com/cuda/curand/index.html#topic_1_2_1
 
 __host__ void load_data(int argc, char **argv, int *K, int **y, float **n);
 
-__host__ float sample_a(float a, float b, int K, float sum_logs);
-__host__ float sample_b(float a, int K, float flat_sum);
-
+__device__ float sample_a(curandState *state, float a, float b, int K, float sum_logs);
+__device__ float sample_b(curandState *state, float a, int K, float flat_sum);
 __host__ float rnorm();
+__device__ float rnorm(curandState *state );
 __host__ float rgamma(float a, float b);
 
 __device__ float rgamma(curandState *state, int id, float a, float b);
@@ -89,8 +89,8 @@ __device__ void sample_theta_seq(float *theta, float *log_theta, int *y, float *
 int main(int argc, char **argv){
 
   curandState *devStates;
-  float a, b, flat_sum, sum_logs, *n, *dev_n, *dev_theta, *dev_log_theta;
-  int i, K, *y, *dev_y, nBlocks, trials = 1000;
+  float a, b, *n, *dev_n, *dev_theta, *dev_log_theta;
+  int  K, *y, *dev_y, nBlocks, trials = 1000;
 
   if(argc > 2)
     trials = atoi(argv[2]);
@@ -167,14 +167,19 @@ __global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandStat
   int start = blockIdx.x * K/nBlocks;
   int lengthPerBlock = K/nBlocks;   
   //partition the data
-  int *yy = &y[start];
-  float *nn = &n[start];
-  float *sTheta = &theta[start];
-  float *sLogTheta = &log_theta[start]; 
+  int *yy;
+  yy = &y[start];
+  float *nn;
+  nn = &n[start];
+
+  float *sTheta;
+  sTheta = &theta[start];
+  float *sLogTheta;
+  sLogTheta = &log_theta[start]; 
 
   printf("block id:%d\n",blockIdx.x);
   for(int j = 0; j < lengthPerBlock ; j++) {
-        printf("%d ", yy[j]);
+        printf("%d\n ", yy[j]);
     }
   printf("alpha, beta\n");
 
@@ -185,18 +190,26 @@ __global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandStat
 
   for(i = 0; i < trials; i++){    
     //sample_theta<<<nBlocks, THREADS_PER_BLOCK>>>(devStates, dev_theta, dev_log_theta, dev_y, dev_n, a, b, K);
-      sample_theta_seq(sTheta, sLogTheta, yy, nn, a, b, K, state);
-       /* Make iterators for thetas and log thetas. */
-   // thrust::device_ptr<float> theta(dev_theta);
-   // thrust::device_ptr<float> log_theta(dev_log_theta);
+      sample_theta_seq(sTheta, sLogTheta, yy, nn, a, b, lengthPerBlock, state);
 
-    /* Compute pairwise sums of thetas and log_thetas. */
-   // flat_sum = thrust::reduce(theta, theta + K);
-   // sum_logs = thrust::reduce(log_theta, log_theta + K);
-
+      float flat_sum=0;
+      for(int ii=0;ii<lengthPerBlock;ii++){
+         //*ptr refers to the value at address
+         flat_sum = flat_sum + *sTheta;
+         sTheta++;
+      }
+
+      float sum_logs=0;
+      for(int ii=0;ii<lengthPerBlock;ii++){
+         //*ptr refers to the value at address
+         sum_logs = sum_logs + *sLogTheta;
+         sLogTheta++;
+      }
+
+
     /* Sample hyperparameters. */
-   // a = sample_a(a, b, K, sum_logs);
-   // b = sample_b(a, K, flat_sum);
+      a = sample_a(state, a, b, lengthPerBlock, sum_logs);
+      b = sample_b(state, a, lengthPerBlock, flat_sum);
 
     /* print hyperparameters. */
     printf("%f, %f\n", a, b); 
@@ -247,10 +260,10 @@ __host__ void load_data(int argc, char **argv, int *K, int **y, float **n){
  *  is adjusted at each step.
  */
 
-__host__ float sample_a(float a, float b, int K, float sum_logs){
+__device__ float sample_a(curandState *state, float a, float b, int K, float sum_logs){
 
   static float sigma = 2;
-  float U, log_acceptance_ratio, proposal = rnorm() * sigma + a;
+  float U, log_acceptance_ratio, proposal = rnorm(state) * sigma + a;
 
   if(proposal <= 0) 
     return a;
@@ -259,7 +272,7 @@ __host__ float sample_a(float a, float b, int K, float sum_logs){
                          K * (proposal - a) * log(b) -
                          K * (lgamma(proposal) - lgamma(a));
 
-  U = rand() / float(RAND_MAX);
+  U = curand(state) / float(RAND_MAX);
 
   if(log(U) < log_acceptance_ratio){
     sigma *= 1.1;
@@ -275,11 +288,11 @@ __host__ float sample_a(float a, float b, int K, float sum_logs){
  *  Sample b from a gamma distribution.
  */
 
-__host__ float sample_b(float a, int K, float flat_sum){
+__device__ float sample_b(curandState *state, float a, int K, float flat_sum){
 
   float hyperA = K * a + 1;
   float hyperB = flat_sum;
-  return rgamma(hyperA, hyperB);
+  return rgamma(state,0,hyperA, hyperB);
 }
 
 
@@ -293,7 +306,7 @@ __host__ float sample_b(float a, int K, float flat_sum){
  *  This is actually the algorithm chosen by NVIDIA to calculate
  *  normal random variables on the GPU.
  */
- 
+
 __host__ float rnorm(){
 
   float U1 = rand() / float(RAND_MAX);
@@ -303,6 +316,16 @@ __host__ float rnorm(){
   return V1;
 }
 
+
+__device__ float rnorm(curandState *state){
+
+  float U1 = curand(state) / float(RAND_MAX);
+  float U2 = curand(state) / float(RAND_MAX);
+  float V1 = sqrt(-2 * log(U1)) * cos(2 * PI * U2);
+  /* float V2 = sqrt(-2 * log(U2)) * cos(2 * PI * U1); */
+  return V1;
+}
+
 
 /*
  *  See device rgamma function. This is probably not the