/*
Created by Zebulun Arendsee.
March 26, 2013

Modified by Will Landau.
June 30, 2013
will-landau.com
landau@iastate.edu

This program implements an MCMC algorithm for the following hierarchical
model:

    y_k     ~ Poisson(n_k * theta_k)     k = 1, ..., K
    theta_k ~ Gamma(a, b)
    a       ~ Unif(0, a0)
    b       ~ Unif(0, b0)

We let a0 and b0 be arbitrarily large.
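
A sketch of the full conditionals used below (Gamma(shape, rate)
parameterization; standard conjugacy results for this model):

    theta_k | a, b, y ~ Gamma(a + y_k, b + n_k)
    b | a, theta      ~ Gamma(K*a + 1, sum_k theta_k)   (b0 taken large)
    a | b, theta        has no closed form and is updated with a Metropolis step.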
Arguments:

1) input filename
     With two space delimited columns holding integer values for
     y and float values for n.
2) number of trials, i.e. MCMC iterations per chain (1000 by default)
3) number of blocks (64 by default; optional, must be supplied together
     with argument 4)
4) number of threads per block (1 by default)
Output: A comma delimited file containing a column for a, b, and each
theta. All output is written to stdout.

Example dataset:

$ head -3 data.txt
4 0.91643
23 3.23709
7 0.40103
Example of compilation and execution:

$ nvcc para_gibbs.cu -o gibbs
$ ./gibbs mydata.txt 2500 > output.csv
$

This code borrows from the NVIDIA developer zone documentation,
specifically http://docs.nvidia.com/cuda/curand/index.html#topic_1_2_1
*/
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <math.h>
#include <curand_kernel.h>
#include <thrust/reduce.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>

#define PI 3.14159265359f
#define THREADS_PER_BLOCK 64
#define CUDA_CALL(x) {if((x) != cudaSuccess){ \
  printf("CUDA error at %s:%d\n",__FILE__,__LINE__); \
  printf("  %s\n", cudaGetErrorString(cudaGetLastError())); \
  exit(EXIT_FAILURE);}}

#define CURAND_CALL(x) {if((x) != CURAND_STATUS_SUCCESS) { \
  printf("Error at %s:%d\n",__FILE__,__LINE__); \
  printf("  %s\n", cudaGetErrorString(cudaGetLastError())); \
  exit(EXIT_FAILURE);}}
__host__ void load_data(int argc, char **argv, int *K, int **y, float **n);

__device__ float sample_a(curandState *state, int id, float a, float b, int K, float sum_logs);
__device__ float sample_b(curandState *state, int id, float a, int K, float flat_sum);

__host__ float rnorm();
__device__ float rnorm(curandState *state, int id);
__host__ float rgamma(float a, float b);
__device__ float rgamma(curandState *state, int id, float a, float b);

__global__ void sample_theta(curandState *state, float *theta, float *log_theta,
                             int *y, float *n, float a, float b, int K);
__global__ void setup_kernel(curandState *state, unsigned int seed, int K);
__global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandState *state,
                                float *theta, float *log_theta,
                                float a, float b, float *dev_a_out, float *dev_b_out,
                                int trials);
__device__ void sample_theta_seq(float *theta, float *log_theta, int *y, float *n,
                                 float a, float b, int K, curandState *state);
__global__ void mergePosterior(int trials, float *dev_a_out, float *dev_b_out, int *tDot,
                               curandState *state, int nBlocks);
__global__ void sampleTdot(int trials, int *tDot, curandState *state);
int main(int argc, char **argv){

  curandState *devStates;
  float a, b, *n, *dev_n, *dev_theta, *dev_log_theta, *dev_a_out, *dev_b_out;
  int K, *y, *dev_y, nBlocks = 64, nThreads = 1, trials = 1000;

  if(argc > 4){
    trials = atoi(argv[2]);
    nBlocks = atoi(argv[3]);
    nThreads = atoi(argv[4]);
  }
  else if(argc > 2){
    trials = atoi(argv[2]);
  }

  load_data(argc, argv, &K, &y, &n);

  /* starting values of hyperparameters */
  a = 20;
  b = 1;
  /*------ Allocate memory -----------------------------------------*/

  CUDA_CALL(cudaMalloc((void **)&dev_y, K * sizeof(int)));
  CUDA_CALL(cudaMemcpy(dev_y, y, K * sizeof(int),
            cudaMemcpyHostToDevice));

  CUDA_CALL(cudaMalloc((void **)&dev_n, K * sizeof(float)));
  CUDA_CALL(cudaMemcpy(dev_n, n, K * sizeof(float),
            cudaMemcpyHostToDevice));

  /* Allocate space for theta and log_theta on the device */
  CUDA_CALL(cudaMalloc((void **)&dev_theta, K * sizeof(float)));
  CUDA_CALL(cudaMalloc((void **)&dev_log_theta, K * sizeof(float)));

  /* Allocate space for random states on the device */
  CUDA_CALL(cudaMalloc((void **)&devStates, K * sizeof(curandState)));

  /* Allocate device memory for the a and b draws from every chain */
  CUDA_CALL(cudaMalloc((void **)&dev_a_out, nBlocks*nThreads*trials*sizeof(float)));
  CUDA_CALL(cudaMalloc((void **)&dev_b_out, nBlocks*nThreads*trials*sizeof(float)));
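
  /* Layout assumption for the output arrays: one chain per thread, so
     nBlocks*nThreads chains in total, each writing a contiguous run of
     `trials` draws of a and b (chain id occupies indices
     id*trials, ..., (id+1)*trials - 1). */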
  /*------ Setup random number generators (one per thread) ---------*/

  //nBlocks = (K + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
  //nBlocks = 1;
  setup_kernel<<<nBlocks, THREADS_PER_BLOCK>>>(devStates, 0, K);
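
  /* Note: devStates holds K states and setup_kernel launches THREADS_PER_BLOCK
     threads per block, so the kernels below implicitly assume
     nBlocks*nThreads <= K and nThreads <= THREADS_PER_BLOCK; otherwise some
     threads would read uninitialized random states. */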
//printf("%f",&dev_a_out); | |
printf("alpha, beta\n"); | |
//seqMetroProcess<<<nBlocks,1>>>(K,nBlocks,dev_y,dev_n,devStates,dev_theta,dev_log_theta,a,b,trials); | |
seqMetroProcess<<<nBlocks,nThreads>>>(K,nBlocks,dev_y,dev_n,devStates,dev_theta,dev_log_theta,a,b,dev_a_out,dev_b_out,trials); | |
  int *tDot;
  CUDA_CALL(cudaMalloc((void **)&tDot, trials*sizeof(int)));

  float *h;
  CUDA_CALL(cudaMalloc((void **)&h, trials*sizeof(float)));

  sampleTdot<<<nBlocks,nThreads>>>(trials, tDot, devStates);
  mergePosterior<<<1,1>>>(trials, dev_a_out, dev_b_out, tDot, devStates, nBlocks);
  /* Wait for the kernels to finish so device-side printf output is flushed
     before the program exits. */
  CUDA_CALL(cudaDeviceSynchronize());

  /*------ Free Memory -------------------------------------------*/

  free(y);
  free(n);
  CUDA_CALL(cudaFree(devStates));
  CUDA_CALL(cudaFree(dev_theta));
  CUDA_CALL(cudaFree(dev_log_theta));
  CUDA_CALL(cudaFree(dev_y));
  CUDA_CALL(cudaFree(dev_n));
  CUDA_CALL(cudaFree(dev_a_out));
  CUDA_CALL(cudaFree(dev_b_out));
  CUDA_CALL(cudaFree(tDot));
  CUDA_CALL(cudaFree(h));

  return EXIT_SUCCESS;
}
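
/*
 * One draw per thread: sample an iteration index uniformly from
 * {1, ..., trials-1} and store it in tDot. These indices are later read by
 * mergePosterior; they appear intended as a resampling step for combining
 * the per-chain posteriors.
 */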
__global__ void sampleTdot(int trials, int *tDot, curandState *state){
  int id = threadIdx.x + blockIdx.x * blockDim.x;
  int u = (curand(&state[id]) % (trials-1)) + 1;
  //printf("thread %d sample: %d",id,u);
  tDot[id] = u;
}
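
/*
 * Single-thread kernel intended to combine the per-chain posteriors once all
 * chains have finished. As currently written it only prints the first
 * 2*nBlocks entries of tDot; the commented-out block below prints the stored
 * a and b draws instead. The actual merge is not implemented yet.
 */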
__global__ void mergePosterior(int trials, float *dev_a_out, float *dev_b_out, int *tDot, curandState *state, int nBlocks){

  /* printf("\n all blocks finished\n");
  for(int j = 0; j < trials ; j++) {
    printf(" %f ", *dev_a_out);
    printf(" %f \n",*dev_b_out);
    dev_a_out++;
    dev_b_out++;
  }
  */

  printf("\n all blocks finished\n");
  for(int j = 0; j < nBlocks*2 ; j++) {
    // printf(" %f ", *dev_a_out);
    printf(" %d \n",*tDot);
    tDot++;
    //dev_b_out++;
  }
}
/*
 * Sample each theta from the appropriate gamma distribution
 */
__device__ void sample_theta_seq(float *theta, float *log_theta, int *y, float *n,
                                 float a, float b, int K, curandState *state){
  /* Use the calling thread's own RNG state so that concurrent chains do not
     share (and race on) the same curandState. */
  int id = threadIdx.x + blockIdx.x * blockDim.x;
  float hyperA, hyperB;

  for ( int i = 0; i < K; i++ ){
    hyperA = a + y[i];
    hyperB = b + n[i];
    theta[i] = rgamma(state, id, hyperA, hyperB);
    log_theta[i] = log(theta[i]);
  }
}
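
/*
 * Run one independent MCMC chain per thread. Each chain works on its own
 * contiguous shard of the data (of length K divided by the total number of
 * threads) and writes its `trials` draws of a and b into its own segment of
 * dev_a_out and dev_b_out.
 */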
__global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandState *state,
                                float *theta, float *log_theta,
                                float a, float b, float *dev_a_out, float *dev_b_out, int trials){
  /*------ MCMC ----------------------------------------------------*/
  int i;
  //printf("K: %d \n ",K);
  int id = threadIdx.x + blockIdx.x * blockDim.x;
  int nTasks = nBlocks*blockDim.x;

  /* Partition the data: each chain (thread) gets a contiguous shard of
     length K/nTasks. This assumes K is divisible by the number of chains. */
  //int start = blockIdx.x * K/nBlocks;
  int start = id * K/nTasks;
  int lengthPerBlock = K/nTasks;

  int *yy;
  yy = y+start;
  float *nn;
  nn = n+start;
  float *sTheta;
  sTheta = theta+start;
  float *sLogTheta;
  sLogTheta = log_theta+start;

  /*
  printf("length per block:%d\n",lengthPerBlock);
  printf("start is:%d\n",start);
  printf("partial data under block id: %d \n ",blockIdx.x);
  for(int j = 0; j < lengthPerBlock ; j++) {
    printf(" %d \n ", *yy);
    yy++;
  }
  */

  /* Steps of MCMC */

  /* Each chain writes its `trials` draws of a and b into its own contiguous
     segment of the output arrays, so chains do not overwrite one another. */
  int rstart = id*trials;
  int rlenPerTask = trials;
  float *subA;
  subA = dev_a_out+rstart;
  float *subB;
  subB = dev_b_out+rstart;

  for(i = 0; i < rlenPerTask; i++){

    //sample_theta<<<nBlocks, THREADS_PER_BLOCK>>>(devStates, dev_theta, dev_log_theta, dev_y, dev_n, a, b, K);
    sample_theta_seq(sTheta, sLogTheta, yy, nn, a, b, lengthPerBlock, state);

    /* Sum theta and log(theta) over this chain's shard. Indexed access keeps
       the shard base pointers fixed across MCMC iterations. */
    float flat_sum = 0;
    float sum_logs = 0;
    for(int ii = 0; ii < lengthPerBlock; ii++){
      flat_sum += sTheta[ii];
      sum_logs += sLogTheta[ii];
    }
    //printf("sum of theta is %f\n",flat_sum);
    //printf("sum of logtheta is %f\n",sum_logs);

    /* Sample hyperparameters. */
    a = sample_a(state, id, a, b, lengthPerBlock, sum_logs);
    b = sample_b(state, id, a, lengthPerBlock, flat_sum);

    /* print hyperparameters. */
    //printf("%f, %f\n", a, b);

    /* Save the new draws of a and b to the global output arrays. */
    *subA = a;
    subA++;
    *subB = b;
    subB++;
  }
}
/*
 * Read in data.
 */
__host__ void load_data(int argc, char **argv, int *K, int **y, float **n){
  int k;
  char line[128];
  FILE *fp;

  if(argc > 1){
    fp = fopen(argv[1], "r");
  } else {
    printf("Please provide input filename\n");
    exit(EXIT_FAILURE);
  }

  if(fp == NULL){
    printf("Cannot read file \n");
    exit(EXIT_FAILURE);
  }

  *K = 0;
  while( fgets (line, sizeof line, fp) != NULL )
    (*K)++;

  rewind(fp);

  *y = (int*) malloc((*K) * sizeof(int));
  *n = (float*) malloc((*K) * sizeof(float));

  for(k = 0; k < *K; k++)
    fscanf(fp, "%d %f", *y + k, *n + k);

  fclose(fp);
}
/*
 * Metropolis algorithm for producing random a values.
 * The proposal distribution is normal with a variance that
 * is adjusted at each step.
 */
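/*
 * A sketch of where the log acceptance ratio below comes from, assuming the
 * Gamma(shape a, rate b) parameterization used elsewhere in this file and a
 * flat prior on a: the part of the likelihood that depends on a is
 *
 *     prod_k b^a * theta_k^(a-1) / Gamma(a)
 *       = exp( a * sum_k log(theta_k) + K*a*log(b) - K*lgamma(a) ) * const,
 *
 * so for a proposal a' the log Metropolis ratio is
 *
 *     (a' - a)*sum_logs + K*(a' - a)*log(b) - K*(lgamma(a') - lgamma(a)).
 */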
__device__ float sample_a(curandState *state, int id, float a, float b, int K, float sum_logs){

  /* Note: sigma is local to this call, so the scaling by 1.1 below does not
     persist across MCMC iterations. */
  float sigma = 2.0;
  float norm = curand_normal(&state[id]);
  float U, log_acceptance_ratio, proposal = norm * sigma + a;
  //printf("rnorm result:%f \n",norm*sigma);

  if(proposal <= 0)
    return a;

  log_acceptance_ratio = (proposal - a) * sum_logs +
                         K * (proposal - a) * log(b) -
                         K * (lgamma(proposal) - lgamma(a));

  /* The Metropolis acceptance draw must be Uniform(0,1), not normal,
     so that log(U) is well defined. */
  U = curand_uniform(&state[id]);
  //printf("log_acceptance result:%f \n",log_acceptance_ratio);
  //printf("log U result:%f \n",log(U));

  if(log(U) < log_acceptance_ratio){
    sigma *= 1.1;
    return proposal;
  } else {
    sigma /= 1.1;
    return a;
  }
}
/*
 * Sample b from a gamma distribution.
 */
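/*
 * A sketch of the reasoning, assuming the flat Unif(0, b0) prior with b0
 * large: the part of the likelihood that depends on b is
 *
 *     prod_k b^a * exp(-b * theta_k) = b^(K*a) * exp(-b * sum_k theta_k),
 *
 * which as a density in b is Gamma(K*a + 1, sum_k theta_k). That is exactly
 * the distribution sample_b draws from.
 */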
__device__ float sample_b(curandState *state, int id, float a, int K, float flat_sum){
  float hyperA = K * a + 1;
  float hyperB = flat_sum;
  return rgamma(state, id, hyperA, hyperB);
}
/*
 * Box-Muller Transformation: Generate one standard normal variable.
 *
 * This algorithm can be more efficiently used by producing two
 * random normal variables. However, for the CPU, much faster
 * algorithms are possible (e.g. the Ziggurat Algorithm).
 *
 * This is actually the algorithm chosen by NVIDIA to calculate
 * normal random variables on the GPU.
 */
__host__ float rnorm(){
  float U1 = rand() / float(RAND_MAX);
  float U2 = rand() / float(RAND_MAX);
  /* printf("host U1 result:%f \n",U1); printf("host U2 result:%f \n",U2);
     printf("host RANDMAX result:%f \n",RAND_MAX);
  */
  float V1 = sqrt(-2 * log(U1)) * cos(2 * PI * U2);
  /* float V2 = sqrt(-2 * log(U2)) * cos(2 * PI * U1); */
  return V1;
}
__device__ float rnorm(curandState *state, int id){
  float U1 = curand_uniform(&state[id]);
  /* printf("U1 result:%f \n",U1);
     printf("RANDMAX result:%f \n",RAND_MAX);
  */
  float U2 = curand_uniform(&state[id]);
  //printf("U2 result:%f \n",U2);
  float V1 = sqrt(-2 * log(U1)) * cos(2 * PI * U2);
  /* float V2 = sqrt(-2 * log(U2)) * cos(2 * PI * U1); */
  return V1;
}
/*
 * See the device rgamma function. This is probably not the
 * fastest way to generate random gamma variables on a CPU.
 */
__host__ float rgamma(float a, float b){
  float d = a - 1.0 / 3;
  float Y, U, v;

  while(1){
    Y = rnorm();
    v = pow((1 + Y / sqrt(9 * d)), 3);

    // Necessary to avoid taking the log of a negative number later.
    if(v <= 0)
      continue;

    U = rand() / float(RAND_MAX);

    // Accept the sample under the following condition.
    // Otherwise repeat loop.
    if(log(U) < 0.5 * pow(Y,2) + d * (1 - v + log(v)))
      return d * v / b;
  }
}
/*
 * Generate a single Gamma distributed random variable by the Marsaglia-Tsang
 * algorithm (George Marsaglia and Wai Wan Tsang, 2000).
 *
 * Zeb chose this algorithm because it has a very high acceptance rate (>96%),
 * so this while loop will usually only need to run a few times. Many other
 * algorithms, while perhaps faster on a CPU, have acceptance rates on the
 * order of 50% (very bad in a massively parallel context).
 */
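/*
 * Note: the Marsaglia-Tsang transformation as written assumes shape a >= 1.
 * For a < 1 the usual remedy is to draw from Gamma(a + 1, b) and multiply by
 * U^(1/a) for an independent Uniform(0,1) variable U; that boost is not
 * implemented here.
 */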
__device__ float rgamma(curandState *state, int id, float a, float b){
  float d = a - 1.0 / 3;
  float Y, U, v;

  while(1){
    Y = curand_normal(&state[id]);
    v = pow((1 + Y / sqrt(9 * d)), 3);

    /* Necessary to avoid taking the log of a negative number later. */
    if(v <= 0)
      continue;

    U = curand_uniform(&state[id]);

    /* Accept the sample under the following condition.
       Otherwise repeat loop. */
    if(log(U) < 0.5 * pow(Y,2) + d * (1 - v + log(v)))
      return d * v / b;
  }
}
/*
 * Sample each theta from the appropriate gamma distribution
 */
__global__ void sample_theta(curandState *state,
                             float *theta, float *log_theta, int *y, float *n,
                             float a, float b, int K){

  int id = threadIdx.x + blockIdx.x * blockDim.x;
  float hyperA, hyperB;

  if(id < K){
    hyperA = a + y[id];
    hyperB = b + n[id];
    theta[id] = rgamma(state, id, hyperA, hyperB);
    log_theta[id] = log(theta[id]);
  }
}
/*
 * Initialize GPU random number generators
 */
__global__ void setup_kernel(curandState *state, unsigned int seed, int K){
  int id = threadIdx.x + blockIdx.x * blockDim.x;
  if(id < K)
    curand_init(seed, id, 0, &state[id]);
}