Skip to content
Permalink
Browse files

A version that could run! lol

  • Loading branch information
yuz12012 committed Apr 20, 2017
1 parent 81d0a83 commit fd678a1ce31fa07e6dbcee8d7b34758ac7f87342
Showing with 49 additions and 26 deletions.
  1. +49 −26 parallel/para_gibbs.cu
@@ -67,10 +67,10 @@ specifically http://docs.nvidia.com/cuda/curand/index.html#topic_1_2_1

__host__ void load_data(int argc, char **argv, int *K, int **y, float **n);

__host__ float sample_a(float a, float b, int K, float sum_logs);
__host__ float sample_b(float a, int K, float flat_sum);

__device__ float sample_a(curandState *state, float a, float b, int K, float sum_logs);
__device__ float sample_b(curandState *state, float a, int K, float flat_sum);
__host__ float rnorm();
__device__ float rnorm(curandState *state );
__host__ float rgamma(float a, float b);

__device__ float rgamma(curandState *state, int id, float a, float b);
@@ -89,8 +89,8 @@ __device__ void sample_theta_seq(float *theta, float *log_theta, int *y, float *
int main(int argc, char **argv){

curandState *devStates;
float a, b, flat_sum, sum_logs, *n, *dev_n, *dev_theta, *dev_log_theta;
int i, K, *y, *dev_y, nBlocks, trials = 1000;
float a, b, *n, *dev_n, *dev_theta, *dev_log_theta;
int K, *y, *dev_y, nBlocks, trials = 1000;

if(argc > 2)
trials = atoi(argv[2]);
@@ -167,14 +167,19 @@ __global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandStat
int start = blockIdx.x * K/nBlocks;
int lengthPerBlock = K/nBlocks;
//partition the data
int *yy = &y[start];
float *nn = &n[start];
float *sTheta = &theta[start];
float *sLogTheta = &log_theta[start];
int *yy;
yy = &y[start];
float *nn;
nn = &n[start];

float *sTheta;
sTheta = &theta[start];
float *sLogTheta;
sLogTheta = &log_theta[start];

printf("block id:%d\n",blockIdx.x);
for(int j = 0; j < lengthPerBlock ; j++) {
printf("%d ", yy[j]);
printf("%d\n ", yy[j]);
}
printf("alpha, beta\n");

@@ -185,18 +190,26 @@ __global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandStat

for(i = 0; i < trials; i++){
//sample_theta<<<nBlocks, THREADS_PER_BLOCK>>>(devStates, dev_theta, dev_log_theta, dev_y, dev_n, a, b, K);
sample_theta_seq(sTheta, sLogTheta, yy, nn, a, b, K, state);
/* Make iterators for thetas and log thetas. */
// thrust::device_ptr<float> theta(dev_theta);
// thrust::device_ptr<float> log_theta(dev_log_theta);
sample_theta_seq(sTheta, sLogTheta, yy, nn, a, b, lengthPerBlock, state);

/* Compute pairwise sums of thetas and log_thetas. */
// flat_sum = thrust::reduce(theta, theta + K);
// sum_logs = thrust::reduce(log_theta, log_theta + K);

float flat_sum=0;
for(int ii=0;ii<lengthPerBlock;ii++){
//*ptr refers to the value at address
flat_sum = flat_sum + *sTheta;
sTheta++;
}

float sum_logs=0;
for(int ii=0;ii<lengthPerBlock;ii++){
//*ptr refers to the value at address
sum_logs = sum_logs + *sLogTheta;
sLogTheta++;
}


/* Sample hyperparameters. */
// a = sample_a(a, b, K, sum_logs);
// b = sample_b(a, K, flat_sum);
a = sample_a(state, a, b, lengthPerBlock, sum_logs);
b = sample_b(state, a, lengthPerBlock, flat_sum);

/* print hyperparameters. */
printf("%f, %f\n", a, b);
@@ -247,10 +260,10 @@ __host__ void load_data(int argc, char **argv, int *K, int **y, float **n){
* is adjusted at each step.
*/

__host__ float sample_a(float a, float b, int K, float sum_logs){
__device__ float sample_a(curandState *state, float a, float b, int K, float sum_logs){

static float sigma = 2;
float U, log_acceptance_ratio, proposal = rnorm() * sigma + a;
float U, log_acceptance_ratio, proposal = rnorm(state) * sigma + a;

if(proposal <= 0)
return a;
@@ -259,7 +272,7 @@ __host__ float sample_a(float a, float b, int K, float sum_logs){
K * (proposal - a) * log(b) -
K * (lgamma(proposal) - lgamma(a));

U = rand() / float(RAND_MAX);
U = curand(state) / float(RAND_MAX);

if(log(U) < log_acceptance_ratio){
sigma *= 1.1;
@@ -275,11 +288,11 @@ __host__ float sample_a(float a, float b, int K, float sum_logs){
* Sample b from a gamma distribution.
*/

__host__ float sample_b(float a, int K, float flat_sum){
__device__ float sample_b(curandState *state, float a, int K, float flat_sum){

float hyperA = K * a + 1;
float hyperB = flat_sum;
return rgamma(hyperA, hyperB);
return rgamma(state,0,hyperA, hyperB);
}


@@ -293,7 +306,7 @@ __host__ float sample_b(float a, int K, float flat_sum){
* This is actually the algorithm chosen by NVIDIA to calculate
* normal random variables on the GPU.
*/

__host__ float rnorm(){

float U1 = rand() / float(RAND_MAX);
@@ -303,6 +316,16 @@ __host__ float rnorm(){
return V1;
}


__device__ float rnorm(curandState *state){

float U1 = curand(state) / float(RAND_MAX);
float U2 = curand(state) / float(RAND_MAX);
float V1 = sqrt(-2 * log(U1)) * cos(2 * PI * U2);
/* float V2 = sqrt(-2 * log(U2)) * cos(2 * PI * U1); */
return V1;
}


/*
* See device rgamma function. This is probably not the

0 comments on commit fd678a1

Please sign in to comment.
You can’t perform that action at this time.