/*
Created by Zebulun Arendsee.
March 26, 2013

Modified by Will Landau.
June 30, 2013
will-landau.com
landau@iastate.edu

This program implements an MCMC algorithm for the following hierarchical
model:

    y_k     ~ Poisson(n_k * theta_k)     k = 1, ..., K
    theta_k ~ Gamma(a, b)
    a       ~ Unif(0, a0)
    b       ~ Unif(0, b0)

We let a0 and b0 be arbitrarily large.
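
A sketch of the full conditionals used below (Gamma(shape, rate)
parameterization; standard conjugacy results for this model):

    theta_k | a, b, y ~ Gamma(a + y_k, b + n_k)
    b | a, theta      ~ Gamma(K*a + 1, sum_k theta_k)   (b0 taken large)
    a | b, theta        has no closed form and is updated with a Metropolis step.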
Arguments:

1) input filename
     With two space delimited columns holding integer values for
     y and float values for n.
2) number of trials, i.e. MCMC iterations per chain (1000 by default)
3) number of blocks (64 by default; optional, must be supplied together
     with argument 4)
4) number of threads per block (1 by default)
Output: A comma delimited file containing a column for a, b, and each
theta. All output is written to stdout.

Example dataset:

$ head -3 data.txt
4 0.91643
23 3.23709
7 0.40103
Example of compilation and execution:

$ nvcc para_gibbs.cu -o gibbs
$ ./gibbs mydata.txt 2500 > output.csv
$

This code borrows from the NVIDIA developer zone documentation,
specifically http://docs.nvidia.com/cuda/curand/index.html#topic_1_2_1
*/
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <math.h>
#include <curand_kernel.h>
#include <thrust/reduce.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>

#define PI 3.14159265359f
#define THREADS_PER_BLOCK 64
#define CUDA_CALL(x) {if((x) != cudaSuccess){ \
  printf("CUDA error at %s:%d\n",__FILE__,__LINE__); \
  printf("  %s\n", cudaGetErrorString(cudaGetLastError())); \
  exit(EXIT_FAILURE);}}

#define CURAND_CALL(x) {if((x) != CURAND_STATUS_SUCCESS) { \
  printf("Error at %s:%d\n",__FILE__,__LINE__); \
  printf("  %s\n", cudaGetErrorString(cudaGetLastError())); \
  exit(EXIT_FAILURE);}}
__host__ void load_data(int argc, char **argv, int *K, int **y, float **n);

__device__ float sample_a(curandState *state, int id, float a, float b, int K, float sum_logs);
__device__ float sample_b(curandState *state, int id, float a, int K, float flat_sum);

__host__ float rnorm();
__device__ float rnorm(curandState *state, int id);
__host__ float rgamma(float a, float b);
__device__ float rgamma(curandState *state, int id, float a, float b);

__global__ void sample_theta(curandState *state, float *theta, float *log_theta,
                             int *y, float *n, float a, float b, int K);
__global__ void setup_kernel(curandState *state, unsigned int seed, int K);
__global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandState *state,
                                float *theta, float *log_theta,
                                float a, float b, float *dev_a_out, float *dev_b_out,
                                int trials);
__device__ void sample_theta_seq(float *theta, float *log_theta, int *y, float *n,
                                 float a, float b, int K, curandState *state);
__global__ void mergePosterior(int trials, float *dev_a_out, float *dev_b_out, int *tDot,
                               curandState *state, int nBlocks);
__global__ void sampleTdot(int trials, int *tDot, curandState *state);
int main(int argc, char **argv){

  curandState *devStates;
  float a, b, *n, *dev_n, *dev_theta, *dev_log_theta, *dev_a_out, *dev_b_out;
  int K, *y, *dev_y, nBlocks = 64, nThreads = 1, trials = 1000;

  if(argc > 4){
    trials = atoi(argv[2]);
    nBlocks = atoi(argv[3]);
    nThreads = atoi(argv[4]);
  }
  else if(argc > 2){
    trials = atoi(argv[2]);
  }

  load_data(argc, argv, &K, &y, &n);

  /* starting values of hyperparameters */
  a = 20;
  b = 1;
  /*------ Allocate memory -----------------------------------------*/

  CUDA_CALL(cudaMalloc((void **)&dev_y, K * sizeof(int)));
  CUDA_CALL(cudaMemcpy(dev_y, y, K * sizeof(int),
            cudaMemcpyHostToDevice));

  CUDA_CALL(cudaMalloc((void **)&dev_n, K * sizeof(float)));
  CUDA_CALL(cudaMemcpy(dev_n, n, K * sizeof(float),
            cudaMemcpyHostToDevice));

  /* Allocate space for theta and log_theta on the device */
  CUDA_CALL(cudaMalloc((void **)&dev_theta, K * sizeof(float)));
  CUDA_CALL(cudaMalloc((void **)&dev_log_theta, K * sizeof(float)));

  /* Allocate space for random states on the device */
  CUDA_CALL(cudaMalloc((void **)&devStates, K * sizeof(curandState)));

  /* Allocate device memory for the a and b draws from every chain */
  CUDA_CALL(cudaMalloc((void **)&dev_a_out, nBlocks*nThreads*trials*sizeof(float)));
  CUDA_CALL(cudaMalloc((void **)&dev_b_out, nBlocks*nThreads*trials*sizeof(float)));
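
  /* Layout assumption for the output arrays: one chain per thread, so
     nBlocks*nThreads chains in total, each writing a contiguous run of
     `trials` draws of a and b (chain id occupies indices
     id*trials, ..., (id+1)*trials - 1). */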
  /*------ Setup random number generators (one per thread) ---------*/

  //nBlocks = (K + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
  //nBlocks = 1;
  setup_kernel<<<nBlocks, THREADS_PER_BLOCK>>>(devStates, 0, K);
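
  /* Note: devStates holds K states and setup_kernel launches THREADS_PER_BLOCK
     threads per block, so the kernels below implicitly assume
     nBlocks*nThreads <= K and nThreads <= THREADS_PER_BLOCK; otherwise some
     threads would read uninitialized random states. */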
//printf("%f",&dev_a_out); | |
printf("alpha, beta\n"); | |
//seqMetroProcess<<<nBlocks,1>>>(K,nBlocks,dev_y,dev_n,devStates,dev_theta,dev_log_theta,a,b,trials); | |
seqMetroProcess<<<nBlocks,nThreads>>>(K,nBlocks,dev_y,dev_n,devStates,dev_theta,dev_log_theta,a,b,dev_a_out,dev_b_out,trials); | |
  int *tDot;
  CUDA_CALL(cudaMalloc((void **)&tDot, trials*sizeof(int)));

  float *h;
  CUDA_CALL(cudaMalloc((void **)&h, trials*sizeof(float)));

  sampleTdot<<<nBlocks,nThreads>>>(trials, tDot, devStates);
  mergePosterior<<<1,1>>>(trials, dev_a_out, dev_b_out, tDot, devStates, nBlocks);
  /* Wait for the kernels to finish so device-side printf output is flushed
     before the program exits. */
  CUDA_CALL(cudaDeviceSynchronize());

  /*------ Free Memory -------------------------------------------*/

  free(y);
  free(n);
  CUDA_CALL(cudaFree(devStates));
  CUDA_CALL(cudaFree(dev_theta));
  CUDA_CALL(cudaFree(dev_log_theta));
  CUDA_CALL(cudaFree(dev_y));
  CUDA_CALL(cudaFree(dev_n));
  CUDA_CALL(cudaFree(dev_a_out));
  CUDA_CALL(cudaFree(dev_b_out));
  CUDA_CALL(cudaFree(tDot));
  CUDA_CALL(cudaFree(h));

  return EXIT_SUCCESS;
}
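
/*
 * One draw per thread: sample an iteration index uniformly from
 * {1, ..., trials-1} and store it in tDot. These indices are later read by
 * mergePosterior; they appear intended as a resampling step for combining
 * the per-chain posteriors.
 */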
__global__ void sampleTdot(int trials, int *tDot, curandState *state){
  int id = threadIdx.x + blockIdx.x * blockDim.x;
  int u = (curand(&state[id]) % (trials-1)) + 1;
  //printf("thread %d sample: %d",id,u);
  tDot[id] = u;
}
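
/*
 * Single-thread kernel intended to combine the per-chain posteriors once all
 * chains have finished. As currently written it only prints the first
 * 2*nBlocks entries of tDot; the commented-out block below prints the stored
 * a and b draws instead. The actual merge is not implemented yet.
 */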
__global__ void mergePosterior(int trials, float *dev_a_out, float *dev_b_out, int *tDot, curandState *state, int nBlocks){

  /* printf("\n all blocks finished\n");
  for(int j = 0; j < trials ; j++) {
    printf(" %f ", *dev_a_out);
    printf(" %f \n",*dev_b_out);
    dev_a_out++;
    dev_b_out++;
  }
  */

  printf("\n all blocks finished\n");
  for(int j = 0; j < nBlocks*2 ; j++) {
    // printf(" %f ", *dev_a_out);
    printf(" %d \n",*tDot);
    tDot++;
    //dev_b_out++;
  }
}
/*
 * Sample each theta from the appropriate gamma distribution
 */
__device__ void sample_theta_seq(float *theta, float *log_theta, int *y, float *n,
                                 float a, float b, int K, curandState *state){
  /* Use the calling thread's own RNG state so that concurrent chains do not
     share (and race on) the same curandState. */
  int id = threadIdx.x + blockIdx.x * blockDim.x;
  float hyperA, hyperB;

  for ( int i = 0; i < K; i++ ){
    hyperA = a + y[i];
    hyperB = b + n[i];
    theta[i] = rgamma(state, id, hyperA, hyperB);
    log_theta[i] = log(theta[i]);
  }
}
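
/*
 * Run one independent MCMC chain per thread. Each chain works on its own
 * contiguous shard of the data (of length K divided by the total number of
 * threads) and writes its `trials` draws of a and b into its own segment of
 * dev_a_out and dev_b_out.
 */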
__global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandState *state,
                                float *theta, float *log_theta,
                                float a, float b, float *dev_a_out, float *dev_b_out, int trials){
  /*------ MCMC ----------------------------------------------------*/
  int i;
  //printf("K: %d \n ",K);
  int id = threadIdx.x + blockIdx.x * blockDim.x;
  int nTasks = nBlocks*blockDim.x;

  /* Partition the data: each chain (thread) gets a contiguous shard of
     length K/nTasks. This assumes K is divisible by the number of chains. */
  //int start = blockIdx.x * K/nBlocks;
  int start = id * K/nTasks;
  int lengthPerBlock = K/nTasks;

  int *yy;
  yy = y+start;
  float *nn;
  nn = n+start;
  float *sTheta;
  sTheta = theta+start;
  float *sLogTheta;
  sLogTheta = log_theta+start;

  /*
  printf("length per block:%d\n",lengthPerBlock);
  printf("start is:%d\n",start);
  printf("partial data under block id: %d \n ",blockIdx.x);
  for(int j = 0; j < lengthPerBlock ; j++) {
    printf(" %d \n ", *yy);
    yy++;
  }
  */

  /* Steps of MCMC */

  /* Each chain writes its `trials` draws of a and b into its own contiguous
     segment of the output arrays, so chains do not overwrite one another. */
  int rstart = id*trials;
  int rlenPerTask = trials;
  float *subA;
  subA = dev_a_out+rstart;
  float *subB;
  subB = dev_b_out+rstart;

  for(i = 0; i < rlenPerTask; i++){

    //sample_theta<<<nBlocks, THREADS_PER_BLOCK>>>(devStates, dev_theta, dev_log_theta, dev_y, dev_n, a, b, K);
    sample_theta_seq(sTheta, sLogTheta, yy, nn, a, b, lengthPerBlock, state);

    /* Sum theta and log(theta) over this chain's shard. Indexed access keeps
       the shard base pointers fixed across MCMC iterations. */
    float flat_sum = 0;
    float sum_logs = 0;
    for(int ii = 0; ii < lengthPerBlock; ii++){
      flat_sum += sTheta[ii];
      sum_logs += sLogTheta[ii];
    }
    //printf("sum of theta is %f\n",flat_sum);
    //printf("sum of logtheta is %f\n",sum_logs);

    /* Sample hyperparameters. */
    a = sample_a(state, id, a, b, lengthPerBlock, sum_logs);
    b = sample_b(state, id, a, lengthPerBlock, flat_sum);

    /* print hyperparameters. */
    //printf("%f, %f\n", a, b);

    /* Save the new draws of a and b to the global output arrays. */
    *subA = a;
    subA++;
    *subB = b;
    subB++;
  }
}
/*
 * Read in data.
 */
__host__ void load_data(int argc, char **argv, int *K, int **y, float **n){
  int k;
  char line[128];
  FILE *fp;

  if(argc > 1){
    fp = fopen(argv[1], "r");
  } else {
    printf("Please provide input filename\n");
    exit(EXIT_FAILURE);
  }

  if(fp == NULL){
    printf("Cannot read file \n");
    exit(EXIT_FAILURE);
  }

  *K = 0;
  while( fgets (line, sizeof line, fp) != NULL )
    (*K)++;

  rewind(fp);

  *y = (int*) malloc((*K) * sizeof(int));
  *n = (float*) malloc((*K) * sizeof(float));

  for(k = 0; k < *K; k++)
    fscanf(fp, "%d %f", *y + k, *n + k);

  fclose(fp);
}
/*
 * Metropolis algorithm for producing random a values.
 * The proposal distribution is normal with a variance that
 * is adjusted at each step.
 */
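/*
 * A sketch of where the log acceptance ratio below comes from, assuming the
 * Gamma(shape a, rate b) parameterization used elsewhere in this file and a
 * flat prior on a: the part of the likelihood that depends on a is
 *
 *     prod_k b^a * theta_k^(a-1) / Gamma(a)
 *       = exp( a * sum_k log(theta_k) + K*a*log(b) - K*lgamma(a) ) * const,
 *
 * so for a proposal a' the log Metropolis ratio is
 *
 *     (a' - a)*sum_logs + K*(a' - a)*log(b) - K*(lgamma(a') - lgamma(a)).
 */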
__device__ float sample_a(curandState *state, int id, float a, float b, int K, float sum_logs){

  /* Note: sigma is local to this call, so the scaling by 1.1 below does not
     persist across MCMC iterations. */
  float sigma = 2.0;
  float norm = curand_normal(&state[id]);
  float U, log_acceptance_ratio, proposal = norm * sigma + a;
  //printf("rnorm result:%f \n",norm*sigma);

  if(proposal <= 0)
    return a;

  log_acceptance_ratio = (proposal - a) * sum_logs +
                         K * (proposal - a) * log(b) -
                         K * (lgamma(proposal) - lgamma(a));

  /* The Metropolis acceptance draw must be Uniform(0,1), not normal,
     so that log(U) is well defined. */
  U = curand_uniform(&state[id]);
  //printf("log_acceptance result:%f \n",log_acceptance_ratio);
  //printf("log U result:%f \n",log(U));

  if(log(U) < log_acceptance_ratio){
    sigma *= 1.1;
    return proposal;
  } else {
    sigma /= 1.1;
    return a;
  }
}
/*
 * Sample b from a gamma distribution.
 */
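/*
 * A sketch of the reasoning, assuming the flat Unif(0, b0) prior with b0
 * large: the part of the likelihood that depends on b is
 *
 *     prod_k b^a * exp(-b * theta_k) = b^(K*a) * exp(-b * sum_k theta_k),
 *
 * which as a density in b is Gamma(K*a + 1, sum_k theta_k). That is exactly
 * the distribution sample_b draws from.
 */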
__device__ float sample_b(curandState *state, int id, float a, int K, float flat_sum){
  float hyperA = K * a + 1;
  float hyperB = flat_sum;
  return rgamma(state, id, hyperA, hyperB);
}
/*
 * Box-Muller Transformation: Generate one standard normal variable.
 *
 * This algorithm can be more efficiently used by producing two
 * random normal variables. However, for the CPU, much faster
 * algorithms are possible (e.g. the Ziggurat Algorithm).
 *
 * This is actually the algorithm chosen by NVIDIA to calculate
 * normal random variables on the GPU.
 */
__host__ float rnorm(){
  float U1 = rand() / float(RAND_MAX);
  float U2 = rand() / float(RAND_MAX);
  /* printf("host U1 result:%f \n",U1); printf("host U2 result:%f \n",U2);
     printf("host RANDMAX result:%f \n",RAND_MAX);
  */
  float V1 = sqrt(-2 * log(U1)) * cos(2 * PI * U2);
  /* float V2 = sqrt(-2 * log(U2)) * cos(2 * PI * U1); */
  return V1;
}
__device__ float rnorm(curandState *state, int id){
  float U1 = curand_uniform(&state[id]);
  /* printf("U1 result:%f \n",U1);
     printf("RANDMAX result:%f \n",RAND_MAX);
  */
  float U2 = curand_uniform(&state[id]);
  //printf("U2 result:%f \n",U2);
  float V1 = sqrt(-2 * log(U1)) * cos(2 * PI * U2);
  /* float V2 = sqrt(-2 * log(U2)) * cos(2 * PI * U1); */
  return V1;
}
/*
 * See the device rgamma function. This is probably not the
 * fastest way to generate random gamma variables on a CPU.
 */
__host__ float rgamma(float a, float b){
  float d = a - 1.0 / 3;
  float Y, U, v;

  while(1){
    Y = rnorm();
    v = pow((1 + Y / sqrt(9 * d)), 3);

    // Necessary to avoid taking the log of a negative number later.
    if(v <= 0)
      continue;

    U = rand() / float(RAND_MAX);

    // Accept the sample under the following condition.
    // Otherwise repeat loop.
    if(log(U) < 0.5 * pow(Y,2) + d * (1 - v + log(v)))
      return d * v / b;
  }
}
/*
 * Generate a single Gamma distributed random variable by the Marsaglia-Tsang
 * algorithm (George Marsaglia and Wai Wan Tsang, 2000).
 *
 * Zeb chose this algorithm because it has a very high acceptance rate (>96%),
 * so this while loop will usually only need to run a few times. Many other
 * algorithms, while perhaps faster on a CPU, have acceptance rates on the
 * order of 50% (very bad in a massively parallel context).
 */
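/*
 * Note: the Marsaglia-Tsang transformation as written assumes shape a >= 1.
 * For a < 1 the usual remedy is to draw from Gamma(a + 1, b) and multiply by
 * U^(1/a) for an independent Uniform(0,1) variable U; that boost is not
 * implemented here.
 */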
__device__ float rgamma(curandState *state, int id, float a, float b){
  float d = a - 1.0 / 3;
  float Y, U, v;

  while(1){
    Y = curand_normal(&state[id]);
    v = pow((1 + Y / sqrt(9 * d)), 3);

    /* Necessary to avoid taking the log of a negative number later. */
    if(v <= 0)
      continue;

    U = curand_uniform(&state[id]);

    /* Accept the sample under the following condition.
       Otherwise repeat loop. */
    if(log(U) < 0.5 * pow(Y,2) + d * (1 - v + log(v)))
      return d * v / b;
  }
}
/*
 * Sample each theta from the appropriate gamma distribution
 */
__global__ void sample_theta(curandState *state,
                             float *theta, float *log_theta, int *y, float *n,
                             float a, float b, int K){

  int id = threadIdx.x + blockIdx.x * blockDim.x;
  float hyperA, hyperB;

  if(id < K){
    hyperA = a + y[id];
    hyperB = b + n[id];
    theta[id] = rgamma(state, id, hyperA, hyperB);
    log_theta[id] = log(theta[id]);
  }
}
/*
 * Initialize GPU random number generators
 */
__global__ void setup_kernel(curandState *state, unsigned int seed, int K){
  int id = threadIdx.x + blockIdx.x * blockDim.x;
  if(id < K)
    curand_init(seed, id, 0, &state[id]);
}