/*
Created by Zebulun Arendsee.
March 26, 2013
Modified by Will Landau.
June 30, 2013
will-landau.com
landau@iastate.edu
This program implements an MCMC algorithm for the following hierarchical
model:
y_k ~ Poisson(n_k * theta_k) k = 1, ..., K
theta_k ~ Gamma(a, b)
a ~ Unif(0, a0)
b ~ Unif(0, b0)
We let a0 and b0 be arbitrarily large.
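The full conditionals used by the sampler below (they match the code in
sample_theta, sample_b, and sample_a) are:
theta_k | y, a, b ~ Gamma(a + y_k, b + n_k)
b | theta, a ~ Gamma(K * a + 1, sum_k theta_k)
a | theta, b has no closed form and is updated with a Metropolis step.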
Arguments:
1) input filename
With two space delimited columns holding integer values for
y and float values for n.
2) number of MCMC trials per chain (1000 by default)
3) number of blocks (64 by default)
4) number of threads per block (1 by default)
Output: A comma delimited file containing a column for a and a column
for b, one row per merged posterior draw. All output is written to stdout.
Example dataset:
$ head -3 data.txt
4 0.91643
23 3.23709
7 0.40103
Example of compilation and execution:
$ nvcc gibbs_metropolis.cu -o gibbs
$ ./gibbs mydata.txt 2500 > output.csv
$
This code borrows from the nVidia developer zone documentation,
specifically http://docs.nvidia.com/cuda/curand/index.html#topic_1_2_1
*/
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <math.h>
#include <curand_kernel.h>
#include <thrust/reduce.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#define PI 3.14159265359f
#define THREADS_PER_BLOCK 64
#define CUDA_CALL(x) {if((x) != cudaSuccess){ \
printf("CUDA error at %s:%d\n",__FILE__,__LINE__); \
printf(" %s\n", cudaGetErrorString(cudaGetLastError())); \
exit(EXIT_FAILURE);}}
#define CURAND_CALL(x) {if((x) != CURAND_STATUS_SUCCESS) { \
printf("Error at %s:%d\n",__FILE__,__LINE__); \
printf(" %s\n", cudaGetErrorString(cudaGetLastError())); \
exit(EXIT_FAILURE);}}
__host__ void load_data(int argc, char **argv, int *K, int **y, float **n);
__device__ float sample_a(curandState *state,int id, float a, float b, int K, float sum_logs);
__device__ float sample_b(curandState *state,int id, float a, int K, float flat_sum);
__host__ float rnorm();
__device__ float rnorm(curandState *state,int id );
__host__ float rgamma(float a, float b);
__device__ float rgamma(curandState *state, int id, float a, float b);
__global__ void sample_theta(curandState *state, float *theta, float *log_theta,
int *y, float *n, float a, float b, int K);
__global__ void setup_kernel(curandState *state, unsigned int seed, int);
__global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandState *state,
float *theta, float *log_theta,
float a, float b,float *dev_a_out,float *dev_b_out,
int trials);
__device__ void sample_theta_seq(float *theta, float *log_theta, int *y, float *n,
float a, float b, int K, curandState *state);
__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int *tDot,curandState *state, int nBlocks, int nThreads, float *h, float *results);
__global__ void sampleTdot(int trials, int *tDot,curandState *state);
__device__ float posteriorMean(int *tDot, float *dev_x_out, int M, int trials);
__device__ float computeW(int *tDot, float *dev_x_out, float *dev_y_out, int M, int trials, float h);
__device__ float normPDF(float x, float mean, float variance);
__device__ float general_normal(curandState *state, float mean, float variance);
int main(int argc, char **argv){
curandState *devStates;
float a, b, *n, *dev_n, *dev_theta, *dev_log_theta, *dev_a_out, *dev_b_out, *results;
int K, *y, *dev_y, nBlocks = 64, nThreads = 1, trials = 1000;
if(argc > 2)
trials = atoi(argv[2]);
if(argc > 3)
nBlocks = atoi(argv[3]);
if(argc > 4)
nThreads = atoi(argv[4]);
load_data(argc, argv, &K, &y, &n);
/* starting values of hyperparameters */
a = 20;
b = 1;
/*------ Allocate memory -----------------------------------------*/
CUDA_CALL(cudaMalloc((void **)&dev_y, K * sizeof(int)));
CUDA_CALL(cudaMemcpy(dev_y, y, K * sizeof(int),
cudaMemcpyHostToDevice));
CUDA_CALL(cudaMalloc((void **)&dev_n, K * sizeof(float)));
CUDA_CALL(cudaMemcpy(dev_n, n, K * sizeof(float),
cudaMemcpyHostToDevice));
/* Allocate space for theta and log_theta on device and host */
CUDA_CALL(cudaMalloc((void **)&dev_theta, K * sizeof(float)));
CUDA_CALL(cudaMalloc((void **)&dev_log_theta, K * sizeof(float)));
/* Allocate space for random states on device */
CUDA_CALL(cudaMalloc((void **)&devStates, K * sizeof(curandState)));
/* Allocate device memory for each chain's saved a and b draws */
CUDA_CALL(cudaMalloc((void **)&dev_a_out, nBlocks*nThreads*trials*sizeof(float)));
CUDA_CALL(cudaMalloc((void **)&dev_b_out, nBlocks*nThreads*trials*sizeof(float)));
CUDA_CALL(cudaMalloc((void **)&results, nThreads*sizeof(float)));
/*------ Setup random number generators (one per thread) ---------*/
//nBlocks = (K + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
//nBlocks = 1;
setup_kernel<<<nBlocks, THREADS_PER_BLOCK>>>(devStates, 0, K);
//printf("%f",&dev_a_out);
printf("alpha, beta\n");
//seqMetroProcess<<<nBlocks,1>>>(K,nBlocks,dev_y,dev_n,devStates,dev_theta,dev_log_theta,a,b,trials);
seqMetroProcess<<<nBlocks,nThreads>>>(K,nBlocks,dev_y,dev_n,devStates,dev_theta,dev_log_theta,a,b,dev_a_out,dev_b_out,trials);
int *tDot;
CUDA_CALL(cudaMalloc((void **)&tDot,nThreads*nBlocks*sizeof(int)));
float *h;
CUDA_CALL(cudaMalloc((void **)&h,trials*sizeof(float)));
sampleTdot<<<nBlocks,nThreads>>>(trials, tDot,devStates);
//int numThreads = trials / nBlocks; // TODO: This should be a whole number
mergePosterior<<<1,1>>>(trials,dev_a_out,dev_b_out,tDot,devStates, nBlocks, nThreads, h, results);
/* Wait for the merge kernel to finish (also flushes device-side printf). */
CUDA_CALL(cudaDeviceSynchronize());
/*------ Free Memory -------------------------------------------*/
free(y);
free(n);
CUDA_CALL(cudaFree(devStates));
CUDA_CALL(cudaFree(dev_theta));
CUDA_CALL(cudaFree(dev_log_theta));
CUDA_CALL(cudaFree(dev_y));
CUDA_CALL(cudaFree(dev_n));
/* CUDA_CALL(cudaFree(results));
CUDA_CALL(cudaFree(h));
CUDA_CALL(cudaFree(tDot));
CUDA_CALL(cudaFree(dev_a_out));
CUDA_CALL(cudaFree(dev_b_out));
*/
return EXIT_SUCCESS;
}
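/*
 * Draw an initial index t_m in {1, ..., trials - 1} for each chain
 * (one thread per chain). tDot selects one stored (a, b) draw per
 * chain for the merge step in mergePosterior.
 */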
__global__ void sampleTdot(int trials, int *tDot,curandState *state){
int id = threadIdx.x + blockIdx.x * blockDim.x;
int u = (curand(&state[id]) % (trials-1)) + 1;
//printf("thread %d sample: %d",id,u);
tDot[id] = u;
}
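/*
 * Combine the per-chain draws into approximate draws from the full-data
 * posterior. A single thread runs a Metropolis-within-Gibbs sweep over the
 * index vector tDot = (t_1, ..., t_M): each proposal changes one chain's
 * index and is accepted with probability w(cDot) / w(tDot), where w is a
 * product of Gaussian kernels centered at the mean of the selected draws
 * (see computeW). This appears to follow the nonparametric density-product
 * approach of Neiswanger, Wang, and Xing, "Asymptotically Exact,
 * Embarrassingly Parallel MCMC", with bandwidth h_i = i^(-1/(d+4)) for
 * d = 2 parameters (a, b).
 */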
__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int *tDot,curandState *state, int nBlocks, int nThreads, float *h, float *results){
// printf("\n all blocks finished\n");
// for(int j = 0; j < nBlocks*2 ; j++) {
// // printf(" %f ", *dev_a_out);
// printf(" %d \n",*tDot);
// tDot++;
// //dev_b_out++;
// }
int M = nBlocks * nThreads;
int id = threadIdx.x + blockIdx.x * blockDim.x;
for (int i=0; i < trials; i++) {
h[i] = powf((float)(i + 1), -1.0f / (4.0f + 2.0f)); // bandwidth h = i^(-1/(d+4)), d = 2 (i+1 keeps the first iteration finite)
for (int m=0; m < M; m++) {
int *cDot = (int*) malloc(M*sizeof(int));
printf("%d\n",m);
memcpy(cDot, tDot, sizeof(int) * M);
cDot[m] = (curand(&state[id]) % (trials-1)) + 1;
float u = curand_uniform(&state[id]);
float wcDot = computeW(cDot, dev_a_out, dev_b_out, M, trials, h[i]);
float wtDot = computeW(tDot, dev_a_out, dev_b_out, M, trials, h[i]);
if (u < (wcDot/ wtDot) ){
memcpy(tDot, cDot, sizeof(int) * M);
}
free(cDot);
}
// TODO: Draw from Multivariate Normal, and Save into the results
float posterior_mean_a = posteriorMean(tDot, dev_a_out, M, trials);
float posterior_mean_b = posteriorMean(tDot, dev_b_out, M, trials);
float variance = (h[i]*h[i])/M;
printf("%d\n",trials);
printf("%f, %f\n" , general_normoral(state, posterior_mean_a, variance), general_normoral(state, posterior_mean_b, variance));
}
}
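/*
 * Draw one normal variate with the given mean and variance using the
 * calling thread's cuRAND state.
 */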
__device__ float general_normal(curandState *state, float mean, float variance) {
int id = threadIdx.x + blockIdx.x * blockDim.x;
return curand_normal(&state[id]) * sqrtf(variance) + mean; // scale by the standard deviation, not the variance
}
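/*
 * Mean of the M currently selected draws, one per chain:
 * (1/M) * sum_m x_{m, t_m}. Chain m's draws are stored contiguously
 * in dev_x_out starting at offset m * trials.
 */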
__device__ float posteriorMean(int *tDot, float *dev_x_out, int M, int trials) {
float sum = 0;
for (int i=0; i < M; i++) {
int index = tDot[i] + i * trials; // draw tDot[i] of chain i (each chain stores trials consecutive draws)
sum += dev_x_out[index]; // posterior_m_tm
}
float mean = sum/M;
return mean;
}
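/*
 * Unnormalized weight of an index vector tDot:
 * w(tDot) = prod_m N(a_{m,t_m} | a_bar, h^2) * N(b_{m,t_m} | b_bar, h^2),
 * where a_bar and b_bar are the means of the selected draws (posteriorMean).
 * Used in the accept/reject step of mergePosterior.
 */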
__device__ float computeW(int *tDot, float *dev_x_out, float *dev_y_out, int M, int trials, float h) {
float posterior_mean_x = posteriorMean(tDot, dev_x_out, M, trials);
float posterior_mean_y = posteriorMean(tDot, dev_y_out, M, trials);
float product = 1;
float posterior_m_tm_x;
float posterior_m_tm_y;
for (int i=0; i < M; i++) {
int index = tDot[i] + i * trials; // draw tDot[i] of chain i
posterior_m_tm_x = dev_x_out[index]; // posterior_m_tm of x
posterior_m_tm_y = dev_y_out[index]; // posterior_m_tm of y
product *= normPDF(posterior_m_tm_x, posterior_mean_x, h*h) * normPDF(posterior_m_tm_y, posterior_mean_y, h*h);
}
return product;
}
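/*
 * Density of a univariate normal with the given mean and variance,
 * evaluated at x.
 */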
__device__ float normPDF(float x, float mean, float variance) {
float denominator = sqrtf(2 * PI * variance);
float numerator = expf( -(x - mean) * (x - mean) / (2 * variance) );
return numerator/denominator;
}
/*
* Sample each theta from the appropriate gamma distribution
*/
__device__ void sample_theta_seq(float *theta, float *log_theta, int *y, float *n,
float a, float b, int K, curandState *state){
float hyperA, hyperB;
for ( int i = 0; i < K; i++ ){
hyperA = a + y[i];
hyperB = b + n[i];
theta[i] = rgamma(state,i,hyperA, hyperB);
log_theta[i] = log(theta[i]);
}
}
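/*
 * Run one independent MCMC chain per thread on that thread's shard of the
 * data (a contiguous block of K / (nBlocks * nThreads) observations).
 * Each iteration draws the theta's and b from their full conditionals and
 * updates a with a Metropolis step, then appends (a, b) to that chain's
 * contiguous slice of dev_a_out and dev_b_out.
 */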
__global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandState *state,
float *theta, float *log_theta,
float a, float b,float *dev_a_out,float *dev_b_out, int trials){
/*------ MCMC ----------------------------------------------------*/
int i;
//printf("K: %d \n ",K);
int id = threadIdx.x + blockIdx.x * blockDim.x;
int nTasks = nBlocks*blockDim.x;
//int start = blockIdx.x * K/nBlocks;
int start = id * K/nTasks;
int lengthPerBlock = K/nTasks;
//partition the data
int *yy;
yy = y+start;
float *nn;
nn = n+start;
float *sTheta;
sTheta = theta+start;
float *sLogTheta;
sLogTheta = log_theta+start;
/*
printf("length per block:%d\n",lengthPerBlock);
printf("start is:%d\n",start);
printf("partial data under block id: %d \n ",blockIdx.x);
for(int j = 0; j < lengthPerBlock ; j++) {
printf(" %d \n ", *yy);
yy++;
}
*/
/* Steps of MCMC */
int rstart = id * trials; // each chain stores its trials draws contiguously
int rlenPerTask = trials;
float *subA;
subA = dev_a_out+rstart;
float *subB;
subB = dev_b_out+rstart;
for(i = 0; i < rlenPerTask; i++){
//sample_theta<<<nBlocks, THREADS_PER_BLOCK>>>(devStates, dev_theta, dev_log_theta, dev_y, dev_n, a, b, K);
sample_theta_seq(sTheta, sLogTheta, yy, nn, a, b, lengthPerBlock, state);
float flat_sum=0;
for(int ii=0;ii<lengthPerBlock;ii++){
// index into this thread's partition; sTheta must keep pointing at its start
flat_sum = flat_sum + sTheta[ii];
}
float sum_logs=0;
for(int ii=0;ii<lengthPerBlock;ii++){
sum_logs = sum_logs + sLogTheta[ii];
}
//printf("sum of theta is %f\n",flat_sum);
//printf("sum of logtheta is %f\n",sum_logs);
/* Sample hyperparameters. */
a = sample_a(state,id, a, b, lengthPerBlock, sum_logs);
b = sample_b(state,id, a, lengthPerBlock, flat_sum);
/* print hyperparameters. */
//printf("%f, %f\n", a, b);
////////////save new output to the global array
*subA = a;
subA++;
*subB = b;
subB++;
}
}
/*
* Read in data.
*/
__host__ void load_data(int argc, char **argv, int *K, int **y, float **n){
int k;
char line[128];
FILE *fp;
if(argc > 1){
fp = fopen(argv[1], "r");
} else {
printf("Please provide input filename\n");
exit(EXIT_FAILURE);
}
if(fp == NULL){
printf("Cannot read file \n");
exit(EXIT_FAILURE);
}
*K = 0;
while( fgets (line, sizeof line, fp) != NULL )
(*K)++;
rewind(fp);
*y = (int*) malloc((*K) * sizeof(int));
*n = (float*) malloc((*K) * sizeof(float));
for(k = 0; k < *K; k++)
fscanf(fp, "%d %f", *y + k, *n + k);
fclose(fp);
}
/*
* Metropolis algorithm for producing random a values.
* The proposal distribution is normal with a scale (sigma) that is
* nominally adjusted at each step; sigma is a local variable here, so
* the adjustment does not carry over between calls.
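*
* The log acceptance ratio computed below is
*   log r = (a' - a) * sum_k log(theta_k)
*         + K * (a' - a) * log(b)
*         - K * (lgamma(a') - lgamma(a)),
* i.e. log[ p(theta | a', b) / p(theta | a, b) ] under a flat prior on a,
* where a' is the proposal.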
*/
__device__ float sample_a(curandState *state,int id, float a, float b, int K, float sum_logs){
float sigma = 2.0;
float norm = curand_normal(&state[id]);
float U, log_acceptance_ratio, proposal = norm * sigma + a;
//printf("rnorm result:%f \n",norm*sigma);
if(proposal <= 0)
return a;
log_acceptance_ratio = (proposal - a) * sum_logs +
K * (proposal - a) * log(b) -
K * (lgamma(proposal) - lgamma(a));
U = curand_uniform(&state[id]);
//printf("log_acceptance result:%f \n",log_acceptance_ratio);
//printf("log U result:%f \n",log(U));
if(log(U) < log_acceptance_ratio){
sigma *= 1.1;
return proposal;
} else {
sigma /= 1.1;
return a;
}
}
/*
* Sample b from a gamma distribution.
*/
__device__ float sample_b(curandState *state,int id, float a, int K, float flat_sum){
float hyperA = K * a + 1;
float hyperB = flat_sum;
return rgamma(state,id,hyperA, hyperB);
}
/*
* Box-Muller Transformation: Generate one standard normal variable.
*
* This algorithm can be more efficiently used by producing two
* random normal variables. However, for the CPU, much faster
* algorithms are possible (e.g. the Ziggurat Algorithm);
*
* This is actually the algorithm chosen by NVIDIA to calculate
* normal random variables on the GPU.
*/
__host__ float rnorm(){
float U1 = rand() / float(RAND_MAX);
float U2 = rand() / float(RAND_MAX);
/* printf("host U1 result:%f \n",U1); printf("host U2 result:%f \n",U2);
printf("host RANDMAX result:%f \n",RAND_MAX);
*/
float V1 = sqrt(-2 * log(U1)) * cos(2 * PI * U2);
/* float V2 = sqrt(-2 * log(U2)) * cos(2 * PI * U1); */
return V1;
}
__device__ float rnorm(curandState *state,int id){
float U1 = curand_uniform(&state[id]);
/* printf("U1 result:%f \n",U1);
printf("RANDMAX result:%f \n",RAND_MAX);
*/
float U2 = curand_uniform(&state[id]);
//printf("U2 result:%f \n",U2);
float V1 = sqrt(-2 * log(U1)) * cos(2 * PI * U2);
/* float V2 = sqrt(-2 * log(U2)) * cos(2 * PI * U1); */
return V1;
}
/*
* See device rgamma function. This is probably not the
* fastest way to generate random gamma variables on a CPU.
*/
__host__ float rgamma(float a, float b){
float d = a - 1.0 / 3;
float Y, U, v;
while(1){
Y = rnorm();
v = pow((1 + Y / sqrt(9 * d)), 3);
// Necessary to avoid taking the log of a negative number later.
if(v <= 0)
continue;
U = rand() / float(RAND_MAX);
// Accept the sample under the following condition.
// Otherwise repeat loop.
if(log(U) < 0.5 * pow(Y,2) + d * (1 - v + log(v)))
return d * v / b;
}
}
/*
* Generate a single Gamma distributed random variable by the Marsaglia-Tsang
* algorithm (George Marsaglia, Wai Wan Tsang; 2000).
*
* Zeb chose this algorithm because it has a very high acceptance rate (>96%),
* so this while loop will usually only need to run a few times. Many other
* algorithms, while perhaps faster on a CPU, have acceptance rates on the
* order of 50% (very bad in a massively parallel context).
*/
__device__ float rgamma(curandState *state, int id, float a, float b){
float d = a - 1.0 / 3;
float Y, U, v;
while(1){
Y = curand_normal(&state[id]);
v = pow((1 + Y / sqrt(9 * d)), 3);
/* Necessary to avoid taking the log of a negative number later. */
if(v <= 0)
continue;
U = curand_uniform(&state[id]);
/* Accept the sample under the following condition.
Otherwise repeat loop. */
if(log(U) < 0.5 * pow(Y,2) + d * (1 - v + log(v)))
return d * v / b;
}
}
/*
* Sample each theta from the appropriate gamma distribution
*/
__global__ void sample_theta(curandState *state,
float *theta, float *log_theta, int *y, float *n,
float a, float b, int K){
int id = threadIdx.x + blockIdx.x * blockDim.x;
float hyperA, hyperB;
if(id < K){
hyperA = a + y[id];
hyperB = b + n[id];
theta[id] = rgamma(state, id, hyperA, hyperB);
log_theta[id] = log(theta[id]);
}
}
/*
* Initialize GPU random number generators
*/
__global__ void setup_kernel(curandState *state, unsigned int seed, int K){
int id = threadIdx.x + blockIdx.x * blockDim.x;
if(id < K)
curand_init(seed, id, 0, &state[id]);
}