Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
parallel_mcmc/parallel/para_gibbs.cu
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
452 lines (338 sloc)
11.9 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
Created by Zebulun Arendsee. | |
March 26, 2013 | |
Modified by Will Landau. | |
June 30, 2013 | |
will-landau.com | |
landau@iastate.edu | |
This program implements a MCMC algorithm for the following hierarchical | |
model: | |
y_k ~ Poisson(n_k * theta_k) k = 1, ..., K | |
theta_k ~ Gamma(a, b) | |
a ~ Unif(0, a0) | |
b ~ Unif(0, b0) | |
We let a0 and b0 be arbitrarily large. | |
Arguments: | |
1) input filename | |
With two space delimited columns holding integer values for | |
y and float values for n. | |
2) number of trials (1000 by default) | |
Output: A comma delimited file containing a column for a, b, and each | |
theta. All output is written to stdout. | |
Example dataset: | |
$ head -3 data.txt | |
4 0.91643 | |
23 3.23709 | |
7 0.40103 | |
Example of compilation and execution: | |
$ nvcc gibbs_metropolis.cu -o gibbs | |
$ ./gibbs mydata.txt 2500 > output.csv | |
$ | |
This code borrows from the nVidia developer zone documentation, | |
specifically http://docs.nvidia.com/cuda/curand/index.html#topic_1_2_1 | |
*/ | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <cuda.h> | |
#include <math.h> | |
#include <curand_kernel.h> | |
#include <thrust/reduce.h> | |
#include <thrust/device_ptr.h> | |
#include <thrust/device_vector.h> | |
#define PI 3.14159265359f | |
#define THREADS_PER_BLOCK 64 | |
#define CUDA_CALL(x) {if((x) != cudaSuccess){ \ | |
printf("CUDA error at %s:%d\n",__FILE__,__LINE__); \ | |
printf(" %s\n", cudaGetErrorString(cudaGetLastError())); \ | |
exit(EXIT_FAILURE);}} | |
#define CURAND_CALL(x) {if((x) != CURAND_STATUS_SUCCESS) { \ | |
printf("Error at %s:%d\n",__FILE__,__LINE__); \ | |
printf(" %s\n", cudaGetErrorString(cudaGetLastError())); \ | |
exit(EXIT_FAILURE);}} | |
__host__ void load_data(int argc, char **argv, int *K, int **y, float **n); | |
__device__ float sample_a(curandState *state,int id, float a, float b, int K, float sum_logs); | |
__device__ float sample_b(curandState *state,int id, float a, int K, float flat_sum); | |
__host__ float rnorm(); | |
__device__ float rnorm(curandState *state,int id ); | |
__host__ float rgamma(float a, float b); | |
__device__ float rgamma(curandState *state, int id, float a, float b); | |
__global__ void sample_theta(curandState *state, float *theta, float *log_theta, | |
int *y, float *n, float a, float b, int K); | |
__global__ void setup_kernel(curandState *state, unsigned int seed, int); | |
__global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandState *state, | |
float *theta, float *log_theta, | |
float a, float b, int trials); | |
__device__ void sample_theta_seq(float *theta, float *log_theta, int *y, float *n, | |
float a, float b, int K, curandState *state); | |
__global__ void mergePosterior(); | |
int main(int argc, char **argv){ | |
curandState *devStates; | |
float a, b, *n, *dev_n, *dev_theta, *dev_log_theta; | |
int K, *y, *dev_y, nBlocks, trials = 1000; | |
if(argc > 2) | |
trials = atoi(argv[2]); | |
load_data(argc, argv, &K, &y, &n); | |
/* starting values of hyperparameters */ | |
a = 20; | |
b = 1; | |
/*------ Allocate memory -----------------------------------------*/ | |
CUDA_CALL(cudaMalloc((void **)&dev_y, K * sizeof(int))); | |
CUDA_CALL(cudaMemcpy(dev_y, y, K * sizeof(int), | |
cudaMemcpyHostToDevice)); | |
CUDA_CALL(cudaMalloc((void **)&dev_n, K * sizeof(float))); | |
CUDA_CALL(cudaMemcpy(dev_n, n, K * sizeof(float), | |
cudaMemcpyHostToDevice)); | |
/* Allocate space for theta and log_theta on device and host */ | |
CUDA_CALL(cudaMalloc((void **)&dev_theta, K * sizeof(float))); | |
CUDA_CALL(cudaMalloc((void **)&dev_log_theta, K * sizeof(float))); | |
/* Allocate space for random states on device */ | |
CUDA_CALL(cudaMalloc((void **)&devStates, K * sizeof(curandState))); | |
/*------ Setup random number generators (one per thread) ---------*/ | |
//nBlocks = (K + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; | |
nBlocks = 1; | |
setup_kernel<<<nBlocks, THREADS_PER_BLOCK>>>(devStates, 0, K); | |
rnorm(); | |
printf("alpha, beta\n"); | |
//seqMetroProcess<<<nBlocks,1>>>(K,nBlocks,dev_y,dev_n,devStates,dev_theta,dev_log_theta,a,b,trials); | |
seqMetroProcess<<<nBlocks,128>>>(K,nBlocks,dev_y,dev_n,devStates,dev_theta,dev_log_theta,a,b,trials); | |
//mergePosterior<<<nBlocks,1>>>(); | |
/*------ Free Memory -------------------------------------------*/ | |
free(y); | |
free(n); | |
CUDA_CALL(cudaFree(devStates)); | |
CUDA_CALL(cudaFree(dev_theta)); | |
CUDA_CALL(cudaFree(dev_log_theta)); | |
CUDA_CALL(cudaFree(dev_y)); | |
CUDA_CALL(cudaFree(dev_n)); | |
return EXIT_SUCCESS; | |
} | |
__global__ void mergePosterior(){ | |
printf("\n all blocks finished\n"); | |
} | |
/* | |
* Sample each theta from the appropriate gamma distribution | |
*/ | |
__device__ void sample_theta_seq(float *theta, float *log_theta, int *y, float *n, | |
float a, float b, int K, curandState *state){ | |
float hyperA, hyperB; | |
for ( int i = 0; i < K; i++ ){ | |
hyperA = a + y[i]; | |
hyperB = b + n[i]; | |
theta[i] = rgamma(state,i,hyperA, hyperB); | |
log_theta[i] = log(theta[i]); | |
} | |
} | |
__global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandState *state, | |
float *theta, float *log_theta, | |
float a, float b, int trials){ | |
/*------ MCMC ----------------------------------------------------*/ | |
int i; | |
//printf("K: %d \n ",K); | |
int id = threadIdx.x + blockIdx.x * blockDim.x; | |
int nTasks = nBlocks*blockDim.x; | |
//int start = blockIdx.x * K/nBlocks; | |
int start = id * K/nTasks; | |
int lengthPerBlock = K/nTasks; | |
//partition the data | |
int *yy; | |
yy = y+start; | |
float *nn; | |
nn = n+start; | |
float *sTheta; | |
sTheta = theta+start; | |
float *sLogTheta; | |
sLogTheta = log_theta+start; | |
/* | |
printf("length per block:%d\n",lengthPerBlock); | |
printf("start is:%d\n",start); | |
printf("partial data under block id: %d \n ",blockIdx.x); | |
for(int j = 0; j < lengthPerBlock ; j++) { | |
printf(" %d \n ", *yy); | |
yy++; | |
} | |
*/ | |
/* Steps of MCMC */ | |
for(i = 0; i < trials; i++){ | |
//sample_theta<<<nBlocks, THREADS_PER_BLOCK>>>(devStates, dev_theta, dev_log_theta, dev_y, dev_n, a, b, K); | |
sample_theta_seq(sTheta, sLogTheta, yy, nn, a, b, lengthPerBlock, state); | |
float flat_sum=0; | |
for(int ii=0;ii<lengthPerBlock;ii++){ | |
//*ptr refers to the value at address | |
flat_sum = flat_sum + *sTheta; | |
sTheta++; | |
} | |
float sum_logs=0; | |
for(int ii=0;ii<lengthPerBlock;ii++){ | |
//*ptr refers to the value at address | |
sum_logs = sum_logs + *sLogTheta; | |
sLogTheta++; | |
} | |
//printf("sum of theta is %f\n",flat_sum); | |
//printf("sum of logtheta is %f\n",sum_logs); | |
/* Sample hyperparameters. */ | |
a = sample_a(state,threadIdx.x, a, b, lengthPerBlock, sum_logs); | |
b = sample_b(state,threadIdx.x, a, lengthPerBlock, flat_sum); | |
/* print hyperparameters. */ | |
printf("%f, %f\n", a, b); | |
} | |
} | |
/* | |
* Read in data. | |
*/ | |
__host__ void load_data(int argc, char **argv, int *K, int **y, float **n){ | |
int k; | |
char line[128]; | |
FILE *fp; | |
if(argc > 1){ | |
fp = fopen(argv[1], "r"); | |
} else { | |
printf("Please provide input filename\n"); | |
exit(EXIT_FAILURE); | |
} | |
if(fp == NULL){ | |
printf("Cannot read file \n"); | |
exit(EXIT_FAILURE); | |
} | |
*K = 0; | |
while( fgets (line, sizeof line, fp) != NULL ) | |
(*K)++; | |
rewind(fp); | |
*y = (int*) malloc((*K) * sizeof(int)); | |
*n = (float*) malloc((*K) * sizeof(float)); | |
for(k = 0; k < *K; k++) | |
fscanf(fp, "%d %f", *y + k, *n + k); | |
fclose(fp); | |
} | |
/* | |
* Metropolis algorithm for producing random a values. | |
* The proposal distribution in normal with a variance that | |
* is adjusted at each step. | |
*/ | |
__device__ float sample_a(curandState *state,int id, float a, float b, int K, float sum_logs){ | |
float sigma = 2.0; | |
float norm = curand_normal(&state[id]); | |
float U, log_acceptance_ratio, proposal = norm * sigma + a; | |
//printf("rnorm result:%f \n",norm*sigma); | |
if(proposal <= 0) | |
return a; | |
log_acceptance_ratio = (proposal - a) * sum_logs + | |
K * (proposal - a) * log(b) - | |
K * (lgamma(proposal) - lgamma(a)); | |
U = curand_normal(&state[id]); | |
//printf("log_acceptance result:%f \n",log_acceptance_ratio); | |
//printf("log U result:%f \n",log(U)); | |
if(log(U) < log_acceptance_ratio){ | |
sigma *= 1.1; | |
return proposal; | |
} else { | |
sigma /= 1.1; | |
return a; | |
} | |
} | |
/* | |
* Sample b from a gamma distribution. | |
*/ | |
__device__ float sample_b(curandState *state,int id, float a, int K, float flat_sum){ | |
float hyperA = K * a + 1; | |
float hyperB = flat_sum; | |
return rgamma(state,id,hyperA, hyperB); | |
} | |
/* | |
* Box-Muller Transformation: Generate one standard normal variable. | |
* | |
* This algorithm can be more efficiently used by producing two | |
* random normal variables. However, for the CPU, much faster | |
* algorithms are possible (e.g. the Ziggurat Algorithm); | |
* | |
* This is actually the algorithm chosen by NVIDIA to calculate | |
* normal random variables on the GPU. | |
*/ | |
__host__ float rnorm(){ | |
float U1 = rand() / float(RAND_MAX); | |
float U2 = rand() / float(RAND_MAX); | |
/* printf("host U1 result:%f \n",U1); printf("host U2 result:%f \n",U2); | |
printf("host RANDMAX result:%f \n",RAND_MAX); | |
*/ | |
float V1 = sqrt(-2 * log(U1)) * cos(2 * PI * U2); | |
/* float V2 = sqrt(-2 * log(U2)) * cos(2 * PI * U1); */ | |
return V1; | |
} | |
__device__ float rnorm(curandState *state,int id){ | |
float U1 = curand_uniform(&state[id]); | |
/* printf("U1 result:%f \n",U1); | |
printf("RANDMAX result:%f \n",RAND_MAX); | |
*/ | |
float U2 = curand_uniform(&state[id]); | |
//printf("U2 result:%f \n",U2); | |
float V1 = sqrt(-2 * log(U1)) * cos(2 * PI * U2); | |
/* float V2 = sqrt(-2 * log(U2)) * cos(2 * PI * U1); */ | |
return V1; | |
} | |
/* | |
* See device rgamma function. This is probably not the | |
* fastest way to generate random gamma variables on a CPU. | |
*/ | |
__host__ float rgamma(float a, float b){ | |
float d = a - 1.0 / 3; | |
float Y, U, v; | |
while(1){ | |
Y = rnorm(); | |
v = pow((1 + Y / sqrt(9 * d)), 3); | |
// Necessary to avoid taking the log of a negative number later. | |
if(v <= 0) | |
continue; | |
U = rand() / float(RAND_MAX); | |
// Accept the sample under the following condition. | |
// Otherwise repeat loop. | |
if(log(U) < 0.5 * pow(Y,2) + d * (1 - v + log(v))) | |
return d * v / b; | |
} | |
} | |
/* | |
* Generate a single Gamma distributed random variable by the Marsoglia | |
* algorithm (George Marsaglia, Wai Wan Tsang; 2001). | |
* | |
* Zeb chose this algorithm because it has a very high acceptance rate (>96%), | |
* so this while loop will usually only need to run a few times. Many other | |
* algorithms, while perhaps faster on a CPU, have acceptance rates on the | |
* order of 50% (very bad in a massively parallel context). | |
*/ | |
__device__ float rgamma(curandState *state, int id, float a, float b){ | |
float d = a - 1.0 / 3; | |
float Y, U, v; | |
while(1){ | |
Y = curand_normal(&state[id]); | |
v = pow((1 + Y / sqrt(9 * d)), 3); | |
/* Necessary to avoid taking the log of a negative number later. */ | |
if(v <= 0) | |
continue; | |
U = curand_uniform(&state[id]); | |
/* Accept the sample under the following condition. | |
Otherwise repeat loop. */ | |
if(log(U) < 0.5 * pow(Y,2) + d * (1 - v + log(v))) | |
return d * v / b; | |
} | |
} | |
/* | |
* Sample each theta from the appropriate gamma distribution | |
*/ | |
__global__ void sample_theta(curandState *state, | |
float *theta, float *log_theta, int *y, float *n, | |
float a, float b, int K){ | |
int id = threadIdx.x + blockIdx.x * blockDim.x; | |
float hyperA, hyperB; | |
if(id < K){ | |
hyperA = a + y[id]; | |
hyperB = b + n[id]; | |
theta[id] = rgamma(state, id, hyperA, hyperB); | |
log_theta[id] = log(theta[id]); | |
} | |
} | |
/* | |
* Initialize GPU random number generators | |
*/ | |
__global__ void setup_kernel(curandState *state, unsigned int seed, int K){ | |
int id = threadIdx.x + blockIdx.x * blockDim.x; | |
if(id < K) | |
curand_init(seed, id, 0, &state[id]); | |
} |