/*
Created by Zebulun Arendsee.
March 26, 2013

Modified by Will Landau.
June 30, 2013
will-landau.com
landau@iastate.edu

This program implements a MCMC algorithm for the following hierarchical
model:

y_k     ~ Poisson(n_k * theta_k)     k = 1, ..., K
theta_k ~ Gamma(a, b)
a       ~ Unif(0, a0)
b       ~ Unif(0, b0)

We let a0 and b0 be arbitrarily large.

Arguments:
    1) input filename
        With two space delimited columns holding integer values for
        y and float values for n.
    2) number of trials (1000 by default)

Output: A comma delimited file containing a column for a, b, and each
theta. All output is written to stdout.

Example dataset:

$ head -3 data.txt
4 0.91643
23 3.23709
7 0.40103

Example of compilation and execution:

$ nvcc gibbs_metropolis.cu -o gibbs
$ ./gibbs mydata.txt 2500 > output.csv
$

This code borrows from the nVidia developer zone documentation,
specifically http://docs.nvidia.com/cuda/curand/index.html#topic_1_2_1
*/

/* NOTE(review): the original header names were lost when this file was
   mangled (angle brackets stripped).  Restored from the identifiers used
   below (printf/malloc, CUDA runtime, cuRAND device API, thrust in the
   commented-out reduction) -- confirm against the original source. */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda.h>
#include <curand.h>
#include <curand_kernel.h>
#include <thrust/reduce.h>
#include <thrust/device_ptr.h>

#define PI 3.14159265359f
#define THREADS_PER_BLOCK 64

/* Abort with file/line on any CUDA runtime error. */
#define CUDA_CALL(x) {if((x) != cudaSuccess){ \
    printf("CUDA error at %s:%d\n",__FILE__,__LINE__); \
    printf("  %s\n", cudaGetErrorString(cudaGetLastError())); \
    exit(EXIT_FAILURE);}}

/* Abort with file/line on any cuRAND error.
   NOTE(review): cuRAND failures are reported via the *CUDA* last-error
   string, which may be stale/unrelated -- confirm intended. */
#define CURAND_CALL(x) {if((x) != CURAND_STATUS_SUCCESS) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__); \
    printf("  %s\n", cudaGetErrorString(cudaGetLastError())); \
    exit(EXIT_FAILURE);}}

__host__ void load_data(int argc, char **argv, int *K, int **y, float **n);

__host__ float sample_a(float a, float b, int K, float sum_logs);
__host__ float sample_b(float a, int K, float flat_sum);

__host__ float rnorm();
__host__ float rgamma(float a, float b);

__device__ float rgamma(curandState *state, int id, float a, float b);

__global__ void sample_theta(curandState *state, float *theta, float *log_theta,
                             int *y, float *n, float a, float b, int K);
__global__ void setup_kernel(curandState *state, unsigned int seed, int K);

__global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n,
                                curandState *state, float *theta,
                                float *log_theta, float a, float b, int trials);
__device__ void sample_theta_seq(float *theta, float *log_theta, int *y,
                                 float *n, float a, float b, int K,
                                 curandState *state);


/*
 * Entry point: load the (y, n) data, copy it to the device, set up one
 * cuRAND state per observation, and run the per-block sequential
 * sampler for `trials` iterations.  All sampler output goes to stdout.
 */
int main(int argc, char **argv){

    curandState *devStates;
    float a, b, *n, *dev_n, *dev_theta, *dev_log_theta;
    int K, *y, *dev_y, nBlocks, trials = 1000;

    if(argc > 2)
        trials = atoi(argv[2]);

    load_data(argc, argv, &K, &y, &n);

    /* starting values of hyperparameters */
    a = 20;
    b = 1;

    /*------ Allocate memory -----------------------------------------*/

    CUDA_CALL(cudaMalloc((void **)&dev_y, K * sizeof(int)));
    CUDA_CALL(cudaMemcpy(dev_y, y, K * sizeof(int),
              cudaMemcpyHostToDevice));

    CUDA_CALL(cudaMalloc((void **)&dev_n, K * sizeof(float)));
    CUDA_CALL(cudaMemcpy(dev_n, n, K * sizeof(float),
              cudaMemcpyHostToDevice));

    /* Allocate space for theta and log_theta on device */
    CUDA_CALL(cudaMalloc((void **)&dev_theta, K * sizeof(float)));
    CUDA_CALL(cudaMalloc((void **)&dev_log_theta, K * sizeof(float)));

    /* Allocate space for random states on device (one per observation) */
    CUDA_CALL(cudaMalloc((void **)&devStates, K * sizeof(curandState)));


    /*------ Setup random number generators (one per thread) ---------*/

    //nBlocks = (K + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
    nBlocks = 500;

    /* NOTE(review): the launch configurations were lost when this file
       was mangled (the <<<...>>> brackets were stripped); reconstructed
       from context -- confirm against the original.  All K states must
       be initialized, so setup_kernel covers ceil(K/THREADS_PER_BLOCK)
       blocks regardless of nBlocks. */
    setup_kernel<<<(K + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK,
                   THREADS_PER_BLOCK>>>(devStates, 0, K);
    CUDA_CALL(cudaGetLastError());

    /* One thread per block: each block walks its data slice serially. */
    seqMetroProcess<<<nBlocks, 1>>>(K, nBlocks, dev_y, dev_n, devStates,
                                    dev_theta, dev_log_theta, a, b, trials);
    CUDA_CALL(cudaGetLastError());
    /* Surface any asynchronous kernel failure before tearing down. */
    CUDA_CALL(cudaDeviceSynchronize());

    /*------ Free Memory -------------------------------------------*/

    free(y);
    free(n);

    CUDA_CALL(cudaFree(devStates));
    CUDA_CALL(cudaFree(dev_theta));
    CUDA_CALL(cudaFree(dev_log_theta));
    CUDA_CALL(cudaFree(dev_y));
    CUDA_CALL(cudaFree(dev_n));

    return EXIT_SUCCESS;
}


/*
 * Sample each of the K thetas in sequence from its gamma full
 * conditional: theta_i | y_i, n_i, a, b ~ Gamma(a + y_i, b + n_i).
 * `state` must hold at least K curand states; state i drives theta i.
 */
__device__ void sample_theta_seq(float *theta, float *log_theta, int *y,
                                 float *n, float a, float b, int K,
                                 curandState *state){

    float hyperA, hyperB;

    for ( int i = 0; i < K; i++ ){
        hyperA = a + y[i];
        hyperB = b + n[i];
        theta[i] = rgamma(state, i, hyperA, hyperB);
        /* single-precision log: stay in float on the device */
        log_theta[i] = logf(theta[i]);
    }
}

/*
 * Run one sequential MCMC chain per block over a contiguous slice of
 * the data (launched with one thread per block).
 *
 * BUG FIXES vs. original:
 *   - sample_theta_seq was called with the full length K instead of
 *     lengthPerBlock, so every block read and wrote far past the end
 *     of its slice (and past the arrays, for blockIdx.x > 0);
 *   - all blocks indexed the shared RNG states 0..K-1 concurrently,
 *     racing on them; each block now uses its own slice of states.
 *
 * NOTE(review): when nBlocks does not divide K, the last K % nBlocks
 * observations are never sampled -- confirm this is acceptable.
 */
__global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n,
                                curandState *state, float *theta,
                                float *log_theta, float a, float b,
                                int trials){
    /*------ MCMC ----------------------------------------------------*/
    int i;

    int start = blockIdx.x * K/nBlocks;
    int lengthPerBlock = K/nBlocks;
    //partition the data
    int *yy = &y[start];
    float *nn = &n[start];
    float *sTheta = &theta[start];
    float *sLogTheta = &log_theta[start];

    printf("block id:%d\n",blockIdx.x);
    for(int j = 0; j < lengthPerBlock ; j++) {
        printf("%d ", yy[j]);
    }
    printf("alpha, beta\n");

    /* Steps of MCMC */

    for(i = 0; i < trials; i++){
        //sample_theta<<<nBlocks, THREADS_PER_BLOCK>>>(devStates, dev_theta, dev_log_theta, dev_y, dev_n, a, b, K);
        sample_theta_seq(sTheta, sLogTheta, yy, nn, a, b, lengthPerBlock,
                         &state[start]);

        /* Make iterators for thetas and log thetas. */
        // thrust::device_ptr<float> theta(dev_theta);
        // thrust::device_ptr<float> log_theta(dev_log_theta);

        /* Compute pairwise sums of thetas and log_thetas. */
        // flat_sum = thrust::reduce(theta, theta + K);
        // sum_logs = thrust::reduce(log_theta, log_theta + K);

        /* Sample hyperparameters. */
        // a = sample_a(a, b, K, sum_logs);
        // b = sample_b(a, K, flat_sum);

        /* print hyperparameters. */
        printf("%f, %f\n", a, b);
    }
}


/*
 * Read in data.
+ */ + +__host__ void load_data(int argc, char **argv, int *K, int **y, float **n){ + int k; + char line[128]; + FILE *fp; + + if(argc > 1){ + fp = fopen(argv[1], "r"); + } else { + printf("Please provide input filename\n"); + exit(EXIT_FAILURE); + } + + if(fp == NULL){ + printf("Cannot read file \n"); + exit(EXIT_FAILURE); + } + + *K = 0; + while( fgets (line, sizeof line, fp) != NULL ) + (*K)++; + + rewind(fp); + + *y = (int*) malloc((*K) * sizeof(int)); + *n = (float*) malloc((*K) * sizeof(float)); + + for(k = 0; k < *K; k++) + fscanf(fp, "%d %f", *y + k, *n + k); + + fclose(fp); +} + + +/* + * Metropolis algorithm for producing random a values. + * The proposal distribution in normal with a variance that + * is adjusted at each step. + */ + +__host__ float sample_a(float a, float b, int K, float sum_logs){ + + static float sigma = 2; + float U, log_acceptance_ratio, proposal = rnorm() * sigma + a; + + if(proposal <= 0) + return a; + + log_acceptance_ratio = (proposal - a) * sum_logs + + K * (proposal - a) * log(b) - + K * (lgamma(proposal) - lgamma(a)); + + U = rand() / float(RAND_MAX); + + if(log(U) < log_acceptance_ratio){ + sigma *= 1.1; + return proposal; + } else { + sigma /= 1.1; + return a; + } +} + + +/* + * Sample b from a gamma distribution. + */ + +__host__ float sample_b(float a, int K, float flat_sum){ + + float hyperA = K * a + 1; + float hyperB = flat_sum; + return rgamma(hyperA, hyperB); +} + + +/* + * Box-Muller Transformation: Generate one standard normal variable. + * + * This algorithm can be more efficiently used by producing two + * random normal variables. However, for the CPU, much faster + * algorithms are possible (e.g. the Ziggurat Algorithm); + * + * This is actually the algorithm chosen by NVIDIA to calculate + * normal random variables on the GPU. 
+ */ + +__host__ float rnorm(){ + + float U1 = rand() / float(RAND_MAX); + float U2 = rand() / float(RAND_MAX); + float V1 = sqrt(-2 * log(U1)) * cos(2 * PI * U2); + /* float V2 = sqrt(-2 * log(U2)) * cos(2 * PI * U1); */ + return V1; +} + + +/* + * See device rgamma function. This is probably not the + * fastest way to generate random gamma variables on a CPU. + */ + +__host__ float rgamma(float a, float b){ + + float d = a - 1.0 / 3; + float Y, U, v; + + while(1){ + Y = rnorm(); + v = pow((1 + Y / sqrt(9 * d)), 3); + + // Necessary to avoid taking the log of a negative number later. + if(v <= 0) + continue; + + U = rand() / float(RAND_MAX); + + // Accept the sample under the following condition. + // Otherwise repeat loop. + if(log(U) < 0.5 * pow(Y,2) + d * (1 - v + log(v))) + return d * v / b; + } +} + +/* + * Generate a single Gamma distributed random variable by the Marsoglia + * algorithm (George Marsaglia, Wai Wan Tsang; 2001). + * + * Zeb chose this algorithm because it has a very high acceptance rate (>96%), + * so this while loop will usually only need to run a few times. Many other + * algorithms, while perhaps faster on a CPU, have acceptance rates on the + * order of 50% (very bad in a massively parallel context). + */ + +__device__ float rgamma(curandState *state, int id, float a, float b){ + + float d = a - 1.0 / 3; + float Y, U, v; + + while(1){ + Y = curand_normal(&state[id]); + v = pow((1 + Y / sqrt(9 * d)), 3); + + /* Necessary to avoid taking the log of a negative number later. */ + if(v <= 0) + continue; + + U = curand_uniform(&state[id]); + + /* Accept the sample under the following condition. + Otherwise repeat loop. 
*/ + if(log(U) < 0.5 * pow(Y,2) + d * (1 - v + log(v))) + return d * v / b; + } +} + + +/* + * Sample each theta from the appropriate gamma distribution + */ + +__global__ void sample_theta(curandState *state, + float *theta, float *log_theta, int *y, float *n, + float a, float b, int K){ + + int id = threadIdx.x + blockIdx.x * blockDim.x; + float hyperA, hyperB; + + if(id < K){ + hyperA = a + y[id]; + hyperB = b + n[id]; + theta[id] = rgamma(state, id, hyperA, hyperB); + log_theta[id] = log(theta[id]); + } +} + + +/* + * Initialize GPU random number generators + */ + +__global__ void setup_kernel(curandState *state, unsigned int seed, int K){ + + int id = threadIdx.x + blockIdx.x * blockDim.x; + + if(id < K) + curand_init(seed, id, 0, &state[id]); +}