From be06d94a15928d77d660e6bf6667b6fdd2f2dda5 Mon Sep 17 00:00:00 2001 From: Reynaldo Morillo Date: Mon, 24 Apr 2017 22:34:47 -0400 Subject: [PATCH] Created posteriorMean function. --- parallel/para_gibbs.cu | 185 ++++++++++++++++++++++------------------- 1 file changed, 100 insertions(+), 85 deletions(-) diff --git a/parallel/para_gibbs.cu b/parallel/para_gibbs.cu index d8d04d5..4b9a7af 100644 --- a/parallel/para_gibbs.cu +++ b/parallel/para_gibbs.cu @@ -13,7 +13,7 @@ model: y_k ~ Poisson(n_k * theta_k) k = 1, ..., K theta_k ~ Gamma(a, b) a ~ Unif(0, a0) -b ~ Unif(0, b0) +b ~ Unif(0, b0) We let a0 and b0 be arbitrarily large. @@ -39,7 +39,7 @@ $ nvcc gibbs_metropolis.cu -o gibbs $ ./gibbs mydata.txt 2500 > output.csv $ -This code borrows from the nVidia developer zone documentation, +This code borrows from the nVidia developer zone documentation, specifically http://docs.nvidia.com/cuda/curand/index.html#topic_1_2_1 */ @@ -58,7 +58,7 @@ specifically http://docs.nvidia.com/cuda/curand/index.html#topic_1_2_1 #define CUDA_CALL(x) {if((x) != cudaSuccess){ \ printf("CUDA error at %s:%d\n",__FILE__,__LINE__); \ printf(" %s\n", cudaGetErrorString(cudaGetLastError())); \ - exit(EXIT_FAILURE);}} + exit(EXIT_FAILURE);}} #define CURAND_CALL(x) {if((x) != CURAND_STATUS_SUCCESS) { \ printf("Error at %s:%d\n",__FILE__,__LINE__); \ @@ -75,22 +75,24 @@ __host__ float rgamma(float a, float b); __device__ float rgamma(curandState *state, int id, float a, float b); -__global__ void sample_theta(curandState *state, float *theta, float *log_theta, +__global__ void sample_theta(curandState *state, float *theta, float *log_theta, int *y, float *n, float a, float b, int K); __global__ void setup_kernel(curandState *state, unsigned int seed, int); -__global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandState *state, - float *theta, float *log_theta, +__global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandState *state, + float *theta, float *log_theta, float a, float b,float *dev_a_out,float *dev_b_out, int trials); __device__ void sample_theta_seq(float *theta, float *log_theta, int *y, float *n, float a, float b, int K, curandState *state); -__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int *tDot,curandState *state, int nBlocks); +__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int *tDot,curandState *state, int nBlocks, int nThreads); __global__ void sampleTdot(int trials, int *tDot,curandState *state); +__device__ float posteriorMean(*tDot, float *dev_x_out, int M, int nThreads); + int main(int argc, char **argv){ curandState *devStates; @@ -111,17 +113,17 @@ int main(int argc, char **argv){ load_data(argc, argv, &K, &y, &n); /* starting values of hyperparameters */ - a = 20; - b = 1; + a = 20; + b = 1; /*------ Allocate memory -----------------------------------------*/ CUDA_CALL(cudaMalloc((void **)&dev_y, K * sizeof(int))); - CUDA_CALL(cudaMemcpy(dev_y, y, K * sizeof(int), + CUDA_CALL(cudaMemcpy(dev_y, y, K * sizeof(int), cudaMemcpyHostToDevice)); CUDA_CALL(cudaMalloc((void **)&dev_n, K * sizeof(float))); - CUDA_CALL(cudaMemcpy(dev_n, n, K * sizeof(float), + CUDA_CALL(cudaMemcpy(dev_n, n, K * sizeof(float), cudaMemcpyHostToDevice)); /* Allocate space for theta and log_theta on device and host */ @@ -139,12 +141,12 @@ int main(int argc, char **argv){ //nBlocks = (K + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; //nBlocks = 1; setup_kernel<<>>(devStates, 0, K); - + //printf("%f",&dev_a_out); printf("alpha, beta\n"); //seqMetroProcess<<>>(K,nBlocks,dev_y,dev_n,devStates,dev_theta,dev_log_theta,a,b,trials); seqMetroProcess<<>>(K,nBlocks,dev_y,dev_n,devStates,dev_theta,dev_log_theta,a,b,dev_a_out,dev_b_out,trials); - + int *tDot; CUDA_CALL(cudaMalloc((void **)&tDot,trials*sizeof(int))); @@ -153,8 +155,8 @@ int main(int argc, char **argv){ sampleTdot<<>>(trials, tDot,devStates); - - mergePosterior<<<1,1>>>(trials,dev_a_out,dev_b_out,tDot,devStates,nBlocks); + int numThreads = trials / nBlocks; // TODO: This should be a whole number + mergePosterior<<<1,1>>>(trials,dev_a_out,dev_b_out,tDot,devStates, nBlocks, nThreads); /*------ Free Memory -------------------------------------------*/ free(y); @@ -173,30 +175,43 @@ __global__ void sampleTdot(int trials, int *tDot,curandState *state){ int id = threadIdx.x + blockIdx.x * blockDim.x; int u = (curand(&state[id]) % (trials-1)) + 1; //printf("thread %d sample: %d",id,u); - tDot[id] = u; + tDot[id] = u; } -__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int *tDot, curandState *state,int nBlocks){ - -/* printf("\n all blocks finished\n"); - for(int j = 0; j < trials ; j++) { - printf(" %f ", *dev_a_out); - printf(" %f \n",*dev_b_out); - dev_a_out++; - dev_b_out++; - } -*/ +__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int *tDot,curandState *state, int nBlocks, int nThreads){ - printf("\n all blocks finished\n"); - for(int j = 0; j < nBlocks*2 ; j++) { - // printf(" %f ", *dev_a_out); - printf(" %d \n",*tDot); - tDot++; - //dev_b_out++; - } + // printf("\n all blocks finished\n"); + // for(int j = 0; j < nBlocks*2 ; j++) { + // // printf(" %f ", *dev_a_out); + // printf(" %d \n",*tDot); + // tDot++; + // //dev_b_out++; + // } + int M = nBlocks * nThreads; + int id = threadIdx.x + blockIdx.x * blockDim.x; + float h = id^(-1/(4+2)); + + for (int i=0; i < M; i++) { + int *cDot; + CUDA_CALL(cudaMemcpy(cDot, tDot, sizeof(int) * trials, cudaMemcpyDeviceToDevice)); + int cm = (curand(&state[id]) % (trials-1)) + 1; + int u = curand_uniform(&state[id]); + // TODO: Compute wcDot + // TODO: Compute WtDot + if () + } +} +__device__ float posteriorMean(int *tDot, float *dev_x_out, int M, int nThreads) { + float sum; + for (int i=0; i < M; i++) { + int index = tDot[i] + i * nThreads; // trial m of posterior m (note: i = blockId) + sum += dev_x_out[index]; // posterior_m_tm + } + float mean = sum/M; + return mean; } @@ -218,29 +233,29 @@ __device__ void sample_theta_seq(float *theta, float *log_theta, int *y, float * } } -__global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandState *state, - float *theta, float *log_theta, +__global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandState *state, + float *theta, float *log_theta, float a, float b,float *dev_a_out,float *dev_b_out, int trials){ /*------ MCMC ----------------------------------------------------*/ int i; //printf("K: %d \n ",K); int id = threadIdx.x + blockIdx.x * blockDim.x; - + int nTasks = nBlocks*blockDim.x; //int start = blockIdx.x * K/nBlocks; int start = id * K/nTasks; - int lengthPerBlock = K/nTasks; + int lengthPerBlock = K/nTasks; //partition the data int *yy; yy = y+start; float *nn; nn = n+start; - + float *sTheta; sTheta = theta+start; float *sLogTheta; - sLogTheta = log_theta+start; - /* + sLogTheta = log_theta+start; + /* printf("length per block:%d\n",lengthPerBlock); printf("start is:%d\n",start); printf("partial data under block id: %d \n ",blockIdx.x); @@ -252,7 +267,7 @@ __global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandStat - + /* Steps of MCMC */ int rstart = id*trials/nTasks; int rlenPerTask = trials; @@ -260,12 +275,12 @@ __global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandStat subA = dev_a_out+rstart; float *subB; subB = dev_b_out+rstart; - - - for(i = 0; i < rlenPerTask; i++){ + + + for(i = 0; i < rlenPerTask; i++){ //sample_theta<<>>(devStates, dev_theta, dev_log_theta, dev_y, dev_n, a, b, K); sample_theta_seq(sTheta, sLogTheta, yy, nn, a, b, lengthPerBlock, state); - + float flat_sum=0; for(int ii=0;ii 1){ fp = fopen(argv[1], "r"); } else { @@ -321,33 +336,33 @@ __host__ void load_data(int argc, char **argv, int *K, int **y, float **n){ *K = 0; while( fgets (line, sizeof line, fp) != NULL ) - (*K)++; + (*K)++; rewind(fp); *y = (int*) malloc((*K) * sizeof(int)); - *n = (float*) malloc((*K) * sizeof(float)); - + *n = (float*) malloc((*K) * sizeof(float)); + for(k = 0; k < *K; k++) - fscanf(fp, "%d %f", *y + k, *n + k); - + fscanf(fp, "%d %f", *y + k, *n + k); + fclose(fp); } /* - * Metropolis algorithm for producing random a values. + * Metropolis algorithm for producing random a values. * The proposal distribution in normal with a variance that * is adjusted at each step. */ - + __device__ float sample_a(curandState *state,int id, float a, float b, int K, float sum_logs){ float sigma = 2.0; float norm = curand_normal(&state[id]); float U, log_acceptance_ratio, proposal = norm * sigma + a; //printf("rnorm result:%f \n",norm*sigma); - if(proposal <= 0) + if(proposal <= 0) return a; log_acceptance_ratio = (proposal - a) * sum_logs + @@ -380,7 +395,7 @@ __device__ float sample_b(curandState *state,int id, float a, int K, float flat_ } -/* +/* * Box-Muller Transformation: Generate one standard normal variable. * * This algorithm can be more efficiently used by producing two @@ -403,7 +418,7 @@ printf("host RANDMAX result:%f \n",RAND_MAX); return V1; } - + __device__ float rnorm(curandState *state,int id){ float U1 = curand_uniform(&state[id]); @@ -423,7 +438,7 @@ printf("RANDMAX result:%f \n",RAND_MAX); * See device rgamma function. This is probably not the * fastest way to generate random gamma variables on a CPU. */ - + __host__ float rgamma(float a, float b){ float d = a - 1.0 / 3; @@ -433,26 +448,26 @@ __host__ float rgamma(float a, float b){ Y = rnorm(); v = pow((1 + Y / sqrt(9 * d)), 3); - // Necessary to avoid taking the log of a negative number later. - if(v <= 0) + // Necessary to avoid taking the log of a negative number later. + if(v <= 0) continue; - + U = rand() / float(RAND_MAX); - // Accept the sample under the following condition. - // Otherwise repeat loop. + // Accept the sample under the following condition. + // Otherwise repeat loop. if(log(U) < 0.5 * pow(Y,2) + d * (1 - v + log(v))) return d * v / b; } } -/* - * Generate a single Gamma distributed random variable by the Marsoglia +/* + * Generate a single Gamma distributed random variable by the Marsoglia * algorithm (George Marsaglia, Wai Wan Tsang; 2001). * * Zeb chose this algorithm because it has a very high acceptance rate (>96%), - * so this while loop will usually only need to run a few times. Many other - * algorithms, while perhaps faster on a CPU, have acceptance rates on the + * so this while loop will usually only need to run a few times. Many other + * algorithms, while perhaps faster on a CPU, have acceptance rates on the * order of 50% (very bad in a massively parallel context). */ @@ -461,17 +476,17 @@ __device__ float rgamma(curandState *state, int id, float a, float b){ float d = a - 1.0 / 3; float Y, U, v; - while(1){ + while(1){ Y = curand_normal(&state[id]); v = pow((1 + Y / sqrt(9 * d)), 3); /* Necessary to avoid taking the log of a negative number later. */ - if(v <= 0) + if(v <= 0) continue; - + U = curand_uniform(&state[id]); - /* Accept the sample under the following condition. + /* Accept the sample under the following condition. Otherwise repeat loop. */ if(log(U) < 0.5 * pow(Y,2) + d * (1 - v + log(v))) return d * v / b; @@ -482,14 +497,14 @@ __device__ float rgamma(curandState *state, int id, float a, float b){ /* * Sample each theta from the appropriate gamma distribution */ - -__global__ void sample_theta(curandState *state, - float *theta, float *log_theta, int *y, float *n, + +__global__ void sample_theta(curandState *state, + float *theta, float *log_theta, int *y, float *n, float a, float b, int K){ - + int id = threadIdx.x + blockIdx.x * blockDim.x; float hyperA, hyperB; - + if(id < K){ hyperA = a + y[id]; hyperB = b + n[id]; @@ -499,14 +514,14 @@ __global__ void sample_theta(curandState *state, } -/* - * Initialize GPU random number generators +/* + * Initialize GPU random number generators */ - + __global__ void setup_kernel(curandState *state, unsigned int seed, int K){ int id = threadIdx.x + blockIdx.x * blockDim.x; - + if(id < K) curand_init(seed, id, 0, &state[id]); -} +}