|
@@ -87,20 +87,22 @@ __global__ void seqMetroProcess(int K, int nBlocks, int *y, float *n, curandStat |
|
|
__device__ void sample_theta_seq(float *theta, float *log_theta, int *y, float *n, |
|
|
float a, float b, int K, curandState *state); |
|
|
|
|
|
__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int *tDot,curandState *state, int nBlocks, int nThreads); |
|
|
__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int *tDot,curandState *state, int nBlocks, int nThreads, float *h, float *results); |
|
|
|
|
|
__global__ void sampleTdot(int trials, int *tDot,curandState *state); |
|
|
|
|
|
__device__ float posteriorMean(int *tDot, float *dev_x_out, int M, int nThreads); |
|
|
|
|
|
__device__ float computeW(int *tDot, float *dev_x_out, int M, int nThreads, float h); |
|
|
__device__ float computeW(int *tDot, float *dev_x_out, float *dev_y_out, int M, int nThreads, float h); |
|
|
|
|
|
__device__ float normPDF(float x, float mean, float sigma); |
|
|
|
|
|
__device__ float general_normoral(curandState *state, float mean, float variance); |
|
|
|
|
|
int main(int argc, char **argv){ |
|
|
|
|
|
curandState *devStates; |
|
|
float a, b, *n, *dev_n, *dev_theta, *dev_log_theta, *dev_a_out, *dev_b_out; |
|
|
float a, b, *n, *dev_n, *dev_theta, *dev_log_theta, *dev_a_out, *dev_b_out, *results; |
|
|
int K, *y, *dev_y, nBlocks,nThreads, trials = 1000; |
|
|
|
|
|
if(argc>4){ |
|
@@ -137,8 +139,9 @@ int main(int argc, char **argv){ |
|
|
/* Allocate space for random states on device */ |
|
|
CUDA_CALL(cudaMalloc((void **)&devStates, K * sizeof(curandState))); |
|
|
/////////////////////////allocate space on dev memory to save a and b |
|
|
CUDA_CALL(cudaMalloc((void **)&dev_a_out,nBlocks*nThreads*trials*sizeof(float))); |
|
|
CUDA_CALL(cudaMalloc((void **)&dev_b_out,nBlocks*nThreads*trials*sizeof(float))); |
|
|
CUDA_CALL(cudaMalloc((void **)&dev_a_out, nBlocks*nThreads*trials*sizeof(float))); |
|
|
CUDA_CALL(cudaMalloc((void **)&dev_b_out, nBlocks*nThreads*trials*sizeof(float))); |
|
|
CUDA_CALL(cudaMalloc((void **)&results, nThreads*sizeof(float))); |
|
|
|
|
|
/*------ Setup random number generators (one per thread) ---------*/ |
|
|
|
|
@@ -161,8 +164,12 @@ int main(int argc, char **argv){ |
|
|
|
|
|
int numThreads = trials / nBlocks; // TODO: This should be a whole number |
|
|
mergePosterior<<<1,1>>>(trials,dev_a_out,dev_b_out,tDot,devStates, nBlocks, nThreads); |
|
|
|
|
|
|
|
|
/*------ Free Memory -------------------------------------------*/ |
|
|
|
|
|
|
|
|
|
|
|
free(y); |
|
|
free(n); |
|
|
|
|
@@ -183,7 +190,7 @@ __global__ void sampleTdot(int trials, int *tDot,curandState *state){ |
|
|
} |
|
|
|
|
|
|
|
|
__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int *tDot,curandState *state, int nBlocks, int nThreads){ |
|
|
__global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int *tDot,curandState *state, int nBlocks, int nThreads, float *h, float *results){ |
|
|
|
|
|
// printf("\n all blocks finished\n"); |
|
|
// for(int j = 0; j < nBlocks*2 ; j++) { |
|
@@ -197,17 +204,31 @@ __global__ void mergePosterior(int trials, float *dev_a_out,float *dev_b_out,int |
|
|
int id = threadIdx.x + blockIdx.x * blockDim.x; |
|
|
float h = id^(-1/(4+2)); |
|
|
|
|
|
for (int i=0; i < M; i++) { |
|
|
int *cDot; |
|
|
CUDA_CALL(cudaMemcpy(cDot, tDot, sizeof(int) * trials, cudaMemcpyDeviceToDevice)); |
|
|
int cm = (curand(&state[id]) % (trials-1)) + 1; |
|
|
int u = curand_uniform(&state[id]); |
|
|
// TODO: Compute wcDot |
|
|
// TODO: Compute WtDot |
|
|
if () |
|
|
for (int i=0; i < nThreads; i++) { |
|
|
for (int m=0; m < M; m++) { |
|
|
int *cDot; |
|
|
CUDA_CALL(cudaMemcpy(cDot, tDot, sizeof(int) * trials, cudaMemcpyDeviceToDevice)); |
|
|
cDot[m] = (curand(&state[id]) % (trials-1)) + 1; |
|
|
int u = curand_uniform(&state[id]); |
|
|
float wcDot = computeW(cDot, dev_a_out, dev_b_out, M, nThreads, h[i]); |
|
|
float wtDot = computeW(tDot, dev_a_out, dev_b_out, M, nThreads, h[i]); |
|
|
if (u < (wcDot/ wtDot) ) |
|
|
CUDA_CALL(cudaMemcpy(tDot, cDot, sizeof(int) * trials, cudaMemcpyDeviceToDevice)); |
|
|
} |
|
|
// TODO: Draw from Multivariate Normal, and Save into the results |
|
|
float posterior_mean_a = posteriorMean(tDot, dev_a_out, M, nThreads); |
|
|
float posterior_mean_b = posteriorMean(tDot, dev_b_out, M, nThreads); |
|
|
float variance = (h[i]^2)/M; |
|
|
|
|
|
printf('%f %f' , norm(state, posterior_mean_a, variance), norm(state, posterior_mean_b, variance)); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
__device__ float general_normoral(curandState *state, float mean, float variance) { |
|
|
return curand_normal(&state[id]) * variance + mean; |
|
|
} |
|
|
|
|
|
__device__ float posteriorMean(int *tDot, float *dev_x_out, int M, int nThreads) { |
|
|
float sum = 0; |
|
|
for (int i=0; i < M; i++) { |
|
@@ -218,22 +239,25 @@ __device__ float posteriorMean(int *tDot, float *dev_x_out, int M, int nThreads) |
|
|
return mean; |
|
|
} |
|
|
|
|
|
__device__ float computeW(int *tDot, float *dev_x_out, int M, int nThreads, float h) { |
|
|
float posterior_mean = posteriorMean(tDot, dev_x_out, M, nThreads); |
|
|
__device__ float computeW(int *tDot, float *dev_x_out, float *dev_y_out, int M, int nThreads, float h) { |
|
|
float posterior_mean_x = posteriorMean(tDot, dev_x_out, M, nThreads); |
|
|
float posterior_mean_y = posteriorMean(tDot, dev_y_out, M, nThreads); |
|
|
float product = 1; |
|
|
float posterior_m_tm; |
|
|
float posterior_m_tm_x; |
|
|
float posterior_m_tm_y; |
|
|
|
|
|
for (int i=0; i < M; i++) { |
|
|
int index = tDot[i] + i * nThreads; // trial m of posterior m (note: i = blockId) |
|
|
posterior_m_tm = dev_x_out[index]; // posterior_m_tm |
|
|
product *= normPDF(posterior_m_tm, posterior_mean, h^2); |
|
|
posterior_m_tm_x = dev_x_out[index]; // posterior_m_tm of x |
|
|
posterior_m_tm_y = dev_y_out[index]; // posterio_m_tm_ of y |
|
|
product *= normPDF(posterior_m_tm_x, posterior_mean_x, h^2) * normPDF(posterior_m_tm_y, posterior_mean_y, h^2); |
|
|
} |
|
|
return product; |
|
|
} |
|
|
|
|
|
__device__ float normPDF(float x, float mean, float sigma) { |
|
|
float denominator = sqrtf(2*PI*(sigma^2)); |
|
|
float numerator = expf( -1 * (x-mean)^2 / (2*sigma^2) ); |
|
|
__device__ float normPDF(float x, float mean, float variance) { |
|
|
float denominator = sqrtf(2*PI*(variance^2)); |
|
|
float numerator = expf( -1 * (x-mean)^2 / (2*variance^2) ); |
|
|
return numerator/denominator; |
|
|
} |
|
|
|
|
|