# Problem 4.7.5, part 1: Monte Carlo estimate of the mean and standard
# deviation of the ratio y/x for independent normal x and y.

SEED = 0           # fixed so that Restart & Run All reproduces the printed numbers
N_DRAWS = 10000    # number of simulated (x, y) pairs

# A single shared random stream is used for both variables. Passing the same
# integer seed to each rvs() call separately would make the x and y draws
# perfectly dependent, so one RandomState object is handed to both calls.
rng = np.random.RandomState(SEED)

# scipy's norm(loc, scale) takes the *standard deviation* as its second
# argument, so these are mean 4 / sd 1 and mean 3 / sd 2.
x = norm(4.0, 1.0)
y = norm(3.0, 2.0)

samples_x = x.rvs(N_DRAWS, random_state=rng)
samples_y = y.rvs(N_DRAWS, random_state=rng)

# Element-wise ratio of the paired draws.
z = samples_y / samples_x

# np.std uses the same population (ddof=0) convention as sqrt(np.var(z)).
print('mean:', np.mean(z), 'sd:', np.std(z))
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "h=norm(.807,.657)\n", + "fig,ax=plt.subplots(2,2)\n", + "ax[0,0].set_xlim(-5,5)\n", + "j=ax[0,0].hist(z,bins=100,density=True)\n", + "j=ax[0,1].hist(samples_x,bins=50,density=True)\n", + "j=ax[1,1].hist(samples_y,bins=50,density=True)\n", + "ax[0,0].plot(np.linspace(-5,5,100),h.pdf(np.linspace(-5,5,100)))\n", + "\n", + "b=plt.hist(z,bins=100)" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([1.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 1.000e+00, 1.000e+00, 6.000e+00, 7.000e+00, 1.800e+01,\n", + " 5.800e+01, 1.330e+02, 3.230e+02, 6.530e+02, 1.059e+03, 1.401e+03,\n", + " 1.615e+03, 1.410e+03, 1.124e+03, 7.740e+02, 5.150e+02, 3.100e+02,\n", + " 2.150e+02, 1.320e+02, 7.000e+01, 5.800e+01, 2.700e+01, 2.600e+01,\n", + " 2.200e+01, 7.000e+00, 5.000e+00, 7.000e+00, 4.000e+00, 2.000e+00,\n", + " 3.000e+00, 2.000e+00, 1.000e+00, 0.000e+00, 1.000e+00, 1.000e+00,\n", + " 0.000e+00, 0.000e+00, 2.000e+00, 0.000e+00, 0.000e+00, 1.000e+00,\n", + " 0.000e+00, 1.000e+00, 1.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 1.000e+00, 0.000e+00,\n", + " 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00]),\n", + " array([-9.34383882, -9.13726736, -8.9306959 , -8.72412443, -8.51755297,\n", + " -8.31098151, -8.10441005, -7.89783858, -7.69126712, -7.48469566,\n", + " -7.2781242 , -7.07155274, 
# Echo `b`, the value returned by the 100-bin histogram call on z in the
# plotting cell: a (counts, bin_edges, patches) tuple. The counts array shows
# how a handful of extreme ratios stretch the bin range far beyond the bulk
# of the distribution.
b
0000000..e75cef4 --- /dev/null +++ b/BDA 5.9.6.ipynb @@ -0,0 +1,144 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Problem 5.9.6. Exchangeable Models\n", + "\n", + "1. In the divorce rate example of Section 5.2, set up a prior distribution for the values $y_1, \\ldots, y_8$ that allows for one low value (Utah) and one high value (Nevada), with independent and identical distributions for the other six values. This prior distribution should be exchangeable, because it is not known which of the eight states correspond to Utah and Nevada. \n", + "\n", + "2. Determine the posterior distribution for $y_8$ under this model given the observed values of $y_1, \\ldots, y_7$ given in the example. This posterior distribution should probably have two or three modes, corresponding to the possibilities that the missing state is Utah, Nevada, or one of the other six. \n", + "\n", + "3. Now consider the entire set of eight data points, including the value for $y_8$ given at the end of the example. Are these data consistent with the prior distribution you gave in part (1) above? In particular, did your prior distribution allow for the possibility that the actual data have an outlier (Nevada) at the high end, but no outlier at the low end?\n", + "\n", + "The states are Arizona, Colorado, Idaho, Montana, Nevada, New Mexico, Utah, and Wyoming.\n", + "\n", + "The rates from seven of these states are 5.8, 6.6, 7.8, 5.6, 7.0, 7.1, 5.4 divorces per 1000 population per year. \n", + "\n", + "Gelman, Andrew; Carlin, John B.; Stern, Hal S.; Dunson, David B.; Vehtari, Aki; Rubin, Donald B. Bayesian Data Analysis, Third Edition (Chapman & Hall/CRC Texts in Statistical Science) (Page 135). CRC Press. Kindle Edition. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean: 6.471428571428571 variance: 0.6877551020408161\n" + ] + } + ], + "source": [ + "from scipy.stats import norm\n", + "from itertools import permutations\n", + "import matplotlib.pyplot as plt\n", + "\n", + "rates=np.array([5.8,6.6,7.8,5.6,7.0,7.1,5.4])\n", + "print('mean:',rates.mean(),'variance:',rates.var())\n" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [], + "source": [ + "s=range(8)\n", + "def prior(x):\n", + " L=0\n", + " for i,j in permutations(s,2):\n", + " L0=1\n", + " L0=L0*norm.pdf(x[i],loc=8.7,scale=.8)\n", + " L0=L0*norm.pdf(x[j],loc=4.3,scale=.8)\n", + " for k in s:\n", + " if k!=i and k!=j:\n", + " L0=L0*norm.pdf(x[k],loc=6.5,scale=.8)\n", + " L=L+L0\n", + " return(L)\n", + "\n", + "def like(x):\n", + " return prior(np.append(x,rates))" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [], + "source": [ + "x=np.linspace(0.0,15.0,1000)\n", + "y=np.array([like(i) for i in x])" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.plot(x,y)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "I have a lot of questions about this problem, starting with: have I computed the right thing? I am not sure.\n", + "As far as the last part, No, my prior distribution did not allow for a high value without a low value; it expected one of each." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/BDA 5.9.8.ipynb b/BDA 5.9.8.ipynb new file mode 100644 index 0000000..d3ea631 --- /dev/null +++ b/BDA 5.9.8.ipynb @@ -0,0 +1,144 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Discrete Mixture Models\n", + "\n", + "Discrete mixture models: if $p_m(\\theta)$, for $m=1,\\ldots,M$ are conjugate prior densities for the sampling model $y|\\theta$, show that the class of finite mixture prior densities given by \n", + "$$\n", + "p(\\theta)=\\sum_{1}^{M} \\lambda_m p_m(\\theta)\n", + "$$\n", + "is also a conjugate class, where the $\\lambda_m$’s are nonnegative weights that sum to 1. This can provide a useful extension of the natural conjugate prior family to more flexible distributional forms. As an example, use the mixture form to create a bimodal prior density for a normal mean, that is thought to be near $1$, with a standard deviation of $0.5$, but has a small probability of being near $−1$, with the same standard deviation. 
If the variance of each observation $y_1,\\ldots,y_{10}$ is known to be $1$, and their observed mean is $\\overline{y} = -0.25$, derive your posterior distribution for the mean, making a sketch of both prior and posterior densities. Be careful: the prior and posterior mixture proportions are different.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's skip the theory part and look at the example.\n", + "\n", + "We have\n", + "$$\n", + "p(\\theta|y_1,\\ldots,y_{10})\\propto p(y_1,\\ldots,y_{10}|\\theta)p(\\theta)$$\n", + "so\n", + "$$\n", + "p(\\theta|\\{y_{i}\\})\\propto \\sum_{m} \\lambda_{m}p(\\{y_{i}\\}|\\theta)p_{m}(\\theta)\n", + "$$\n", + "\n", + "Each of the terms $p_{m}(\\theta)p(\\{y_{i}\\}|\\theta)$\n", + "is equal to $p_{m}(\\theta|\\{y_{i}\\})p_{m}(\\{y_{i}\\})$.\n", + "\n", + "Therefore the total posterior density is a weighted sum\n", + "of the individual posteriors:\n", + "\n", + "$$p(\\theta|\\{y_{i}\\})=\\sum_{m} c_{m}p_{m}(\\theta|\\{y_{i}\\})$$\n", + "where \n", + "$$\n", + "c_{m}=\\frac{\\lambda_m p_{m}(\\{y_{i}\\})}{\\sum_{k} \\lambda_k p_{k}(\\{y_{i}\\})}\n", + "$$\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the special case under consideration, $p_1$ is normal with mean $-1$ and $\\sigma=.5$, $p_2$ is normal with mean $1$ and $\\sigma=.5$ and we can set $\\lambda_1=.1$ and $\\lambda_2=.9$. The $p_m(\\{y_{i}\\})$ can be calculated from the $t$ distribution. Drawing a sample of size $10$ from $p_1$ and getting a sample mean of $-.25$ and a sample variance of $1$ gives a $t$-statistic of $\\sqrt{10}(-.25+1)$ in the first case and $\\sqrt{10}(-.25-1)$ in the second. **Note:** the variance of the observations is *known* to be $1$ here rather than estimated from the sample, so a $t$ distribution is not actually needed: as a function of $m$, $p_m(\\{y_{i}\\})$ is proportional to the marginal density of the sample mean, which is exactly normal, $\\overline{y}\\,|\\,m\\sim N(\\mu_m,\\ 0.5^2+1/10)$ with $\\mu_1=-1$ and $\\mu_2=1$. Using these normal densities gives $c_1\\approx 0.32$ and $c_2\\approx 0.68$. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.041664931082753924\n", + "0.0035119750957915393\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "from scipy.stats import norm, t\n", + "t_1=np.sqrt(9)*.75\n", + "t_2=np.sqrt(9)*1.25\n", + "print(t.pdf(t_1,df=9))\n", + "print(t.pdf(t_2,df=9))" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.5655172413793104" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + ".1*.041/(.1*.041+.9*.0035)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.43448275862068964" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + ".9*.0035/(.1*.041+.9*.0035)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Useful Formulae.ipynb b/Useful Formulae.ipynb new file mode 100644 index 0000000..3fc65f4 --- /dev/null +++ b/Useful Formulae.ipynb @@ -0,0 +1,144 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Conjugate Normal Distributions (known variance)\n", + "\n", + "We are trying to learn about the unknown mean of a normal distribution with known variance. 
\n", + "We choose a normal prior distribution with mean $\\mu_{0}$ and variance $\\tau_{0}^2$. \n", + "We draw $n$ values $y_1,\\ldots, y_n$ from the distribution with known variance $\\sigma^2$. The posterior distribution\n", + "$p(\\mu|y_1,\\ldots,y_n)\\propto p(y_1,\\ldots,y_n|\\mu)p(\\mu)$ is again normal. Let \n", + "$$\n", + "\\overline{y}=\\frac{1}{n}\\sum_{i=1}^{n} y_i\n", + "$$\n", + "be the sample mean. \n", + "\n", + "The posterior variance $\\tau_1^2$\n", + "is given by\n", + "$$\\frac{1}{\\tau_1^2}=\\frac{1}{\\tau_0^2}+\\frac{n}{\\sigma^2}$$\n", + "and the posterior mean is\n", + "$$\n", + "\\mu_1=\\frac{\\frac{\\mu_0}{\\tau_0^2}+\\frac{n\\overline{y}}{\\sigma^2}}{\\frac{1}{\\tau_{1}^2}}\n", + "$$\n", + "\n", + "The posterior predictive distribution of a new observation $z$,\n", + "$$\n", + "p( z |y)=\\int p(z|\\mu)\\,p(\\mu|y)\\, d\\mu,\n", + "$$\n", + "is a normal distribution with mean equal to the posterior mean $\\mu_1$ and variance equal to $\\sigma^2+\\tau_1^2$\n", + "where $\\tau_1^2$ is the posterior variance.\n", + "\n", + "See Pages 39-42 of BDA (Section 2.5) for more information." 
def posterior(prior_mean, prior_variance, sample_mean, pop_variance, n):
    """Conjugate update for a normal mean with known observation variance.

    Parameters
    ----------
    prior_mean, prior_variance : float
        Mean mu_0 and variance tau_0^2 of the normal prior on the mean.
    sample_mean : float
        Mean of the ``n`` observations.
    pop_variance : float
        Known variance sigma^2 of a single observation.
    n : int
        Number of observations (0 returns the prior unchanged).

    Returns
    -------
    (post_mean, post_var) : tuple of float
        Mean mu_1 and variance tau_1^2 of the normal posterior, where
        1/tau_1^2 = 1/tau_0^2 + n/sigma^2 and mu_1 is the precision-weighted
        average of the prior mean and the sample mean.

    Raises
    ------
    ValueError
        If a variance is not positive or ``n`` is negative.
    """
    if np.any(np.asarray(prior_variance) <= 0) or np.any(np.asarray(pop_variance) <= 0):
        raise ValueError("prior_variance and pop_variance must be positive")
    if np.any(np.asarray(n) < 0):
        raise ValueError("n must be non-negative")

    post_precision = 1.0 / prior_variance + n / pop_variance
    post_var = 1.0 / post_precision
    post_mean = (prior_mean / prior_variance + sample_mean * n / pop_variance) / post_precision
    return post_mean, post_var


def post_sample(y, prior_mean, prior_variance, sample_mean, pop_variance, n):
    """Posterior predictive density of one new observation, evaluated at ``y``.

    The new observation is normal with mean equal to the posterior mean and
    variance equal to ``pop_variance`` plus the posterior variance. The
    remaining parameters are the same as for :func:`posterior`.
    """
    post_mean, post_var = posterior(prior_mean, prior_variance, sample_mean, pop_variance, n)
    return norm.pdf(y, post_mean, np.sqrt(pop_variance + post_var))


def marginal_mean_density(sample_mean, prior_mean, prior_variance, pop_variance, n):
    """Marginal (prior predictive) density of the mean of ``n`` observations.

    Integrating the unknown mean out against its normal prior gives
    sample_mean ~ N(prior_mean, prior_variance + pop_variance / n). This is
    the quantity that compares how well different prior components explain
    the data; the posterior predictive density from :func:`post_sample` is
    not, because it has already been conditioned on those data.
    """
    return norm.pdf(sample_mean, prior_mean, np.sqrt(prior_variance + pop_variance / n))


# Worked example (problem 5.9.8): two-component normal mixture prior with
# weights 0.1 / 0.9 on components centred at -1 / +1 (variance 0.25 each),
# ten observations with known variance 1 and observed mean -0.25.
marginal_neg = marginal_mean_density(-.25, -1, .25, 1, 10)
marginal_pos = marginal_mean_density(-.25, 1, .25, 1, 10)

# The normalising constant must include the prior weights; without them the
# two posterior weights do not sum to one.
evidence = .1 * marginal_neg + .9 * marginal_pos
weight_neg = .1 * marginal_neg / evidence
weight_pos = .9 * marginal_pos / evidence

print('posterior weight of component at -1:', weight_neg)
print('posterior weight of component at +1:', weight_pos)
"language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}