\documentclass{article}
% set up telugu
\usepackage{fontspec}
\newfontfamily\telugufont{Potti Sreeramulu}[Script = Telugu]
\usepackage{polyglossia}
\setdefaultlanguage{english}
\setotherlanguage{telugu}

%other packages
\usepackage{amsmath}
\usepackage{bm}
\usepackage{amssymb}
\usepackage{physics}
\usepackage{siunitx}
\usepackage{todonotes}
\usepackage{luacode}
\usepackage{titling}
\usepackage{enumitem}

% custom deepak packages
\usepackage{luatrivially}
\usepackage{subtitling}

\usepackage{cleveref}

\begin{luacode*}
math.randomseed(31415926)
\end{luacode*}

\renewcommand{\vec}{\bm}
\newcommand{\transpose}[1]{#1^T}

\title{Problem 1.15}
\subtitle{Fisher information and Cramér–Rao}
\author{\begin{telugu}హృదయ్ దీపక్ మల్లుభొట్ల\end{telugu}}
% want empty date
\predate{}
\date{}
\postdate{}

% !TeX spellcheck = en_GB
\begin{document}
\maketitle
We're going to look at the least-squares model.
\subsubsection*{(a) Dependence only on parameter distance.}
A least-squares model $y_i(\vec{\theta})$ for $N$ data points $d_i \pm \sigma$ with independent, normally distributed measurement errors predicts a likelihood for finding a value $\vec{x} = \left\{x_i\right\}$ of the data $\left\{d_i\right\}$ given by
\begin{equation}
P(\vec{x} | \vec{\theta}) = \frac{e^{- \sum_i \flatfrac{\left(y_i(\theta) - x_i\right)^2}{2 \sigma^2}}}{\left(2 \pi \sigma^2\right)^{\flatfrac{N}{2}}}
\end{equation}

How big is the probability density that a least-squares model with true parameters $\vec{\theta}$ would give experimental results implying a different set of parameters $\vec{\phi}$?
Show that it depends only on the distance $\abs{\vec{y}(\vec{\theta}) - \vec{y}(\vec{\phi})}$ between the vectors in prediction space.

\subsubsection*{(b) Metric}
Remember that the metric tensor $g_{\alpha\beta}$ gives the distance on the manifold between two nearby points.
The squared distance between points with coordinates $\vec{\theta}$ and $\vec{\theta} + \epsilon \vec{\Delta}$ is $\epsilon^2 \sum_{\alpha\beta} g_{\alpha\beta} \Delta_\alpha \Delta_\beta$.

Show that the least-squares metric is
\begin{equation}
g_{\alpha\beta} = \frac{\left(\transpose{J}J\right)_{\alpha\beta}}{\sigma^2},
\end{equation}
where the Jacobian is
\begin{equation}
J_{i\alpha} = \pdv{y_i}{\theta_\alpha}.
\end{equation}

\subsubsection*{(c) The Fisher Information Matrix}
In general, the natural metric is given by the FIM:
\begin{equation}
g_{\alpha \beta}(\vec{\theta}) = - \left<\pdv[2]{\log{P(\vec{x} | \vec{\theta} )}}{\theta_\alpha}{\theta_\beta} \right>_{\vec{x}}
\end{equation}

\section{Solution} \label{sec:solution}
\subsection*{(a) Least squares distance}
\begin{equation}
P(\vec{x} | \vec{\theta}) = \frac{e^{- \sum_i \flatfrac{\left(y_i(\theta) - x_i\right)^2}{2 \sigma^2}}}{\left(2 \pi \sigma^2\right)^{\flatfrac{N}{2}}}
\end{equation}

We're interested in $P(\vec{y}(\vec{\phi}) | \vec{\theta})$:
\begin{align}
P(\vec{y}(\vec{\phi}) | \vec{\theta}) &= \frac{e^{- \sum_i \flatfrac{\left(y_i(\vec{\phi}) - y_i(\vec{\theta})\right)^2}{2 \sigma^2}}}{\left(2 \pi\sigma^2\right)^{\flatfrac{N}{2}}}. \label{eq:a:pbeforesimple}
\end{align}
Look at this:
\begin{equation}
\sum_i \left(y_i(\vec{\phi}) - y_i(\vec{\theta})\right)^2.
\end{equation}
What even is that expression?
Well, you might recognise it, fool.
It's just $\sum_i \left(\vec{y}(\vec{\phi}) - \vec{y}(\vec{\theta})\right)_i^2$, otherwise known as
\begin{equation}
\abs{\vec{y}(\vec{\phi}) - \vec{y}(\vec{\theta})}^2.
\end{equation}
So plug that into \cref{eq:a:pbeforesimple} and see what happens:
\begin{align}
P(\vec{y}(\vec{\phi}) | \vec{\theta}) &= \frac{e^{- \flatfrac{\abs{\vec{y}(\vec{\phi}) - \vec{y}(\vec{\theta})}^2}{2 \sigma^2}}}{\left(2 \pi\sigma^2\right)^{\flatfrac{N}{2}}} \\
P(\vec{y}(\vec{\phi}) | \vec{\theta}) &= \frac{e^{-\frac12 \left(\frac{\abs{\vec{y}(\vec{\phi}) - \vec{y}(\vec{\theta})}}{\sigma}\right)^2}}{\left(2 \pi\sigma^2\right)^{\flatfrac{N}{2}}}. \label{eq:a:paftersimple}
\end{align}

That depends only on the scaled distance $\flatfrac{\abs{\vec{y}(\vec{\phi}) - \vec{y}(\vec{\theta})}}{\sigma}$, as we were told to expect.
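Equivalently, taking the logarithm of \cref{eq:a:paftersimple} makes the claim explicit (this is nothing new, just a restatement):
\begin{align}
\log P(\vec{y}(\vec{\phi}) | \vec{\theta}) &= -\frac12 \left(\frac{\abs{\vec{y}(\vec{\phi}) - \vec{y}(\vec{\theta})}}{\sigma}\right)^2 - \frac{N}{2} \log\left(2 \pi \sigma^2\right),
\end{align}
so the only $\vec{\phi}$-dependence enters through the distance in prediction space.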
\subsection*{(b) Metric tensor}
Let's start by defining the rescaled squared distance
\begin{align}
d^2 = \left(\frac{\abs{\vec{y}(\vec{\phi}) - \vec{y}(\vec{\theta})}}{\sigma}\right)^2
\end{align}
and look at what this is with $\vec{\phi} = \vec{\theta} + \epsilon \vec{\Delta}$, particularly as $\epsilon \rightarrow 0$.
\begin{align}
d^2 &= \left(\frac{\abs{\vec{y}(\vec{\theta} + \epsilon \vec{\Delta}) - \vec{y}(\vec{\theta})}}{\sigma}\right)^2
\end{align}
What's $\vec{y}(\vec{\theta} + \epsilon \vec{\Delta})$?
As we expand it, let's use Einstein summation notation.
To first order in $\epsilon$ it is
\begin{align}
\vec{y}(\vec{\theta} + \epsilon \vec{\Delta}) &= \vec{y}(\vec{\theta}) + \epsilon \Delta_\alpha \pdv{\vec{y}(\vec{\theta})}{\theta_\alpha}
\end{align}
Now take the squared magnitude of the difference from $\vec{y}(\vec{\theta})$:
\begin{align}
\abs{\vec{y}(\vec{\theta} + \epsilon \vec{\Delta}) - \vec{y}(\vec{\theta})}^2 &= \sum_i \left(y_i(\vec{\theta} + \epsilon \vec{\Delta}) - y_i(\vec{\theta})\right)^2 \\
&= \sum_i \left( y_i(\vec{\theta}) + \epsilon \Delta_\alpha \pdv{y_i(\vec{\theta})}{\theta_\alpha} - y_i(\vec{\theta})\right)^2 \\
&= \sum_i \left(\epsilon \Delta_\alpha \pdv{y_i(\vec{\theta})}{\theta_\alpha}\right)^2 \\
&= \sum_{i} \left(\sum_\alpha \epsilon \Delta_\alpha \pdv{y_i(\vec{\theta})}{\theta_\alpha}\right)^2
\end{align}
Now this is the crucial nub of this stupid index twiddling.
When we square, the two copies of the sum carry independent dummy indices (as we can see by writing out a small example).
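For instance, with just two terms in the sum (writing $a_\alpha$ as shorthand for $\Delta_\alpha \pdv{y_i(\vec{\theta})}{\theta_\alpha}$ at fixed $i$, purely for this illustration):
\begin{align}
\left(\sum_\alpha a_\alpha\right)^2 = \left(a_1 + a_2\right)\left(a_1 + a_2\right) = a_1 a_1 + a_1 a_2 + a_2 a_1 + a_2 a_2 = \sum_{\alpha\beta} a_\alpha a_\beta,
\end{align}
which is exactly the double sum over independent indices that we want.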
So this really becomes:
\begin{align}
\abs{\vec{y}(\vec{\theta} + \epsilon \vec{\Delta}) - \vec{y}(\vec{\theta})}^2 &= \sum_{i} \left(\sum_\alpha \epsilon \Delta_\alpha \pdv{y_i(\vec{\theta})}{\theta_\alpha}\right) \left(\sum_\beta \epsilon \Delta_\beta \pdv{y_i(\vec{\theta})}{\theta_\beta}\right) \\
\abs{\vec{y}(\vec{\theta} + \epsilon \vec{\Delta}) - \vec{y}(\vec{\theta})}^2 &= \epsilon^2 \sum_{i, j} \left(\sum_\alpha \Delta_\alpha \pdv{y_i(\vec{\theta})}{\theta_\alpha}\right) \left(\sum_\beta \Delta_\beta \pdv{y_j(\vec{\theta})}{\theta_\beta}\right) \delta_{ij} \\
\abs{\vec{y}(\vec{\theta} + \epsilon \vec{\Delta}) - \vec{y}(\vec{\theta})}^2 &= \epsilon^2 \sum_{\alpha\beta} \left[\sum_{i, j} \pdv{y_i(\vec{\theta})}{\theta_\alpha} \pdv{y_j(\vec{\theta})}{\theta_\beta} \delta_{ij}\right] \Delta_\alpha \Delta_\beta.
\end{align}
The bit in square brackets is just matrix multiplication of our Jacobian with its transpose!
Comparing with our definition of the metric tensor, and remembering the factor of $\flatfrac{1}{\sigma^2}$ from the rescaled distance $d^2$, we have our result.
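Spelled out (nothing new, just combining the last few lines with the definition of $J$), the bracket is $\sum_{i,j} J_{i\alpha} J_{j\beta} \delta_{ij} = \left(\transpose{J}J\right)_{\alpha\beta}$, so to leading order in $\epsilon$
\begin{align}
d^2 = \frac{\abs{\vec{y}(\vec{\theta} + \epsilon \vec{\Delta}) - \vec{y}(\vec{\theta})}^2}{\sigma^2} = \epsilon^2 \sum_{\alpha\beta} \frac{\left(\transpose{J}J\right)_{\alpha\beta}}{\sigma^2} \Delta_\alpha \Delta_\beta,
\end{align}
and matching against $\epsilon^2 \sum_{\alpha\beta} g_{\alpha\beta} \Delta_\alpha \Delta_\beta$ gives $g_{\alpha\beta} = \flatfrac{\left(\transpose{J}J\right)_{\alpha\beta}}{\sigma^2}$.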
\subsection*{(c) FIM}
Start from the definition of the FIM:
\begin{align}
g_{\alpha \beta}(\vec{\theta}) &= - \left<\pdv[2]{\log{P(\vec{x} | \vec{\theta} )}}{\theta_\alpha}{\theta_\beta} \right>_{\vec{x}} \\
g_{\alpha \beta}(\vec{\theta}) &= - \left<\pdv[2]{\log{\frac{e^{- \sum_i \flatfrac{\left(y_i(\theta) - x_i\right)^2}{2 \sigma^2}}}{\left(2 \pi \sigma^2\right)^{\flatfrac{N}{2}}}}}{\theta_\alpha}{\theta_\beta} \right>_{\vec{x}} \\
g_{\alpha \beta}(\vec{\theta}) &= - \left<\pdv[2]{\log{e^{- \sum_i \flatfrac{\left(y_i(\theta) - x_i\right)^2}{2 \sigma^2}}}}{\theta_\alpha}{\theta_\beta} \right>_{\vec{x}} \\
g_{\alpha \beta}(\vec{\theta}) &= - \left<- \pdv[2]{\sum_i \flatfrac{\left(y_i(\theta) - x_i\right)^2}{2 \sigma^2}}{\theta_\alpha}{\theta_\beta} \right>_{\vec{x}} \\
g_{\alpha \beta}(\vec{\theta}) &= \frac{1}{2 \sigma^2} \left<\pdv[2]{\sum_i \left(y_i(\theta) - x_i\right)^2}{\theta_\alpha}{\theta_\beta} \right>_{\vec{x}} \label{eq:c:gbeforesub}
\end{align}
(The normalisation $\left(2 \pi \sigma^2\right)^{\flatfrac{N}{2}}$ only contributes a $\vec{\theta}$-independent constant to the log, so it drops out when we differentiate.)
Let's look at the first derivative:
\begin{align}
\pdv{\sum_i \left(y_i(\theta) - x_i\right)^2}{\theta_\beta} &= \sum_i \pdv{\left(y_i(\theta) - x_i\right)^2}{\theta_\beta} \\
\pdv{\sum_i \left(y_i(\theta) - x_i\right)^2}{\theta_\beta} &= \sum_i 2 \left(y_i(\theta) - x_i\right) \pdv{}{\theta_\beta} \left( y_i(\theta) - x_i \right) \\
\pdv{\sum_i \left(y_i(\theta) - x_i\right)^2}{\theta_\beta} &= 2 \sum_i \left(y_i(\theta) - x_i\right) \pdv{y_i}{\theta_\beta}
\end{align}
And the second derivative:
\begin{align}
\pdv[2]{\sum_i \left(y_i(\theta) - x_i\right)^2}{\theta_\alpha}{\theta_\beta} &= \pdv{}{\theta_\alpha} 2 \sum_i \left(y_i(\theta) - x_i\right) \pdv{y_i}{\theta_\beta} \\
\pdv[2]{\sum_i \left(y_i(\theta) - x_i\right)^2}{\theta_\alpha}{\theta_\beta} &= 2 \sum_i \left[\left(\pdv{}{\theta_\alpha} \left(y_i(\theta) - x_i\right) \right) \pdv{y_i}{\theta_\beta} + \left(y_i(\theta) - x_i\right) \pdv[2]{y_i}{\theta_\alpha}{\theta_\beta}\right] \\
\pdv[2]{\sum_i \left(y_i(\theta) - x_i\right)^2}{\theta_\alpha}{\theta_\beta} &= 2 \sum_i \left[\pdv{y_i}{\theta_\alpha} \pdv{y_i}{\theta_\beta} + \left(y_i(\theta) - x_i\right) \pdv[2]{y_i}{\theta_\alpha}{\theta_\beta}\right]
\end{align}
Now average that over $\vec{x}$:
\begin{align}
\left<\pdv[2]{\sum_i \left(y_i(\theta) - x_i\right)^2}{\theta_\alpha}{\theta_\beta} \right>_{\vec{x}} &= \left< 2 \sum_i \left[\pdv{y_i}{\theta_\alpha} \pdv{y_i}{\theta_\beta} + \left(y_i(\theta) - x_i\right) \pdv[2]{y_i}{\theta_\alpha}{\theta_\beta}\right]\right>_{\vec{x}} \\
\left<\pdv[2]{\sum_i \left(y_i(\theta) - x_i\right)^2}{\theta_\alpha}{\theta_\beta} \right>_{\vec{x}} &= 2 \sum_i \left[\pdv{y_i}{\theta_\alpha} \pdv{y_i}{\theta_\beta} + \left< \left(y_i(\theta) - x_i\right) \pdv[2]{y_i}{\theta_\alpha}{\theta_\beta}\right>_{\vec{x}}\right] \\
\left<\pdv[2]{\sum_i \left(y_i(\theta) - x_i\right)^2}{\theta_\alpha}{\theta_\beta} \right>_{\vec{x}} &= 2 \sum_i \left[\pdv{y_i}{\theta_\alpha} \pdv{y_i}{\theta_\beta} + \left(y_i(\theta) - \left<x_i \right>_{\vec{x}}\right) \pdv[2]{y_i}{\theta_\alpha}{\theta_\beta}\right] \label{eq:c:stepbeforesub}
\end{align}
What is $\left<x_i\right>_{\vec{x}}$?
Nothing but $y_i(\theta)$!
If that seems confusing, remember that the average looks like:
\begin{align}
\left<x_i\right>_{\vec{x}} = \int_{\vec{x}} \dd{\vec{x'}} P(\vec{x'} | \vec{\theta}) \, x_i'
\end{align}
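Since $P(\vec{x'} | \vec{\theta})$ factorises into independent Gaussians with means $y_i(\theta)$ and variance $\sigma^2$, the other components integrate to one and the $x_i'$ integral is just a Gaussian mean (a quick sanity check, nothing new):
\begin{align}
\left<x_i\right>_{\vec{x}} = \int \dd{x_i'} \frac{e^{- \flatfrac{\left(x_i' - y_i(\theta)\right)^2}{2 \sigma^2}}}{\sqrt{2 \pi \sigma^2}} \, x_i' = y_i(\theta).
\end{align}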
So \cref{eq:c:stepbeforesub} is simple:
\begin{align}
\left<\pdv[2]{\sum_i \left(y_i(\theta) - x_i\right)^2}{\theta_\alpha}{\theta_\beta} \right>_{\vec{x}} &= 2 \sum_i \left[\pdv{y_i}{\theta_\alpha} \pdv{y_i}{\theta_\beta} + \left(y_i(\theta) - y_i(\theta)\right) \pdv[2]{y_i}{\theta_\alpha}{\theta_\beta}\right] \\
\left<\pdv[2]{\sum_i \left(y_i(\theta) - x_i\right)^2}{\theta_\alpha}{\theta_\beta} \right>_{\vec{x}} &= 2 \sum_i \pdv{y_i}{\theta_\alpha} \pdv{y_i}{\theta_\beta}
\end{align}
Plug this into \cref{eq:c:gbeforesub}, and we get
\begin{align}
g_{\alpha \beta}(\vec{\theta}) &= \frac{1}{2 \sigma^2} \left<\pdv[2]{\sum_i \left(y_i(\theta) - x_i\right)^2}{\theta_\alpha}{\theta_\beta} \right>_{\vec{x}} \\
g_{\alpha \beta}(\vec{\theta}) &= \frac{1}{2 \sigma^2} 2 \sum_i \pdv{y_i}{\theta_\alpha} \pdv{y_i}{\theta_\beta} \\
g_{\alpha \beta}(\vec{\theta}) &= \frac{1}{\sigma^2} \sum_i \pdv{y_i}{\theta_\alpha} \pdv{y_i}{\theta_\beta} \\
g_{\alpha \beta}(\vec{\theta}) &= \frac{\left(\transpose{J}J\right)_{\alpha\beta}}{\sigma^2},
\end{align}
which is exactly the least-squares metric from part (b), as we wanted to show.
\newpage
\listoftodos

\end{document}