\documentclass{article}
% set up telugu
\usepackage{fontspec}
\newfontfamily\telugufont{Potti Sreeramulu}[Script = Telugu]
\usepackage{polyglossia}
\setdefaultlanguage{english}
\setotherlanguage{telugu}
%other packages
\usepackage{amsmath}
\usepackage{bm}
\usepackage{amssymb}
\usepackage{physics}
\usepackage{siunitx}
\usepackage{todonotes}
\usepackage{luacode}
\usepackage{titling}
\usepackage{enumitem}
% custom deepak packages
\usepackage{luatrivially}
\usepackage{subtitling}
\usepackage{cleveref}
\begin{luacode*}
math.randomseed(31415926)
\end{luacode*}
\renewcommand{\vec}{\bm}
\newcommand{\transpose}[1]{#1^T}
\title{Problem 1.15}
\subtitle{Fisher information and Cramér--Rao}
\author{\begin{telugu}హృదయ్ దీపక్ మల్లుభొట్ల\end{telugu}}
% want empty date
\predate{}
\date{}
\postdate{}
% !TeX spellcheck = en_GB
\begin{document}
\maketitle
We're going to look at the least-squares model.
\subsubsection*{(a) Dependence only on parameter distance.}
A least-squares model $y_i(\vec{\theta})$ for $N$ data points $d_i \pm \sigma$ with independent, normally distributed measurement errors predicts a likelihood for finding a value $\vec{x} = \left\{x_i\right\}$ of the data $\left\{d_i\right\}$ given by
\begin{equation}
P(\vec{x} | \vec{\theta}) = \frac{e^{- \sum_i \flatfrac{\left(y_i(\theta) - x_i\right)^2}{2 \sigma^2}}}{\left(2 \pi \sigma^2\right)^{\flatfrac{N}{2}}}
\end{equation}
How big is the probability density that a least-squares model with true parameters $\vec{\theta}$ would give experimental results implying a different set of parameters $\vec{\phi}$?
Show that it depends only on the distance between the vectors $\abs{\vec{y}(\vec{\theta}) - \vec{y}(\vec{\phi})}$ in the prediction space.
\subsubsection*{(b) Metric}
Remember that the metric tensor $g_{\alpha\beta}$ gives the distance on the manifold between two nearby points.
The squared distance between points with coordinates $\vec{\theta}$ and $\vec{\theta} + \epsilon \vec{\Delta}$ is $\epsilon^2 \sum_{\alpha\beta} g_{\alpha\beta} \Delta_\alpha \Delta_\beta$.
Show that the least-squares metric is
\begin{equation}
g_{\alpha\beta} = \frac{\left(\transpose{J}J\right)_{\alpha\beta}}{\sigma^2},
\end{equation}
where the Jacobian is
\begin{equation}
J_{i\alpha} = \pdv{y_i}{\theta_\alpha}.
\end{equation}
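As a concrete illustration to carry along (the straight-line form and the sample points $t_i$ are just an assumed toy model, not part of the problem statement), take $y_i(\vec{\theta}) = \theta_1 t_i + \theta_2$; its Jacobian is then constant:
\begin{equation}
J_{i1} = \pdv{y_i}{\theta_1} = t_i, \qquad J_{i2} = \pdv{y_i}{\theta_2} = 1.
\end{equation}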
\subsubsection*{(c) The Fisher Information Matrix}
In general, the natural metric is given by the FIM:
\begin{equation}
g_{\alpha \beta}(\vec{\theta}) = - \left<\pdv[2]{\log{P(\vec{x} | \vec{\theta} )}}{\theta_\alpha}{\theta_\beta} \right>_{\vec{x}}
\end{equation}
\section{Solution} \label{sec:solution}
\subsection*{(a) Least squares distance}
\begin{equation}
P(\vec{x} | \vec{\theta}) = \frac{e^{- \sum_i \flatfrac{\left(y_i(\theta) - x_i\right)^2}{2 \sigma^2}}}{\left(2 \pi \sigma^2\right)^{\flatfrac{N}{2}}}
\end{equation}
We're interested in $P(\vec{y}(\vec{\phi}) | \vec{\theta})$:
\begin{align}
P(\vec{y}(\vec{\phi}) | \vec{\theta}) &= \frac{e^{- \sum_i \flatfrac{\left(y_i(\vec{\phi}) - y_i(\vec{\theta})\right)^2}{2 \sigma^2}}}{\left(2 \pi\sigma^2\right)^{\flatfrac{N}{2}}}. \label{eq:a:pbeforesimple}
\end{align}
Look at this:
\begin{equation}
\sum_i \left(y_i(\vec{\phi}) - y_i(\vec{\theta})\right)^2.
\end{equation}
What even is that expression?
Well, you might recognise it.
It's just $\sum_i \left(\vec{y}(\vec{\phi}) - \vec{y}(\vec{\theta})\right)_i^2$, otherwise known as
\begin{equation}
\abs{\vec{y}(\vec{\phi}) - \vec{y}(\vec{\theta})}^2.
\end{equation}
So plug that into \cref{eq:a:pbeforesimple} and see what happens:
\begin{align}
P(\vec{y}(\vec{\phi}) | \vec{\theta}) &= \frac{e^{- \flatfrac{\abs{\vec{y}(\vec{\phi}) - \vec{y}(\vec{\theta})}^2}{2 \sigma^2}}}{\left(2 \pi\sigma^2\right)^{\flatfrac{N}{2}}} \\
P(\vec{y}(\vec{\phi}) | \vec{\theta}) &= \frac{e^{-\frac12 \left(\frac{\abs{\vec{y}(\vec{\phi}) - \vec{y}(\vec{\theta})}}{\sigma}\right)^2}}{\left(2 \pi\sigma^2\right)^{\flatfrac{N}{2}}}. \label{eq:a:paftersimple}
\end{align}
That depends only on $\abs{\Delta\vec{y}}/\sigma$, where $\Delta\vec{y} = \vec{y}(\vec{\phi}) - \vec{y}(\vec{\theta})$, as we were told to expect.
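As a quick numerical read of \cref{eq:a:paftersimple} (the $3\sigma$ separation here is just an arbitrary illustrative value): if the two parameter sets give predictions separated by $\abs{\Delta\vec{y}} = 3\sigma$, the probability density is suppressed relative to $\vec{\phi} = \vec{\theta}$ by
\begin{equation}
\frac{P(\vec{y}(\vec{\phi}) | \vec{\theta})}{P(\vec{y}(\vec{\theta}) | \vec{\theta})} = e^{-\flatfrac{3^2}{2}} = e^{-\flatfrac{9}{2}} \approx \num{0.011}.
\end{equation}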
\subsection*{(b) Metric tensor}
Let's start by defining the scaled squared distance
\begin{align}
d^2 = \left(\frac{\abs{\vec{y}(\vec{\phi}) - \vec{y}(\vec{\theta})}}{\sigma}\right)^2
\end{align}
and look at what this is with $\vec{\phi} = \vec{\theta} + \epsilon \vec{\Delta}$, particularly as $\epsilon \rightarrow 0$.
\begin{align}
d^2 &= \left(\frac{\abs{\vec{y}(\vec{\theta} + \epsilon \vec{\Delta}) - \vec{y}(\vec{\theta})}}{\sigma}\right)^2
\end{align}
What's $\vec{y}(\vec{\theta} + \epsilon \vec{\Delta})$?
Expanding in a Taylor series, with Einstein summation over the repeated parameter index $\alpha$, to first order in $\epsilon$ it is
\begin{align}
\vec{y}(\vec{\theta} + \epsilon \vec{\Delta}) &= \vec{y}(\vec{\theta}) + \epsilon \Delta_\alpha \pdv{\vec{y}(\vec{\theta})}{\theta_\alpha}
\end{align}
Now take the squared magnitude of the difference from $\vec{y}(\vec{\theta})$:
\begin{align}
\abs{\vec{y}(\vec{\theta} + \epsilon \vec{\Delta}) - \vec{y}(\vec{\theta})}^2 &= \sum_i \left(y_i(\vec{\theta} + \epsilon \vec{\Delta}) - y_i(\vec{\theta})\right)^2 \\
&= \sum_i \left( y_i(\vec{\theta}) + \epsilon \Delta_\alpha \pdv{y_i(\vec{\theta})}{\theta_\alpha} - y_i(\vec{\theta})\right)^2 \\
&= \sum_i \left(\epsilon \Delta_\alpha \pdv{y_i(\vec{\theta})}{\theta_\alpha}\right)^2 \\
&= \sum_{i} \left(\sum_\alpha \epsilon \Delta_\alpha \pdv{y_i(\vec{\theta})}{\theta_\alpha}\right)^2
\end{align}
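To see what happens when we expand that square of a sum, write it out for just two parameters, abbreviating $c_\alpha = \Delta_\alpha \pdv{y_i(\vec{\theta})}{\theta_\alpha}$ (no sum implied):
\begin{equation}
\left(c_1 + c_2\right)^2 = c_1 c_1 + c_1 c_2 + c_2 c_1 + c_2 c_2 = \sum_{\alpha\beta} c_\alpha c_\beta.
\end{equation}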
Now comes the crucial bit of the index twiddling.
The two copies of the sum inside the square carry independent dummy indices, exactly as in the written-out example above, so this really becomes:
\begin{align}
\abs{\vec{y}(\vec{\theta} + \epsilon \vec{\Delta}) - \vec{y}(\vec{\theta})}^2 &= \sum_{i} \left(\sum_\alpha \epsilon \Delta_\alpha \pdv{y_i(\vec{\theta})}{\theta_\alpha}\right) \left(\sum_\beta \epsilon \Delta_\beta \pdv{y_i(\vec{\theta})}{\theta_\beta}\right) \\
\abs{\vec{y}(\vec{\theta} + \epsilon \vec{\Delta}) - \vec{y}(\vec{\theta})}^2 &= \epsilon^2 \sum_{\alpha\beta} \left[\sum_{i} \pdv{y_i(\vec{\theta})}{\theta_\alpha} \pdv{y_i(\vec{\theta})}{\theta_\beta}\right] \Delta_\alpha \Delta_\beta.
\end{align}
The bit in square brackets is just $\sum_i J_{i\alpha} J_{i\beta} = \left(\transpose{J}J\right)_{\alpha\beta}$: matrix multiplication of our Jacobian's transpose with the Jacobian!
Dividing by $\sigma^2$ to get the scaled distance $d^2$ and comparing with the definition of the metric tensor, $d^2 = \epsilon^2 \sum_{\alpha\beta} g_{\alpha\beta} \Delta_\alpha \Delta_\beta$, we read off $g_{\alpha\beta} = \flatfrac{\left(\transpose{J}J\right)_{\alpha\beta}}{\sigma^2}$, which is our result.
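For the straight-line toy model from above, $\transpose{J}J$ works out explicitly to
\begin{equation}
g = \frac{\transpose{J}J}{\sigma^2} = \frac{1}{\sigma^2}
\begin{pmatrix}
\sum_i t_i^2 & \sum_i t_i \\
\sum_i t_i & N
\end{pmatrix},
\end{equation}
so distances in parameter space depend on where the data were sampled through the $t_i$.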
\subsection*{(c) FIM}
Start from the definition of the FIM and substitute the likelihood:
\begin{align}
g_{\alpha \beta}(\vec{\theta}) &= - \left<\pdv[2]{\log{P(\vec{x} | \vec{\theta} )}}{\theta_\alpha}{\theta_\beta} \right>_{\vec{x}} \\
g_{\alpha \beta}(\vec{\theta}) &= - \left<\pdv[2]{\log{\frac{e^{- \sum_i \flatfrac{\left(y_i(\theta) - x_i\right)^2}{2 \sigma^2}}}{\left(2 \pi \sigma^2\right)^{\flatfrac{N}{2}}}}}{\theta_\alpha}{\theta_\beta} \right>_{\vec{x}} \\
g_{\alpha \beta}(\vec{\theta}) &= - \left<\pdv[2]{\log{e^{- \sum_i \flatfrac{\left(y_i(\theta) - x_i\right)^2}{2 \sigma^2}}}}{\theta_\alpha}{\theta_\beta} \right>_{\vec{x}} \\
g_{\alpha \beta}(\vec{\theta}) &= - \left<- \pdv[2]{\sum_i \flatfrac{\left(y_i(\theta) - x_i\right)^2}{2 \sigma^2}}{\theta_\alpha}{\theta_\beta} \right>_{\vec{x}} \\
g_{\alpha \beta}(\vec{\theta}) &= \frac{1}{2 \sigma^2} \left<\pdv[2]{\sum_i \left(y_i(\theta) - x_i\right)^2}{\theta_\alpha}{\theta_\beta} \right>_{\vec{x}} \label{eq:c:gbeforesub}
\end{align}
(The normalisation $\left(2 \pi \sigma^2\right)^{\flatfrac{N}{2}}$ contributes only an additive constant to the logarithm, independent of $\vec{\theta}$, so it vanishes under the derivatives.)
Let's look at the first derivative:
\begin{align}
\pdv{\sum_i \left(y_i(\theta) - x_i\right)^2}{\theta_\beta} &= \sum_i \pdv{\left(y_i(\theta) - x_i\right)^2}{\theta_\beta} \\
\pdv{\sum_i \left(y_i(\theta) - x_i\right)^2}{\theta_\beta} &= \sum_i 2 \left(y_i(\theta) - x_i\right) \pdv{}{\theta_\beta} \left( y_i(\theta) - x_i \right) \\
\pdv{\sum_i \left(y_i(\theta) - x_i\right)^2}{\theta_\beta} &= 2 \sum_i \left(y_i(\theta) - x_i\right) \pdv{y_i}{\theta_\beta}
\end{align}
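For the straight-line toy model, with $\theta_\beta = \theta_1$ (so $\pdv{y_i}{\theta_1} = t_i$), this reads
\begin{equation}
\pdv{}{\theta_1} \sum_i \left(\theta_1 t_i + \theta_2 - x_i\right)^2 = 2 \sum_i \left(\theta_1 t_i + \theta_2 - x_i\right) t_i,
\end{equation}
which matches the general formula.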
And the second derivative:
\begin{align}
\pdv[2]{\sum_i \left(y_i(\theta) - x_i\right)^2}{\theta_\alpha}{\theta_\beta} &= \pdv{}{\theta_\alpha} 2 \sum_i \left(y_i(\theta) - x_i\right) \pdv{y_i}{\theta_\beta} \\
\pdv[2]{\sum_i \left(y_i(\theta) - x_i\right)^2}{\theta_\alpha}{\theta_\beta} &= 2 \sum_i \left[\left(\pdv{}{\theta_\alpha} \left(y_i(\theta) - x_i\right) \right) \pdv{y_i}{\theta_\beta} + \left(y_i(\theta) - x_i\right) \pdv[2]{y_i}{\theta_\alpha}{\theta_\beta}\right] \\
\pdv[2]{\sum_i \left(y_i(\theta) - x_i\right)^2}{\theta_\alpha}{\theta_\beta} &= 2 \sum_i \left[\pdv{y_i}{\theta_\alpha} \pdv{y_i}{\theta_\beta} + \left(y_i(\theta) - x_i\right) \pdv[2]{y_i}{\theta_\alpha}{\theta_\beta}\right]
\end{align}
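Note that in the straight-line toy model the curvature term is identically zero even before averaging,
\begin{equation}
\pdv[2]{\left(\theta_1 t_i + \theta_2\right)}{\theta_\alpha}{\theta_\beta} = 0,
\end{equation}
so the averaging argument below is only needed when the model is nonlinear in $\vec{\theta}$.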
Now average that over $\vec{x}$:
\begin{align}
\left<\pdv[2]{\sum_i \left(y_i(\theta) - x_i\right)^2}{\theta_\alpha}{\theta_\beta} \right>_{\vec{x}} &= \left< 2 \sum_i \left[\pdv{y_i}{\theta_\alpha} \pdv{y_i}{\theta_\beta} + \left(y_i(\theta) - x_i\right) \pdv[2]{y_i}{\theta_\alpha}{\theta_\beta}\right]\right>_{\vec{x}} \\
\left<\pdv[2]{\sum_i \left(y_i(\theta) - x_i\right)^2}{\theta_\alpha}{\theta_\beta} \right>_{\vec{x}} &= 2 \sum_i \left[\pdv{y_i}{\theta_\alpha} \pdv{y_i}{\theta_\beta} + \left< y_i(\theta) - x_i\right>_{\vec{x}} \pdv[2]{y_i}{\theta_\alpha}{\theta_\beta}\right] \\
\left<\pdv[2]{\sum_i \left(y_i(\theta) - x_i\right)^2}{\theta_\alpha}{\theta_\beta} \right>_{\vec{x}} &= 2 \sum_i \left[\pdv{y_i}{\theta_\alpha} \pdv{y_i}{\theta_\beta} + \left(y_i(\theta) - \left<x_i \right>_{\vec{x}}\right) \pdv[2]{y_i}{\theta_\alpha}{\theta_\beta}\right] \label{eq:c:stepbeforesub}
\end{align}
What is $\left<x_i\right>_{\vec{x}}$?
Nothing but $y_i(\theta)$!
If that seems confusing, remember that the average looks like:
\begin{align}
\left<x_i\right>_{\vec{x}} = \int \dd{\vec{x}'} P(\vec{x}' | \vec{\theta})\, x_i'
\end{align}
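Spelled out for one component, using the fact that the likelihood factorises over $i$ into independent Gaussians centred on $y_i(\vec{\theta})$:
\begin{align}
\left<x_i\right>_{\vec{x}} &= \int_{-\infty}^{\infty} \dd{x_i'} \frac{e^{-\flatfrac{\left(y_i(\vec{\theta}) - x_i'\right)^2}{2 \sigma^2}}}{\sqrt{2 \pi \sigma^2}}\, x_i' = y_i(\vec{\theta}).
\end{align}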
So \cref{eq:c:stepbeforesub} is simple:
\begin{align}
\left<\pdv[2]{\sum_i \left(y_i(\theta) - x_i\right)^2}{\theta_\alpha}{\theta_\beta} \right>_{\vec{x}} &= 2 \sum_i \left[\pdv{y_i}{\theta_\alpha} \pdv{y_i}{\theta_\beta} + \left(y_i(\theta) - y_i(\theta)\right) \pdv[2]{y_i}{\theta_\alpha}{\theta_\beta}\right] \\
\left<\pdv[2]{\sum_i \left(y_i(\theta) - x_i\right)^2}{\theta_\alpha}{\theta_\beta} \right>_{\vec{x}} &= 2 \sum_i \pdv{y_i}{\theta_\alpha} \pdv{y_i}{\theta_\beta}
\end{align}
Plug this into \cref{eq:c:gbeforesub}, and we get
\begin{align}
g_{\alpha \beta}(\vec{\theta}) &= \frac{1}{2 \sigma^2} \left<\pdv[2]{\sum_i \left(y_i(\theta) - x_i\right)^2}{\theta_\alpha}{\theta_\beta} \right>_{\vec{x}} \\
g_{\alpha \beta}(\vec{\theta}) &= \frac{1}{2 \sigma^2} 2 \sum_i \pdv{y_i}{\theta_\alpha} \pdv{y_i}{\theta_\beta} \\
g_{\alpha \beta}(\vec{\theta}) &= \frac{1}{\sigma^2} \sum_i \pdv{y_i}{\theta_\alpha} \pdv{y_i}{\theta_\beta} \\
g_{\alpha \beta}(\vec{\theta}) &= \frac{1}{\sigma^2} \left(\transpose{J}J\right)_{\alpha\beta},
\end{align}
as we wanted to show.
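This is where the Cramér--Rao half of the title comes in (stated here without proof, since the bound itself isn't derived in this problem): for a single parameter $\theta$, any unbiased estimator $\hat{\theta}$ built from the data $\vec{x}$ has variance at least the inverse Fisher information,
\begin{equation}
\operatorname{Var}\left(\hat{\theta}\right) \geq \frac{1}{g} = \frac{\sigma^2}{\sum_i \left(\dv{y_i}{\theta}\right)^2}.
\end{equation}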
\newpage
\listoftodos
\end{document}