0 | {--
   1 | Copyright (C) 2021  Joel Berkeley
   2 | 
   3 | This program is free software: you can redistribute it and/or modify
   4 | it under the terms of the GNU Affero General Public License as published
   5 | by the Free Software Foundation, either version 3 of the License, or
   6 | (at your option) any later version.
   7 | 
   8 | This program is distributed in the hope that it will be useful,
   9 | but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  11 | GNU Affero General Public License for more details.
  12 | 
  13 | You should have received a copy of the GNU Affero General Public License
  14 | along with this program.  If not, see <https://www.gnu.org/licenses/>.
  15 | --}
  16 | ||| Gaussian process inference.
  17 | module Model.GaussianProcess
  18 | 
  19 | import Control.Relation
  20 | import Data.Nat
  21 | 
  22 | import Constants
  23 | import Distribution
  24 | import Data
  25 | import Model
  26 | import Model.Kernel
  27 | import Model.MeanFunction
  28 | import Optimize
  29 | import Tensor
  30 | 
  31 | ||| A Gaussian process is a collection of random variables, any finite number of which have joint
  32 | ||| Gaussian distribution. It can be viewed as a function from a feature space to a joint Gaussian
  33 | ||| distribution over a target space.
  34 | |||
  35 | ||| @features The shape of the feature domain.
  36 | public export
  37 | data GaussianProcess : (0 features : Shape) -> Type where
  38 |   ||| Construct a `GaussianProcess` as a pair of mean function and kernel.
  39 |   MkGP : MeanFunction features -> Kernel features -> GaussianProcess features
  40 | 
  41 | posterior :
  42 |   GaussianProcess features ->
  43 |   Tensor [] F64 ->
  44 |   {s : _} -> (Tensor ((S s) :: features) F64, Tensor [S s] F64) ->
  45 |   Tag $ GaussianProcess features
  46 | posterior (MkGP priorMeanf priorKernel) noise (xTrain, yTrain) = do
  47 |   l <- tag !(cholesky $ !(priorKernel xTrain xTrain) + noise * identity)
  48 |   let alpha = l.T \| (l |\ yTrain)
  49 | 
  50 |       posteriorMeanf : MeanFunction features
  51 |       posteriorMeanf x = pure $ !(priorMeanf x) + !(priorKernel x xTrain) @@ alpha
  52 | 
  53 |       posteriorKernel : Kernel features
  54 |       posteriorKernel x x' = pure
  55 |         $ !(priorKernel x x') - (l |\ !(priorKernel xTrain x)).T @@ (l |\ !(priorKernel xTrain x'))
  56 | 
  57 |   pure $ MkGP posteriorMeanf posteriorKernel
  58 | 
  59 | logMarginalLikelihood :
  60 |   GaussianProcess features ->
  61 |   Tensor [] F64 ->
  62 |   {s : _} -> (Tensor ((S s) :: features) F64, Tensor [S s] F64) ->
  63 |   Tag $ Tensor [] F64
  64 | logMarginalLikelihood (MkGP _ kernel) noise (x, y) = do
  65 |   l <- tag !(cholesky (!(kernel x x) + noise * identity))
  66 |   let alpha = l.T \| (l |\ y)
  67 |   pure $ - y @@ alpha / 2.0 - !(trace (log l)) - fromDouble (cast (S s)) * log (2.0 * pi) / 2.0
  68 | 
  69 | ||| A trainable model implementing vanilla Gaussian process regression. That is, regression with a
  70 | ||| Gaussian process as conjugate prior for homoscedastic Gaussian likelihoods. See the following
  71 | ||| for details:
  72 | |||
  73 | ||| Gaussian Processes for Machine Learning
  74 | ||| Carl Edward Rasmussen and Christopher K. I. Williams
  75 | ||| The MIT Press, 2006. ISBN 0-262-18253-X.
  76 | |||
  77 | ||| or
  78 | |||
  79 | ||| Pattern Recognition and Machine Learning, Christopher M. Bishop
  80 | public export
  81 | data ConjugateGPRegression : (0 features : Shape) -> Type where
  82 |   ||| @gpFromHyperparameters Constructs a Gaussian process from the hyperparameters (presented as
  83 |   |||   a vector)
  84 |   ||| @hyperparameters The hyperparameters (excluding noise) presented as a vector.
  85 |   ||| @noise The likehood amplitude, or observation noise.
  86 |   MkConjugateGPR :
  87 |     {p : _} ->
  88 |     (gpFromHyperparameters : Tensor [p] F64 -> Tag $ GaussianProcess features) ->
  89 |     (hyperparameters : Tensor [p] F64) ->
  90 |     (noise : Tensor [] F64) ->
  91 |     ConjugateGPRegression features
  92 | 
  93 | ||| A probabilistic model from feature values to a distribution over latent target values.
  94 | export
  95 | [Latent] ProbabilisticModel features [1] Gaussian (ConjugateGPRegression features) where
  96 |   marginalise (MkConjugateGPR mkGP gpParams _) x = do
  97 |     MkGP meanf kernel <- mkGP gpParams
  98 |     [| MkGaussian (expand 1 <$> meanf x) (expand 2 <$> kernel x x) |]
  99 | 
 100 | ||| A probabilistic model from feature values to a distribution over observed target values.
 101 | export
 102 | [Observed] ProbabilisticModel features [1] Gaussian (ConjugateGPRegression features) where
 103 |   marginalise gpr@(MkConjugateGPR _ _ noise) x = do
 104 |     MkGaussian latentMean latentCov <- marginalise @{Latent} gpr x
 105 |     pure $ MkGaussian latentMean $ latentCov + broadcast (expand 2 (noise * identity {n = S n}))
 106 | 
 107 | ||| Fit the Gaussian process and noise to the specified data.
 108 | export
 109 | fit : (forall n . Tensor [n] F64 -> Optimizer $ Tensor [n] F64)
 110 |   -> Dataset features [1]
 111 |   -> ConjugateGPRegression features
 112 |   -> Tag $ ConjugateGPRegression features
 113 | fit optimizer (MkDataset x y) (MkConjugateGPR {p} mkPrior gpParams noise) = do
 114 |   let objective : Tensor [S p] F64 -> Tag $ Tensor [] F64
 115 |       objective params = do
 116 |         let priorParams = slice [1.to (S p)] params
 117 |         logMarginalLikelihood !(mkPrior priorParams) (slice [at 0] params) (x, squeeze y)
 118 | 
 119 |   params <- optimizer (concat 0 (expand 0 noise) gpParams) objective
 120 | 
 121 |   let mkPosterior : Tensor [p] F64 -> Tag $ GaussianProcess features
 122 |       mkPosterior params' = posterior !(mkPrior params') (squeeze noise) (x, squeeze y)
 123 | 
 124 |   pure $ MkConjugateGPR mkPosterior (slice [1.to (S p)] params) (slice [at 0] params)
 125 | 
 126 |     where
 127 |     %hint
 128 |     reflexive : {n : _} -> LTE n n
 129 |     reflexive = Relation.reflexive {ty = Nat}
 130 |