# +
# Copyright (c) 2020 Urbain Vaes. All rights reserved.
#
# This work is licensed under the terms of the MIT license.
# For a copy, see <https://opensource.org/licenses/MIT>.
import scipy.stats
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import scipy.interpolate
import scipy.optimize
import scipy.integrate
# -

# +
matplotlib.rc('font', size=20)
matplotlib.rc('font', family='serif')
matplotlib.rc('figure', figsize=(13, 8))
matplotlib.rc('lines', linewidth=2)
matplotlib.rc('lines', markersize=12)
matplotlib.rc('figure.subplot', hspace=.1)
matplotlib.rc('animation', html='html5')
np.random.seed(0)
# -

# # Problem 2

# In this problem $X_t$ is the solution to the Ornstein-Uhlenbeck equation:
# \begin{align}
# \newcommand{\e}{\mathrm e}
# \newcommand{\d}{\mathrm d}
# \newcommand{\expect}{\mathbb E}
# \newcommand{\var}{\mathrm{Var}}
# \d X_t = - \theta (X_t - \mu) \, \d t + \sigma \, \d W_t.
# \end{align}

# ## Question 1
#
# Without the noise term, the solution would be $X_t = \mu + (X_0 - \mu) \, \e^{- \theta t}$.
# This suggests applying Itô's formula to $Y_t = f(X_t, t) = \e^{\theta t}(X_t - \mu)$,
# which gives
# \begin{align}
# \d Y_t = \e^{\theta t} \, \theta \, (X_t - \mu) \, \d t + \e^{\theta t} \, \bigl(- \theta \, (X_t - \mu) \, \d t + \sigma \, \d W_t\bigr)
# = \e^{\theta t} \, \sigma \, \d W_t.
# \end{align}
# This implies
# $$
# Y_t = Y_0 + \int_0^{t} \e^{\theta s} \, \sigma \, \d W_s = (X_0 - \mu) + \int_0^t \e^{\theta s} \, \sigma \, \d W_s,
# $$
# which gives
# $$
# X_t = \mu + (X_0 - \mu) \, \e^{-\theta \, t} + \sigma \, \int_0^t \e^{-\theta(t - s)} \, \d W_s.
# $$
# By Itô's isometry, the second moment at the final time is given by
# $$
# \begin{aligned}
# \expect |X_T|^2 &= |\expect[X_T]|^2 + \var[X_T] = |\mu + (\expect[X_0] - \mu) \, \e^{-\theta \, T}|^2 + \var[X_0] \, \e^{-2 \theta T} + \sigma^2 \, \int_0^T \e^{-2 \theta(T - s)} \, \d s \\
# &= \left|-1 + 2.5 \, \e^{-1} \right|^2 + \frac{1}{12} \, \e^{-2} + (1 - \e^{-2}) = 0.882...
# \end{aligned}
# $$

# ## Question 2
#
# Since
# $$
# I - I_N = \int_0^T (f(s) - f_N(s)) \, \d W_s,
# $$
# Itô's isometry implies
# $$
# \expect |I_N - I|^2 = \expect \left[ \int_0^T |f(s) - f_N(s)|^2 \, \d s \right].
# $$
# The right-hand side converges to zero as $N \to \infty$ because $f_N \to f$ in $L^2([0, T])$.
# Consequently, the left-hand side also converges to zero as $N \to \infty$,
# i.e. $I_N \to I$ in $L^2(\Omega)$ and thus also in distribution.
# On the other hand,
# $$
# I_N = \sum_{k=0}^{N-1} f(k \Delta_N) \, (W_{(k+1) \Delta_N} - W_{k \Delta_N}).
# $$
# Since the right-hand side is a sum of independent normally distributed random variables,
# the integral $I_N$ is normally distributed for every $N$.
# The mean of $I_N$ is 0 and the variance is
# $$
# \var[I_N] = \sum_{k=0}^{N-1} |f(k \Delta_N)|^2 \, \Delta_N = \int_0^T |f_N(s)|^2 \, \d s.
# $$
# Since $f_N \to f$ in $L^2([0, T])$ as $N \to \infty$,
# we deduce that $\var[I_N] \to \int_0^T |f(s)|^2 \, \d s$ as $N \to \infty$.
# Therefore $I_N \to \mathcal N\left(0, \int_0^T |f(s)|^2 \, \d s\right)$ in distribution as $N \to \infty$.
# By uniqueness of the limit in distribution, we conclude that
# $$
# I \sim \mathcal N \left(0, \int_0^T |f(s)|^2 \, \d s\right).
# $$
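# As a quick numerical illustration of this result (not part of the problem
# statement), the cell below approximates $I \approx I_N$ by Monte Carlo for
# the arbitrarily chosen test function $f(s) = \e^{-s}$ on $[0, 1]$, and
# compares the sample variance with $\int_0^T |f(s)|^2 \, \d s = (1 - \e^{-2})/2$.

# +
# Riemann-sum approximation I_N of the Itô integral for f(s) = exp(-s)
T, N_steps, M_samples = 1, 200, 10**5
ΔN = T/N_steps
s = np.linspace(0, T, N_steps + 1)[:-1]  # Left endpoints k*Δ_N

# Brownian increments W_{(k+1)Δ_N} - W_{kΔ_N}, one column per sample
dW = np.sqrt(ΔN) * np.random.randn(N_steps, M_samples)
I_N = np.exp(-s) @ dW

print("Sample variance: {:.4f}".format(np.var(I_N)))
print("Exact variance:  {:.4f}".format((1 - np.exp(-2))/2))
# -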
# ## Question 3
#
# Generalizing slightly the result of Question 1, we obtain
# $$
# X_t = \mu + (X_s - \mu) \, \e^{-\theta(t - s)} + \sigma \, \int_s^t \e^{-\theta(t - u)} \, \d W_u
# $$
# for any two times $0 \leq s \leq t$.
# Letting $s = n \Delta t$ and $t = (n+1) \Delta t$, this gives
# $$
# X_{(n+1)\Delta t} = \mu + (X_{n\Delta t} - \mu) \, \e^{-\theta \Delta t} + \sigma \, \int_{n\Delta t}^{(n+1)\Delta t} \e^{-\theta((n+1)\Delta t - u)} \, \d W_u.
# $$
# By the result of Question 2, the Itô integral is Gaussian:
# $$
# \int_{n\Delta t}^{(n+1)\Delta t} \e^{-\theta((n+1)\Delta t - u)} \, \d W_u \sim \mathcal N \left(0, \int_{n\Delta t}^{(n+1)\Delta t} \e^{-2\theta((n+1)\Delta t - u)}\, \d u \right).
# $$
# The integral giving the variance can be calculated explicitly:
# $$
# \int_{0}^{\Delta t} \e^{-2\theta(\Delta t - u)}\, \d u = \int_{0}^{\Delta t} \e^{-2\theta u}\, \d u = \frac{1}{2 \theta} \, (1 - \e^{-2 \theta \Delta t}).
# $$
# We have thus shown that the exact solution satisfies
# $$
# X_{(n+1) \, \Delta t} = \mu + a(\Delta t) \, (X_{n\Delta t} - \mu) + b(\Delta t) \, \xi \qquad \text{in law,}
# $$
# where $\xi \sim \mathcal N(0, 1)$, $a(\Delta t) = \e^{-\theta \Delta t}$ and $b(\Delta t) = \sqrt{\frac{\sigma^2}{2 \theta} \, (1 - \e^{-2 \theta \Delta t})}$.
# This suggests the numerical scheme
# $$
# X^{\Delta t}_{n+1} = \mu + a(\Delta t) \, (X^{\Delta t}_{n} - \mu) + b(\Delta t) \, \xi.
# $$
# Since successive iterates obey the same relation as the exact solution,
# the associated weak error is zero:
# for any $n$, the PDF of $X^{\Delta t}_n$ is the same as that of $X_{n \Delta t}$,
# and similarly for all the finite-dimensional distributions.

# ## Question 4
#
# Since, in law,
# $$
# X_T = \mu + a(1) \, (X_0 - \mu) + b(1) \, \xi,
# $$
# we calculate, using that $\xi$ is independent of $X_0$ and has mean zero,
# $$
# \expect |X_T|^2 = \expect|\mu + a(1) \, (X_0 - \mu)|^2 + |b(1)|^2 = |\mu + a(1) \, (\expect[X_0] - \mu)|^2 + |a(1)|^2 \, \var[X_0] + |b(1)|^2.
# $$

# +
# Global, fixed parameters
μ, σ = -1, np.sqrt(2)

# True value of the drift coefficient
true_θ = 1

def coefficients(θ, Δt):
    a = np.exp(-θ*Δt)
    b = np.sqrt(σ**2/2/θ * (1 - np.exp(-2*θ*Δt)))
    return a, b

def generate_trajectory(N, M, Δt):
    # Noise
    ξ = np.random.randn(N, M)

    # Solution, with X_0 ~ U([1, 2])
    x = np.zeros((N + 1, M))
    x0 = 1 + np.random.rand(M)
    x[0] = x0

    # Coefficients of the numerical method
    a, b = coefficients(true_θ, Δt)

    # Time-stepping
    for j in range(N):
        x[j + 1] = μ + a * (x[j] - μ) + b * ξ[j]
    return x

# Exact second moment
a, b = coefficients(true_θ, 1)
mean_x0, var_x0 = 1.5, 1/12
exact_second_moment = (μ + a*(mean_x0 - μ))**2 + a**2*var_x0 + b**2
print("Exact value: {}".format(exact_second_moment))

N, M, Δt = 100, 10**5, .01
x = generate_trajectory(N, M, Δt)
fx = x[-1]**2

# Here, the standard deviation of f(X_T) can be calculated either explicitly,
# based on the exact solution, or approximately, based on the data.
σf = np.sqrt(np.var(fx))

# Estimator and confidence interval
estimator = np.mean(fx)
left_99 = estimator + scipy.stats.norm.ppf(.005) * σf/np.sqrt(M)
right_99 = estimator + scipy.stats.norm.ppf(.995) * σf/np.sqrt(M)
print("99% confidence interval: [{}, {}]".format(left_99, right_99))
# -

# +
fig, ax = plt.subplots()
t = np.linspace(0, N*Δt, N + 1)
ax.plot(t, x[:, :20])
ax.set_xlabel('$t$')
plt.show()
# -
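# As a sanity check of the zero weak error established in Question 3 (this
# check is not part of the problem statement), we can recompute the Monte
# Carlo estimator with a single time step of size $\Delta t = 1$: since the
# scheme is exact in law, the estimate should not deteriorate as $\Delta t$
# grows, in contrast with, e.g., the Euler-Maruyama scheme.

# +
# One step of the exact-in-law scheme reaches T = 1 directly
x_coarse = generate_trajectory(1, 10**5, 1.)
print("Estimate with Δt = 1:    {}".format(np.mean(x_coarse[-1]**2)))
print("Estimate with Δt = 0.01: {}".format(estimator))
print("Exact value:             {}".format(exact_second_moment))
# -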
# ## Question 5
#
# Given $X^{\Delta t}_n$, the conditional distribution of $X^{\Delta t}_{n+1}$
# is Gaussian with mean $\mu + a(\Delta t) \, (X^{\Delta t}_n - \mu)$ and variance $|b(\Delta t)|^2$.
# Therefore
# $$
# f_{\hat X} (x_{0}, \dotsc, x_{N} \, ; \, \vartheta) = I_{[1, 2]}(x_{0}) \, \left(\frac{1}{\sqrt{2 \pi \, |b(\Delta t)|^2}}\right)^N \, \prod_{k=0}^{N-1} \exp \left(- \frac{\left(\mu + a(\Delta t) \, (x_k - \mu) - x_{k+1}\right)^2}{2 \, |b(\Delta t)|^2}\right),
# $$
# where in fact, on the right-hand side, $a(\Delta t) = a(\Delta t; \vartheta)$ and $b(\Delta t) = b(\Delta t; \vartheta)$,
# but we do not write this dependence explicitly, in order to keep the notation concise.

# ## Questions 6 and 7
#
# For Question 7, the joint probability density function is given by
# $$
# f_{\theta, \hat X} (\vartheta, x_{0}, \dotsc, x_{N}) = g_{2,1}(\vartheta) \, I_{[1, 2]}(x_{0}) \, \left(\frac{1}{\sqrt{2 \pi \, |b(\Delta t)|^2}}\right)^N \, \prod_{k=0}^{N-1} \exp \left(- \frac{\left(\mu + a(\Delta t) \, (x_k - \mu) - x_{k+1}\right)^2}{2 \, |b(\Delta t)|^2}\right),
# $$
# where $g_{2,1}$ is the PDF of $\mathcal N(2, 1)$.
# The conditional probability density $f_{\theta | \hat X}$ is given by
# $$
# f_{\theta|\hat X}(\vartheta|x_0, \dotsc, x_N) = \frac{f_{\theta, \hat X} (\vartheta, x_{0}, \dotsc, x_{N})}{\int_{\mathbb R} f_{\theta, \hat X} (\vartheta, x_{0}, \dotsc, x_{N}) \, \d \vartheta}.
# $$

# +
# Log-likelihood, up to an additive constant
def log_likelihood(x, θ, include_prior):
    # Number of terms in the sum
    N = len(x) - 1

    # Coefficients of the numerical method
    a, b = coefficients(θ, Δt)

    # Calculation of the log-likelihood
    logL = - N*np.log(b) - np.sum((μ + a*(x[:-1] - μ) - x[1:])**2)/2/b**2

    # Contribution of the prior
    prior_contribution = - (θ - 2)**2/2

    return logL + include_prior*prior_contribution

# Inference
N, M, Δt = 10**6, 1, .1
t = np.linspace(0, N*Δt, N + 1)
x = generate_trajectory(N, M, Δt)[:, 0]

# Range where we suspect the estimators will be
θmin, θmax = .97, 1.03

# Useful functions
fmin = scipy.optimize.fminbound
finterp = scipy.interpolate.interp1d
fsolve = scipy.integrate.solve_ivp
fzeros = scipy.optimize.brentq

# ML and MAP estimators
θmle = fmin(lambda θ: - log_likelihood(x, θ, False), θmin, θmax)
θmap = fmin(lambda θ: - log_likelihood(x, θ, True), θmin, θmax)
print(θmle, θmap)
# -

# ## Question 8

# +
# Approximation of the PDF of θ conditional on the data. We calculate the
# log-likelihood at 200 points in the interval and interpolate between them.
n = 200
θs = np.linspace(θmin, θmax, n)
Lθ = np.zeros(n)
for i, θ in enumerate(θs):
    Lθ[i] = log_likelihood(x, θ, True)

# To bring the likelihood to values of order one, we subtract the value of
# the log-likelihood at the MAP.
pdf = np.exp(Lθ - log_likelihood(x, θmap, True))
pdf = finterp(θs, pdf, kind='linear', fill_value='extrapolate')

# Calculate the CDF based on the PDF
cdf = fsolve(fun=lambda θ, _: pdf(θ), t_span=[θmin, θmax], y0=[0],
             t_eval=θs, atol=1e-15, rtol=1e-13).y[0]
cdf = finterp(θs, cdf/cdf[-1], kind='linear', fill_value='extrapolate')

# Calculate the 99% confidence interval
left_99 = fzeros(lambda x: .005 - cdf(x), θmin, θmax)
right_99 = fzeros(lambda x: .995 - cdf(x), θmin, θmax)

# Plot the estimators
fig, ax = plt.subplots()
height = pdf(θmap) * 1.1
ax.vlines([left_99, right_99], ymin=0, ymax=height,
          color='r', linestyle='--', label='99% conf. int.')
ax.vlines([θmap], ymin=0, ymax=height, color='k', label='MAP')
ax.vlines([θmle], ymin=0, ymax=height, color='g', label='MLE')
ax.set_xlabel(r"$\theta$")
ax.set_xlim(θmin, θmax)
ax.set_ylim(0, height)
ax.plot(θs, pdf(θs), label='PDF (up to normalization)')
ax.plot(θs, cdf(θs), label='CDF')
ax.legend()
plt.show()
# -
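# As a complement (not requested by the problem), we can also normalize the
# posterior numerically and compute its mean; with $10^6$ observations the
# posterior is nearly Gaussian, so the mean should be close to the MAP
# estimator.

# +
# Normalization constant and posterior mean via the trapezoidal rule
Z = np.trapz(pdf(θs), θs)
posterior_mean = np.trapz(θs * pdf(θs), θs) / Z
print("Posterior mean: {}".format(posterior_mean))
print("MAP estimator:  {}".format(θmap))
# -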