Optimization Tutorial

Trey V. Wenger (c) July 2025

Here we demonstrate how to optimize the number of cloud components in a EmissionAbsorptionPhysicalModel model.

[1]:
# General imports
import time

import matplotlib.pyplot as plt
import arviz as az
import pandas as pd
import numpy as np
import pymc as pm

print("pymc version:", pm.__version__)
print("arviz version:", az.__version__)

import bayes_spec
print("bayes_spec version:", bayes_spec.__version__)

import caribou_hi
print("caribou_hi version:", caribou_hi.__version__)

# Notebook configuration
pd.options.display.max_rows = None
pymc version: 5.22.0
arviz version: 0.22.0dev
bayes_spec version: 1.9.0
caribou_hi version: 4.1.0+3.g94a913e.dirty

Model Definition and Simulated Data

[2]:
from bayes_spec import SpecData

# spectral axes definitions
emission_axis = np.linspace(-60.0, 60.0, 200)  # km s-1
absorption_axis = np.linspace(-30.0, 30.0, 100)  # km s-1

# data noise can either be a scalar (assumed constant noise across the spectrum)
# or an array of the same length as the data
rms_emission = 0.1  # K
rms_absorption = 0.001  # 1 - exp(-tau)

# brightness data. In this case, we just throw in some random data for now
# since we are only doing this in order to simulate some actual data.
emission = rms_emission * np.random.randn(len(emission_axis))
absorption = rms_absorption * np.random.randn(len(absorption_axis))

dummy_data = {
    "emission": SpecData(
        emission_axis,
        emission,
        rms_emission,
        xlabel=r"$V_{\rm LSR}$ (km s$^{-1}$)",
        ylabel=r"$T_B$ (K)",
    ),
    "absorption": SpecData(
        absorption_axis,
        absorption,
        rms_absorption,
        xlabel=r"$V_{\rm LSR}$ (km s$^{-1}$)",
        ylabel=r"1 - exp(-$\tau$)",
    ),
}

# Plot dummy data
fig, axes = plt.subplots(2, layout="constrained", sharex=True)
axes[0].plot(dummy_data["emission"].spectral, dummy_data["emission"].brightness, "k-")
axes[0].plot(dummy_data["emission"].spectral, dummy_data["emission"].noise, "r-")
axes[1].plot(dummy_data["absorption"].spectral, dummy_data["absorption"].brightness, "k-", label="Data")
axes[1].plot(dummy_data["absorption"].spectral, dummy_data["absorption"].noise, "r-", label="Noise")
axes[1].set_xlabel(dummy_data["emission"].xlabel)
axes[0].set_ylabel(dummy_data["emission"].ylabel)
axes[1].set_ylabel(dummy_data["absorption"].ylabel)
_ = axes[1].legend(loc="lower left")
../_images/notebooks_optimization_3_0.png
[3]:
from caribou_hi import EmissionAbsorptionPhysicalModel

# Initialize and define the model
n_clouds = 3
baseline_degree = 0
model = EmissionAbsorptionPhysicalModel(
    dummy_data,
    n_clouds=n_clouds,
    baseline_degree=baseline_degree,
    bg_temp = 3.77, # assumed background temperature (K)
    seed=1234,
    verbose=True
)
model.add_priors(
    prior_filling_factor=[2.0, 1.0],  # filling factor prior shape
    prior_ff_NHI=1.0e21,  # filling factor * column density prior width (cm-2)
    prior_fwhm2_thermal_fraction=[2.0, 2.0], # thermal FWHM^2 fraction prior shape
    prior_sigma_log10_NHI=0.5, # log-normal column density distribution width
    prior_fwhm2=200.0, # FWHM^2 prior width (km2 s-2)
    prior_velocity=[-15.0, 15.0],  # lower and upper limit of velocity prior (km/s)
    prior_log10_n_alpha=[-6.0, 1.0],  # log10(n_alpha) prior width (cm-3)
    prior_nth_fwhm_1pc=[1.75, 0.25], # non-thermal FWHM at 1 pc prior mean and width (km s-1)
    prior_depth_nth_fwhm_power=[0.3, 0.1], # non-thermal FWHM vs. depth power prior mean and width
    prior_fwhm_L=None, # Assume Gaussian line profile
    prior_baseline_coeffs=None, # Default baseline priors
)
model.add_likelihood()
[4]:
from caribou_hi import physics

# Simulation parameters
filling_factor = np.array([0.2, 0.8, 1.0])
absorption_weight = np.array([0.9, 0.8, 1.0])
log10_nHI = np.array([1.25, 0.0, -0.5])
tkin = np.array([50.0, 300.0, 5000.0])
n_alpha = np.array([1.0e-6, 2.0e-6, 3.0e-6])
depth = np.array([5.0, 25.0, 300.0])
nth_fwhm_1pc = np.array([2.0, 1.75, 1.5])
depth_nth_fwhm_power = np.array([0.2, 0.3, 0.4])

tspin = physics.calc_spin_temp(tkin, 10.0**log10_nHI, n_alpha).eval()
print("tspin", tspin)

fwhm2_nonthermal = physics.calc_nonthermal_fwhm(depth, nth_fwhm_1pc, depth_nth_fwhm_power)**2.0
print("fwhm2_nonthermal", fwhm2_nonthermal)

fwhm2_thermal = physics.calc_thermal_fwhm2(tkin)
print("fwhm2_thermal", fwhm2_thermal)

fwhm2 = fwhm2_nonthermal + fwhm2_thermal
print("fwhm2", fwhm2)

fwhm2_thermal_fraction = fwhm2_thermal/fwhm2
print("fwhm2_thermal_fraction", fwhm2_thermal_fraction)

log10_Pth = np.log10(tkin) + log10_nHI
print("log10_Pth", log10_Pth)

log10_NHI = log10_nHI + np.log10(depth) + 18.489351
print("log10_NHI", log10_NHI)

tau_total = physics.calc_tau_total(10.0**log10_NHI, tspin)
print("tau_total", tau_total)

#wt_ff_tspin = absorption_weight / filling_factor / tspin
#print("wt_ff_tspin", wt_ff_tspin)
wt_ff_fwhm2_thermal = absorption_weight / filling_factor / fwhm2_thermal
print("wt_ff_fwhm2_thermal", wt_ff_fwhm2_thermal)

sim_params = {
    "wt_ff_fwhm2_thermal": wt_ff_fwhm2_thermal,
    "absorption_weight": absorption_weight,
    "filling_factor": filling_factor,
    "log10_NHI": log10_NHI,
    "tau_total": tau_total,
    "tspin": tspin,
    "fwhm2": fwhm2,
    "fwhm2_thermal_fraction": fwhm2_thermal_fraction,
    "velocity": np.array([-5.0, 5.0, 10.0]),
    "n_alpha": n_alpha,
    "nth_fwhm_1pc": nth_fwhm_1pc,
    "depth_nth_fwhm_power": depth_nth_fwhm_power,
    "baseline_emission_norm": [0.0],
    "baseline_absorption_norm": [0.0],
}

# add derived quantities to sim_params
for key in model.cloud_deterministics:
    if key not in sim_params.keys():
        sim_params[key] = model.model[key].eval(sim_params, on_unused_input="ignore")

# Evaluate and save simulated observation
emission = model.model["emission"].eval(sim_params, on_unused_input="ignore")
absorption = model.model["absorption"].eval(sim_params, on_unused_input="ignore")

data = {
    "emission": SpecData(
        emission_axis,
        emission,
        rms_emission,
        xlabel=r"$V_{\rm LSR}$ (km s$^{-1}$)",
        ylabel=r"$T_B$ (K)",
    ),
    "absorption": SpecData(
        absorption_axis,
        absorption,
        rms_absorption,
        xlabel=r"$V_{\rm LSR}$ (km s$^{-1}$)",
        ylabel=r"1 - exp(-$\tau$)",
    ),
}
tspin [  49.97911226  298.78900835 4259.26982821]
fwhm2_nonthermal [  7.61461575  21.12711044 215.71459099]
fwhm2_thermal [  2.2876605  13.725963  228.76605  ]
fwhm2 [  9.90227625  34.85307344 444.48064099]
fwhm2_thermal_fraction [0.2310237 0.3938236 0.5146817]
log10_Pth [2.94897    2.47712125 3.19897   ]
log10_NHI [20.438321   19.88729101 20.46647225]
tau_total [3.01218479 0.14166923 0.03771258]
wt_ff_fwhm2_thermal [1.9670751  0.07285463 0.00437128]
[5]:
sim_params
[5]:
{'wt_ff_fwhm2_thermal': array([1.9670751 , 0.07285463, 0.00437128]),
 'absorption_weight': array([0.9, 0.8, 1. ]),
 'filling_factor': array([0.2, 0.8, 1. ]),
 'log10_NHI': array([20.438321  , 19.88729101, 20.46647225]),
 'tau_total': array([3.01218479, 0.14166923, 0.03771258]),
 'tspin': array([  49.97911226,  298.78900835, 4259.26982821]),
 'fwhm2': array([  9.90227625,  34.85307344, 444.48064099]),
 'fwhm2_thermal_fraction': array([0.2310237, 0.3938236, 0.5146817]),
 'velocity': array([-5.,  5., 10.]),
 'n_alpha': array([1.e-06, 2.e-06, 3.e-06]),
 'nth_fwhm_1pc': array([2.  , 1.75, 1.5 ]),
 'depth_nth_fwhm_power': array([0.2, 0.3, 0.4]),
 'baseline_emission_norm': [0.0],
 'baseline_absorption_norm': [0.0],
 'log10_n_alpha': array([-5.75395025, -6.4990086 , -5.89087838]),
 'fwhm2_thermal': array([  2.2876605,  13.725963 , 228.76605  ]),
 'tkin': array([  50.,  300., 5000.]),
 'fwhm2_nonthermal': array([  7.61461575,  21.12711044, 215.71459099]),
 'log10_depth': array([0.69897   , 1.39794001, 2.47712125]),
 'log10_nHI': array([ 1.25,  0.  , -0.5 ]),
 'log10_Pth': array([2.94897   , 2.47712125, 3.19897   ]),
 'log10_Pnth': array([4.95888799, 4.1520801 , 4.66111952]),
 'log10_Ptot': array([4.96311227, 4.16116407, 4.67585106])}
[6]:
# Plot data
fig, axes = plt.subplots(2, layout="constrained", sharex=True)
axes[0].plot(data["emission"].spectral, data["emission"].brightness, "k-")
axes[0].plot(data["emission"].spectral, data["emission"].noise, "r-")
axes[1].plot(data["absorption"].spectral, data["absorption"].brightness, "k-", label="Data")
axes[1].plot(data["absorption"].spectral, data["absorption"].noise, "r-", label="Noise")
axes[1].set_xlabel(data["emission"].xlabel)
axes[0].set_ylabel(data["emission"].ylabel)
axes[1].set_ylabel(data["absorption"].ylabel)
_ = axes[1].legend(loc="upper left")
../_images/notebooks_optimization_7_0.png

Optimize

We use the Optimize class for optimization.

[7]:
from bayes_spec import Optimize

# Initialize optimizer
opt = Optimize(
    EmissionAbsorptionPhysicalModel,  # model definition
    data,  # data dictionary
    max_n_clouds=5,  # maximum number of clouds
    baseline_degree=baseline_degree,  # polynomial baseline degree
    bg_temp=3.77,  # assumed background brightness temperature (K)
    seed=1234,  # random seed
    verbose=True,  # verbosity
)

# Define each model
opt.add_priors(
    prior_filling_factor=[2.0, 1.0],  # filling factor prior shape
    prior_ff_NHI=1.0e21,  # filling factor * column density prior width (cm-2)
    prior_fwhm2_thermal_fraction=[2.0, 2.0], # thermal FWHM^2 fraction prior shape
    prior_sigma_log10_NHI=0.5, # log-normal column density distribution width
    prior_fwhm2=200.0, # FWHM^2 prior width (km2 s-2)
    prior_velocity=[-15.0, 15.0],  # lower and upper limit of velocity prior (km/s)
    prior_log10_n_alpha=[-6.0, 1.0],  # log10(n_alpha) prior width (cm-3)
    prior_nth_fwhm_1pc=[1.75, 0.25], # non-thermal FWHM at 1 pc prior mean and width (km s-1)
    prior_depth_nth_fwhm_power=[0.3, 0.1], # non-thermal FWHM vs. depth power prior mean and width
    prior_fwhm_L=None, # Assume Gaussian line profile
    prior_baseline_coeffs=None, # Default baseline priors
)
opt.add_likelihood()

Optimize has created max_n_clouds models, where opt.models[1] has n_clouds=1, opt.models[2] has n_clouds=2, etc.

[8]:
print(opt.models[4])
print(opt.models[4].n_clouds)
<caribou_hi.emission_absorption_physical_model.EmissionAbsorptionPhysicalModel object at 0x716ae0200c30>
4

By default (approx=True), the optimization algorithm first loops over every model and approximates the posterior distribution using variational inference. This is generally a bad idea, since VI is only an approximation and tends to struggle with complex models. Instead we use approx=False to sample every model with MCMC. This is slower but more robust.

We can supply arguments to fit and sample via dictionaries. Whichever model is the first to have a BIC within bic_threshold of the minimum BIC is the “best” model. The algorithm will terminate early if successive models have increasing BICs or fail to converge.

[9]:
fit_kwargs = {
    "rel_tolerance": 0.005,
    "abs_tolerance": 0.005,
    "learning_rate": 0.001,
}
sample_kwargs = {
    "chains": 8,
    "cores": 8,
    "n_init": 200_000,
    "init_kwargs": fit_kwargs,
    "nuts_kwargs": {"target_accept": 0.9},
}
solve_kwargs = {
    "init_params": "random_from_data",
    "n_init": 10,
    "max_iter": 1_000,
    "kl_div_threshold": 0.1,
}
opt.optimize(
    bic_threshold=10.0,
    sample_kwargs=sample_kwargs,
    fit_kwargs=fit_kwargs,
    solve_kwargs=solve_kwargs,
    approx=False,
    start_spread = {"velocity_norm": [0.1, 0.9]},
)
Null hypothesis BIC = 1.433e+06
Sampling n_cloud = 1 posterior...
Initializing NUTS using custom advi+adapt_diag strategy
Convergence achieved at 46600
Interrupted at 46,599 [23%]: Average Loss = 5.0785e+05
Multiprocess sampling (8 chains in 8 jobs)
NUTS: [baseline_emission_norm, baseline_absorption_norm, fwhm2_norm, velocity_norm, log10_n_alpha_norm, nth_fwhm_1pc, depth_nth_fwhm_power, ff_NHI_norm, fwhm2_thermal_fraction, filling_factor, wt_ff_tkin]
Sampling 8 chains for 1_000 tune and 1_000 draw iterations (8_000 + 8_000 draws total) took 157 seconds.
Adding log-likelihood to trace
GMM converged to unique solution
n_cloud = 1 solution = 0 BIC = 1.680e+05

Sampling n_cloud = 2 posterior...
Initializing NUTS using custom advi+adapt_diag strategy
Convergence achieved at 85200
Interrupted at 85,199 [42%]: Average Loss = 2.7226e+05
Multiprocess sampling (8 chains in 8 jobs)
NUTS: [baseline_emission_norm, baseline_absorption_norm, fwhm2_norm, velocity_norm, log10_n_alpha_norm, nth_fwhm_1pc, depth_nth_fwhm_power, ff_NHI_norm, fwhm2_thermal_fraction, filling_factor, wt_ff_tkin]
Sampling 8 chains for 1_000 tune and 1_000 draw iterations (8_000 + 8_000 draws total) took 191 seconds.
Adding log-likelihood to trace
GMM converged to unique solution
n_cloud = 2 solution = 0 BIC = 5.528e+03

Sampling n_cloud = 3 posterior...
Initializing NUTS using custom advi+adapt_diag strategy
Convergence achieved at 104000
Interrupted at 103,999 [51%]: Average Loss = 2.7689e+05
Multiprocess sampling (8 chains in 8 jobs)
NUTS: [baseline_emission_norm, baseline_absorption_norm, fwhm2_norm, velocity_norm, log10_n_alpha_norm, nth_fwhm_1pc, depth_nth_fwhm_power, ff_NHI_norm, fwhm2_thermal_fraction, filling_factor, wt_ff_tkin]
Sampling 8 chains for 1_000 tune and 1_000 draw iterations (8_000 + 8_000 draws total) took 196 seconds.
Adding log-likelihood to trace
GMM converged to unique solution
n_cloud = 3 solution = 0 BIC = -1.299e+03

Sampling n_cloud = 4 posterior...
Initializing NUTS using custom advi+adapt_diag strategy
Convergence achieved at 87300
Interrupted at 87,299 [43%]: Average Loss = 4.4508e+05
Multiprocess sampling (8 chains in 8 jobs)
NUTS: [baseline_emission_norm, baseline_absorption_norm, fwhm2_norm, velocity_norm, log10_n_alpha_norm, nth_fwhm_1pc, depth_nth_fwhm_power, ff_NHI_norm, fwhm2_thermal_fraction, filling_factor, wt_ff_tkin]
Sampling 8 chains for 1_000 tune and 1_000 draw iterations (8_000 + 8_000 draws total) took 406 seconds.
Adding log-likelihood to trace
There were 245 divergences in converged chains.
GMM converged to unique solution
5 of 8 chains appear converged.
Label order mismatch in solution 0
Chain 1 order: [1 0 2 3]
Chain 2 order: [1 0 2 3]
Chain 3 order: [1 0 3 2]
Chain 4 order: [1 0 2 3]
Chain 5 order: [1 0 2 3]
Adopting (first) most common order: [1 0 2 3]
n_cloud = 4 solution = 0 BIC = -1.247e+03

Stopping criteria met.
Sampling n_cloud = 5 posterior...
Initializing NUTS using custom advi+adapt_diag strategy
Convergence achieved at 168500
Interrupted at 168,499 [84%]: Average Loss = 3.0913e+05
Multiprocess sampling (8 chains in 8 jobs)
NUTS: [baseline_emission_norm, baseline_absorption_norm, fwhm2_norm, velocity_norm, log10_n_alpha_norm, nth_fwhm_1pc, depth_nth_fwhm_power, ff_NHI_norm, fwhm2_thermal_fraction, filling_factor, wt_ff_tkin]
Sampling 8 chains for 1_000 tune and 1_000 draw iterations (8_000 + 8_000 draws total) took 425 seconds.
Adding log-likelihood to trace
There were 277 divergences in converged chains.
No solution found!
0 of 8 chains appear converged.

Stopping criteria met.
Stopping early.
[10]:
opt.best_model.n_clouds
[10]:
3
[11]:
from bayes_spec.plots import plot_predictive

posterior = opt.best_model.sample_posterior_predictive(
    thin=100, # keep one in {thin} posterior samples
)
axes = plot_predictive(opt.best_model.data, posterior.posterior_predictive)
axes.ravel()[0].sharex(axes.ravel()[1])
Sampling: [absorption, emission]
../_images/notebooks_optimization_15_3.png
[12]:
plt.plot(opt.bics.keys(), opt.bics.values(), 'ko')
plt.xlabel("Number of Components")
plt.ylabel("BIC")
[12]:
Text(0, 0.5, 'BIC')
../_images/notebooks_optimization_16_1.png
[13]:
plt.plot(opt.bics.keys(), opt.bics.values(), 'ko')
plt.xlabel("Number of Components")
plt.ylabel("BIC")
plt.ylim(-2000, 0)
[13]:
(-2000.0, 0.0)
../_images/notebooks_optimization_17_1.png
[ ]: