-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathutils_emcee_poisson.py
197 lines (161 loc) · 6.99 KB
/
utils_emcee_poisson.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
import os
import emcee
import numpy as np
import pandas as pd
import utils_logging as lu
import matplotlib.pyplot as plt
from scipy.special import factorial
from scipy.optimize import curve_fit
from chainconsumer import ChainConsumer
'''
2020 A. Moller
Module for emcee fitting of a sigmoid for efficiency
assumes that data distribution is poisson
'''
def compute_sigma_level(trace1, trace2, nbins=20):
"""From a set of traces, bin by number of standard deviations"""
L, xbins, ybins = np.histogram2d(trace1, trace2, nbins)
L[L == 0] = 1E-16
logL = np.log(L)
shape = L.shape
L = L.ravel()
# obtain the indices to sort and unsort the flattened array
i_sort = np.argsort(L)[::-1]
i_unsort = np.argsort(i_sort)
L_cumsum = L[i_sort].cumsum()
L_cumsum /= L_cumsum[-1]
xbins = 0.5 * (xbins[1:] + xbins[:-1])
ybins = 0.5 * (ybins[1:] + ybins[:-1])
return xbins, ybins, L_cumsum[i_unsort].reshape(shape)
def plot_MCMC_trace(ax, xdata, ydata, trace, scatter=False, **kwargs):
"""Plot traces and contours"""
xbins, ybins, sigma = compute_sigma_level(trace[0], trace[1])
ax.contour(xbins, ybins, sigma.T, levels=[0.683, 0.955], **kwargs)
if scatter:
ax.plot(trace[0], trace[1], ',k', alpha=0.1)
ax.set_xlabel(r'$\alpha$')
ax.set_ylabel(r'$\beta$')
def plot_MCMC_model(ax, xdata, ydata, trace):
"""Plot the model and 2sigma contours"""
ax.plot(xdata, ydata, 'ok')
alpha, beta = trace[:2]
xfit = np.linspace(-20, 120, 10)
yfit = alpha[:, None] + beta[:, None] * xfit
mu = yfit.mean(0)
sig = 2 * yfit.std(0)
ax.plot(xfit, mu, '-k')
ax.fill_between(xfit, mu - sig, mu + sig, color='lightgray')
ax.set_xlabel('x')
ax.set_ylabel('y')
def fit_MCMC(df, fit_param, path_plots):
alpha_low, alpha_high = fit_param['alpha']
beta_low, beta_high = fit_param['beta']
A_low, A_high = fit_param['A']
# Define our posterior using Python functions
def log_prior(theta):
# flat prior for all parameters
A, alpha, beta = theta
if A_low < A < A_high and\
alpha_low < alpha < alpha_high and \
beta_low < beta < beta_high:
return 0.0
else:
return -np.inf
def log_likelihood(theta, x, y, ndata, nsim):
A, alpha, beta = theta
# sigmoid model for efficiency (y)
model = sigmoid_func(x, A, alpha, beta) * nsim
# Poisson likelihood of the probability of getting ndata with counts=ndata and rate=ideal number of events
log_L = - np.sum((model) + np.log(factorial(ndata)) -
(ndata * np.log(model)))
return log_L
def log_posterior(theta, x, y, ndata, nsim):
return log_prior(theta) + log_likelihood(theta, x, y, ndata, nsim)
# Here we'll set up the computation. emcee combines multiple "walkers",
# each of which is its own MCMC chain. The number of trace results will
# be nwalkers * nsteps
ndim = 3
nwalkers = 12 # number of MCMC walkers
nburn = 1000 # "burn-in" period to let chains stabilize
nsteps = 30000 # number of MCMC steps to take
# set theta near the maximum likelihood, with
np.random.seed(0)
starting_guesses = np.random.random((nwalkers, ndim))
# Here's the function call where all the work happens:
sampler = emcee.EnsembleSampler(
nwalkers, ndim, log_posterior, args=[df['x'].values, df['ratio'].values, df['ndata'].values, df['nsim'].values])
# Clear and run the production chain.
pos_ini = [np.mean(fit_param[key]) for key in fit_param.keys()]
pos = [pos_ini + 1e-4 * np.random.randn(ndim) for i in range(nwalkers)]
sampler.run_mcmc(pos, nsteps, rstate0=np.random.get_state())
for var in range(ndim):
plt.clf()
plt.plot(sampler.chain[:, :, var].T, color="k", alpha=0.4)
plt.savefig(path_plots + '/line-time_' + str(var) + '.png')
samples = sampler.chain[:, nburn:, :].reshape((-1, ndim))
A_mcmc, alpha_mcmc, beta_mcmc = map(lambda v: (v[1], v[2] - v[1], v[1] - v[0]),
zip(*np.percentile(samples, [16, 50, 84],
axis=0)))
list_mcmc = [A_mcmc, alpha_mcmc, beta_mcmc]
theta_mcmc = [p[0] for p in list_mcmc]
min_theta_mcmc = [p[0] - p[1] for p in list_mcmc]
max_theta_mcmc = [p[0] + p[2] for p in list_mcmc]
list_mcmc = map(str, list_mcmc)
# do corner plot
# plt.clf()
# cha = ChainConsumer()
# cha.add_chain(samples, parameters=['a', 'alpha', 'beta'])
# fig = cha.plotter.plot(filename=f"{path_plots}/triangle.png")
plt.clf()
variable = df.ratio_variable
xx = np.linspace(df['x'].min(), df['x'].max(), 200)
plt.plot(xx, sigmoid_func(xx, min_theta_mcmc[0], min_theta_mcmc[1],
min_theta_mcmc[2]) / theta_mcmc[0], color='yellow',)
plt.plot(xx, sigmoid_func(xx, max_theta_mcmc[0], max_theta_mcmc[1], max_theta_mcmc[2]) / theta_mcmc[0], color='yellow',
label='1 sigma') # mcmc fit
plt.plot(xx, sigmoid_func(xx, theta_mcmc[0], theta_mcmc[1], theta_mcmc[2]) / theta_mcmc[0],
color='orange', label='Emcee sigmoid fit') # mcmc fit
plt.errorbar(df['x'], df['ratio'] / theta_mcmc[0], yerr=df['err_ratio'], fmt='o',
label="emcee selection function")
plt.ylim(-.1, 1.1)
plt.xlabel(variable)
plt.legend()
plt.savefig(f"{path_plots}/fitted_model_mcmc_{variable}.png")
return theta_mcmc, min_theta_mcmc, max_theta_mcmc
def sigmoid_func(x, a, alph, bet):
return a / (1 + np.exp((+alph * x) - bet))
def emcee_fitting(df, path_plots,min_var=15):
# Functional fit for initial guess
# Initial params
low_bounds = [0.1, 1., 20]
high_bounds = [4, 4, 70]
popt, pcov = curve_fit(sigmoid_func,
df['x'].values, df['ratio'].values)
lu.print_green('Functional initial guess', popt)
low_bounds = [p - 3 * p / 10. for p in popt]
high_bounds = [p + 3 * p / 10. for p in popt]
# Plot initial guess
variable = df.ratio_variable
plt.clf()
fig = plt.figure()
xx = np.linspace(df['x'].min(), df['x'].max(), 200)
plt.errorbar(df['x'], df['ratio'], yerr=df['err_ratio'], fmt='o')
plt.plot(xx, sigmoid_func(xx, popt[0], popt[1], popt[2]) /
popt[0], color='orange', label='Functional sigmoid fit')
plt.xlabel(variable)
plt.ylabel('ratio')
plt.ylim(-.1, 1.1)
plt.savefig(f"{path_plots}/fitted_model_func_{variable}.png")
plt.clf()
# save initial fit params
fit_param = {}
fit_param['A'] = (low_bounds[0], high_bounds[0])
fit_param['alpha'] = (low_bounds[1], high_bounds[1])
fit_param['beta'] = (low_bounds[2], high_bounds[2])
lu.print_blue(
'Emcee fitting sigmoid to data/simulation ratio (Poisson errors)')
theta_mcmc, min_theta_mcmc, max_theta_mcmc = fit_MCMC(
df, fit_param, path_plots)
lu.print_green('emcee:',theta_mcmc)
lu.print_blue('Finished emcee')
return theta_mcmc, min_theta_mcmc, max_theta_mcmc