1. Levenberg Marquardt Fitting#

1.1. Conventions and best approaches#

S0 Signal for unbound state
S1 Signal for bound state
K equilibrium constant (Kd or pKa)
order data from unbound to bound (e.g. cl: 0–>150 mM; pH 9–>5)

1.2. Initial imports#

[1]:

import numpy as np
import scipy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import lmfit

1.3. Single Cl titration.#

[2]:

df = pd.read_table("../../tests/data/copyIP.txt")
sb.scatterplot(
    data=df,
    x="cl",
    y="F",
    hue=df.cl * df.F,
    palette="crest",
    s=200,
    alpha=0.8,
    legend=False,
)

[2]:

<Axes: xlabel='cl', ylabel='F'>

[3]:

def residual(pars, x, y=None):
    S0 = pars["S0"]
    S1 = pars["S1"]
    Kd = pars["Kd"]
    model = (S0 + S1 * x / Kd) / (1 + x / Kd)
    if y is None:
        return model
    return y - model


params = lmfit.Parameters()
params.add("S0", value=df.F[0])
params.add("S1", value=100)
params.add("Kd", value=50, vary=True)

out = lmfit.minimize(
    residual,
    params,
    args=(
        df.cl,
        df.F,
    ),
)

xdelta = (df.cl.max() - df.cl.min()) / 500
xfit = np.arange(df.cl.min() - xdelta, df.cl.max() + xdelta, xdelta)
yfit = residual(out.params, xfit)
print(lmfit.fit_report(out.params))
plt.plot(df.cl, df.F, "o", xfit, yfit, "-")

[[Variables]]
    S0:  13408867.8 +/- 87130.3362 (0.65%) (init = 1.33579e+07)
    S1:  563537.064 +/- 106411.960 (18.88%) (init = 100)
    Kd:  58.3187767 +/- 2.24671605 (3.85%) (init = 50)
[[Correlations]] (unreported correlations are < 0.100)
    C(S1, Kd) = -0.7123
    C(S0, Kd) = -0.6562
    C(S0, S1) = +0.2747

[3]:

[<matplotlib.lines.Line2D at 0x7f52873b9c90>,
 <matplotlib.lines.Line2D at 0x7f528737d750>]

[4]:

def residuals(p):
    S0 = p["S0"]
    S1 = p["S1"]
    Kd = p["Kd"]
    model = (S0 + S1 * df.cl / Kd) / (1 + df.cl / Kd)
    return model - df.F


mini = lmfit.Minimizer(residuals, params)
res = mini.minimize()
ci, tr = lmfit.conf_interval(mini, res, sigmas=[0.68, 0.95], trace=True)
print(lmfit.ci_report(ci, with_offset=False, ndigits=2))
print(lmfit.fit_report(res, show_correl=False, sort_pars=True))

    95.00% 68.00% _BEST_ 68.00% 95.00%
 S0:13197616.3413314946.3513408867.7913503300.3813622729.13
 S1:300911.47447991.63563537.06677869.66819977.61
 Kd:  53.13  55.96  58.32  60.79  64.07
[[Fit Statistics]]
    # fitting method   = leastsq
    # function evals   = 17
    # data points      = 9
    # variables        = 3
    chi-square         = 8.3839e+10
    reduced chi-square = 1.3973e+10
    Akaike info crit   = 212.594471
    Bayesian info crit = 213.186145
[[Variables]]
    Kd:  58.3187767 +/- 2.24671605 (3.85%) (init = 50)
    S0:  13408867.8 +/- 87130.3362 (0.65%) (init = 1.33579e+07)
    S1:  563537.064 +/- 106411.960 (18.88%) (init = 100)

[5]:

names = res.params.keys()
i = 0
gs = plt.GridSpec(4, 4)
sx = {}
sy = {}
for fixed in names:
    j = 0
    for free in names:
        if j in sx and i in sy:
            ax = plt.subplot(gs[i, j], sharex=sx[j], sharey=sy[i])
        elif i in sy:
            ax = plt.subplot(gs[i, j], sharey=sy[i])
            sx[j] = ax
        elif j in sx:
            ax = plt.subplot(gs[i, j], sharex=sx[j])
            sy[i] = ax
        else:
            ax = plt.subplot(gs[i, j])
            sy[i] = ax
            sx[j] = ax
        if i < 3:
            plt.setp(ax.get_xticklabels(), visible=True)
        else:
            ax.set_xlabel(free)

        if j > 0:
            plt.setp(ax.get_yticklabels(), visible=False)
        else:
            ax.set_ylabel(fixed)

        rest = tr[fixed]
        prob = rest["prob"]
        f = prob < 0.96

        x, y = rest[free], rest[fixed]
        ax.scatter(x[f], y[f], c=1 - prob[f], s=25 * (1 - prob[f] + 0.5))
        ax.autoscale(1, 1)
        j += 1
    i += 1

[6]:

names = list(res.params.keys())

plt.figure()
for i in range(3):
    for j in range(3):
        indx = 9 - j * 3 - i
        ax = plt.subplot(3, 3, indx)
        ax.ticklabel_format(style="sci", scilimits=(-2, 2), axis="y")

        # set-up labels and tick marks
        ax.tick_params(labelleft=False, labelbottom=False)
        if indx in (1, 4, 7):
            plt.ylabel(names[j])
            ax.tick_params(labelleft=True)
        if indx == 1:
            ax.tick_params(labelleft=True)
        if indx in (7, 8, 9):
            plt.xlabel(names[i])
            ax.tick_params(labelbottom=True)
            [label.set_rotation(45) for label in ax.get_xticklabels()]

        if i != j:
            x, y, m = lmfit.conf_interval2d(mini, res, names[i], names[j], 20, 20)
            plt.contourf(x, y, m, np.linspace(0, 1, 10))

            x = tr[names[i]][names[i]]
            y = tr[names[i]][names[j]]
            pr = tr[names[i]]["prob"]
            s = np.argsort(x)
            plt.scatter(x[s], y[s], c=pr[s], s=30, lw=1)

        else:
            x = tr[names[i]][names[i]]
            y = tr[names[i]]["prob"]

            t, s = np.unique(x, True)
            f = scipy.interpolate.interp1d(t, y[s], "slinear")
            xn = np.linspace(x.min(), x.max(), 50)
            plt.plot(xn, f(xn), lw=1)
            plt.ylabel("prob")
            ax.tick_params(labelleft=True)

plt.tight_layout()

[7]:

lmfit.report_fit(out.params, min_correl=0.25)

ci, trace = lmfit.conf_interval(mini, res, sigmas=[1, 2], trace=True)
lmfit.printfuncs.report_ci(ci)

fig, axes = plt.subplots(2, 2, figsize=(12.8, 9.6), sharey=True)
cx1, cy1, prob = trace["S0"]["S0"], trace["S0"]["Kd"], trace["S0"]["prob"]
cx2, cy2, prob2 = trace["S1"]["S1"], trace["S1"]["Kd"], trace["S1"]["prob"]

axes[0][0].scatter(cx1, cy1, c=prob, s=30)
axes[0][0].set_xlabel("S0")
axes[0][0].set_ylabel("Kd")

axes[0][1].scatter(cx2, cy2, c=prob2, s=30)
axes[0][1].set_xlabel("S1")

cx, cy, grid = lmfit.conf_interval2d(mini, res, "S0", "Kd", 30, 30)
ctp = axes[1][0].contourf(cx, cy, grid, np.linspace(0, 1, 11))
fig.colorbar(ctp, ax=axes[1][0])
axes[1][0].set_xlabel("S0")
axes[1][0].set_ylabel("Kd")

cx, cy, grid = lmfit.conf_interval2d(mini, res, "S1", "Kd", 30, 30)
ctp = axes[1][1].contourf(cx, cy, grid, np.linspace(0, 1, 11))
fig.colorbar(ctp, ax=axes[1][1])
axes[1][1].set_xlabel("S1")
axes[1][1].set_ylabel("Kd")

[[Variables]]
    S0:  13408867.8 +/- 87130.3362 (0.65%) (init = 1.33579e+07)
    S1:  563537.064 +/- 106411.960 (18.88%) (init = 100)
    Kd:  58.3187767 +/- 2.24671605 (3.85%) (init = 50)
[[Correlations]] (unreported correlations are < 0.250)
    C(S1, Kd) = -0.7123
    C(S0, Kd) = -0.6562
    C(S0, S1) = +0.2747
       95.45%    68.27%    _BEST_    68.27%    95.45%
 S0:-217226.68396-94491.4465413408867.78781+95008.79621+219988.97144
 S1:-270193.31667-116252.05687563537.06418+115024.41863+263650.82002
 Kd:  -5.32812  -2.37783  58.31878  +2.48963  +5.92424

[7]:

Text(0, 0.5, 'Kd')

[8]:

x, y, grid = lmfit.conf_interval2d(mini, res, "S0", "S1", 30, 30)
plt.contourf(x, y, grid, np.linspace(0, 1, 11))
plt.xlabel("S0")
plt.colorbar()
plt.ylabel("S1")

[8]:

Text(0, 0.5, 'S1')

1.4. Fit titration with multiple ts#

For example data collected with multiple labelblocks in Tecan plate reader.

“A01”: pH titration with y1 and y2.

[9]:

df = pd.read_csv("../../tests/data/A01.dat", sep=" ", names=["x", "y1", "y2"])
df = df[::-1].reset_index(drop=True)
df

[9]:

	x	y1	y2
0	9.030000	29657.0	22885.0
1	8.373333	35200.0	16930.0
2	7.750000	44901.0	9218.0
3	7.073333	53063.0	3758.0
4	6.460000	54202.0	2101.0
5	5.813333	54851.0	1542.0
6	4.996667	51205.0	1358.0

1.4.1. lmfit of single y1 using analytical Jacobian#

It computes the Jacobian of the fz. Mind that the residual (i.e. y - fz) will be actually minimized.

[10]:

import sympy

x, S0_1, S1_1, K = sympy.symbols("x S0_1 S1_1 K")
f = (S0_1 + S1_1 * 10 ** (K - x)) / (1 + 10 ** (K - x))
print(sympy.diff(f, S0_1))
print(sympy.diff(f, S1_1))
print(sympy.diff(f, K))

1/(10**(K - x) + 1)
10**(K - x)/(10**(K - x) + 1)
10**(K - x)*S1_1*log(10)/(10**(K - x) + 1) - 10**(K - x)*(10**(K - x)*S1_1 + S0_1)*log(10)/(10**(K - x) + 1)**2

[11]:

f2 = (S0_1 + S1_1 * x / K) / (1 + x / K)
print(sympy.diff(f2, S0_1))
print(sympy.diff(f2, S1_1))
print(sympy.diff(f2, K))

1/(1 + x/K)
x/(K*(1 + x/K))
-S1_1*x/(K**2*(1 + x/K)) + x*(S0_1 + S1_1*x/K)/(K**2*(1 + x/K)**2)

[12]:

def residual(pars, x, data):
    S0 = pars["S0"]
    S1 = pars["S1"]
    K = pars["K"]
    # model = (S0 + S1 * x / Kd) / (1 + x / Kd)
    x = np.array(x)
    y = np.array(data)
    model = (S0 + S1 * 10 ** (K - x)) / (1 + 10 ** (K - x))
    if data is None:
        return model
    return y - model


# Try Jacobian
def dfunc(pars, x, data=None):
    S0_1 = pars["S0"]
    S1_1 = pars["S1"]
    K = pars["K"]
    kx = np.array(10 ** (K - x))
    return np.array(
        [
            -1 / (kx + 1),
            -kx / (kx + 1),
            -kx * np.log(10) * (S1_1 / (kx + 1) - (kx * S1_1 + S0_1) / (kx + 1) ** 2),
        ]
    )
    # kx * S1_1 * np.log(10) / (kx + 1) - kx * (kx * S1_1 + S0_1) * np.log(10) / (kx + 1)**2])


params = lmfit.Parameters()
params.add("S0", value=25000)
params.add("S1", value=50000, min=0.0)
params.add("K", value=7, min=2.0, max=12.0)

# out = lmfit.minimize(residual, params, args=(df.x,), kws={'data':df.y1})
# mini = lmfit.Minimizer(residual, params, fcn_args=(df.x, df.y2))
mini = lmfit.Minimizer(residual, params, fcn_args=(df.x,), fcn_kws={"data": df.y1})
# res= mini.minimize()
res = mini.leastsq(Dfun=dfunc, col_deriv=True, ftol=1e-8)

fit = residual(params, df.x, None)
print(lmfit.report_fit(res))

ci = lmfit.conf_interval(mini, res, sigmas=[1, 2, 3])
lmfit.printfuncs.report_ci(ci)

[[Fit Statistics]]
    # fitting method   = leastsq
    # function evals   = 9
    # data points      = 7
    # variables        = 3
    chi-square         = 12308015.2
    reduced chi-square = 3077003.79
    Akaike info crit   = 106.658958
    Bayesian info crit = 106.496688
[[Variables]]
    S0:  26638.8377 +/- 2455.91825 (9.22%) (init = 25000)
    S1:  54043.3592 +/- 979.995977 (1.81%) (init = 50000)
    K:   8.06961091 +/- 0.14940678 (1.85%) (init = 7)
[[Correlations]] (unreported correlations are < 0.100)
    C(S0, K)  = -0.7750
    C(S1, K)  = -0.4552
    C(S0, S1) = +0.2046
None
       99.73%    95.45%    68.27%    _BEST_    68.27%    95.45%    99.73%
 S0:-85235.74104-8376.40372-2895.5618126638.83771+2558.77424+5999.31275+12360.60692
 S1:-6192.81418-2734.30623-1098.2204254043.35921+1113.18257+2829.54298+6725.37841
 K :  -0.98139  -0.40197  -0.15949   8.06961  +0.16276  +0.42591  +1.50918

[13]:

print(lmfit.ci_report(ci, with_offset=False, ndigits=2))

    99.73% 95.45% 68.27% _BEST_ 68.27% 95.45% 99.73%
 S0:-58596.9018262.4323743.2826638.8429197.6132638.1538999.44
 S1:47850.5551309.0552945.1454043.3655156.5456872.9060768.74
 K :   7.09   7.67   7.91   8.07   8.23   8.50   9.58

1.4.1.1. emcee#

[14]:

res.params.add("__lnsigma", value=np.log(0.1), min=np.log(0.001), max=np.log(1e4))
resMC = lmfit.minimize(
    residual, method="emcee", nan_policy="omit", params=res.params, args=(df.x, df.y1)
)

100%|██████████| 1000/1000 [00:11<00:00, 90.89it/s]

The chain is shorter than 50 times the integrated autocorrelation time for 4 parameter(s). Use this estimate with caution and run a longer chain!
N/50 = 20;
tau: [36.01534895 31.14085481 31.73843752 49.51739801]

[15]:

plt.plot(resMC.acceptance_fraction, "o")
plt.xlabel("walker")
plt.ylabel("acceptance frac")

[15]:

Text(0, 0.5, 'acceptance frac')

[16]:

import corner

tr = [v for v in resMC.params.valuesdict().values()]
emcee_plot = corner.corner(
    resMC.flatchain,
    labels=resMC.var_names,
    truths=list(resMC.params.valuesdict().values()),
)
# truths=tr[:-1])

WARNING:root:Pandas support in corner is deprecated; use ArviZ directly
WARNING:root:Too few points to create valid contours

1.4.2. using lmfit with np.r_ trick#

[17]:

# %%timeit #62ms
def residual2(pars, x, data=None):
    K = pars["K"]
    S0_1 = pars["S0_1"]
    S1_1 = pars["S1_1"]
    S0_2 = pars["S0_2"]
    S1_2 = pars["S1_2"]
    model_0 = (S0_1 + S1_1 * 10 ** (K - x[0])) / (1 + 10 ** (K - x[0]))
    model_1 = (S0_2 + S1_2 * 10 ** (K - x[1])) / (1 + 10 ** (K - x[1]))
    if data is None:
        return np.r_[model_0, model_1]
    return np.r_[data[0] - model_0, data[1] - model_1]


params2 = lmfit.Parameters()
params2.add("K", value=7.0, min=2.0, max=12.0)
params2.add("S0_1", value=df.y1[0], min=0.0)
params2.add("S0_2", value=df.y2[0], min=0.0)
params2.add("S1_1", value=df.y1.iloc[-1], min=0.0)
params2.add("S1_2", value=df.y2.iloc[-1], min=0.0)
mini2 = lmfit.Minimizer(
    residual2, params2, fcn_args=([df.x, df.x],), fcn_kws={"data": [df.y1, df.y2]}
)
res2 = mini2.minimize()
print(lmfit.fit_report(res2))

ci2, tr2 = lmfit.conf_interval(mini2, res2, sigmas=[0.68, 0.95], trace=True)
print(lmfit.ci_report(ci2, with_offset=False, ndigits=2))

[[Fit Statistics]]
    # fitting method   = leastsq
    # function evals   = 37
    # data points      = 14
    # variables        = 5
    chi-square         = 12471473.3
    reduced chi-square = 1385719.25
    Akaike info crit   = 201.798560
    Bayesian info crit = 204.993846
[[Variables]]
    K:     8.07255029 +/- 0.07600777 (0.94%) (init = 7)
    S0_1:  26601.3458 +/- 1425.69913 (5.36%) (init = 29657)
    S0_2:  25084.4189 +/- 1337.07982 (5.33%) (init = 22885)
    S1_1:  54034.5806 +/- 627.642479 (1.16%) (init = 51205)
    S1_2:  1473.57871 +/- 616.944649 (41.87%) (init = 1358)
[[Correlations]] (unreported correlations are < 0.100)
    C(K, S0_1)    = -0.6816
    C(K, S0_2)    = +0.6255
    C(S0_1, S0_2) = -0.4264
    C(K, S1_1)    = -0.3611
    C(K, S1_2)    = +0.3161
    C(S0_2, S1_1) = -0.2259
    C(S0_1, S1_2) = -0.2155
    C(S1_1, S1_2) = -0.1141
      95.00% 68.00% _BEST_ 68.00% 95.00%
 K   :   7.91   7.99   8.07   8.15   8.24
 S0_1:23210.9025078.6226601.3528045.4929623.53
 S0_2:22232.9723723.9425084.4226514.8828263.75
 S1_1:52629.0453378.2454034.5854695.2655460.17
 S1_2:  72.04 824.011473.582118.982855.89

[18]:

xfit = np.linspace(df.x.min(), df.x.max(), 100)
yfit0 = residual2(params2, [xfit, xfit])
yfit0 = yfit0.reshape(2, 100)
yfit = residual2(res2.params, [xfit, xfit])
yfit = yfit.reshape(2, 100)
plt.plot(df.x, df.y1, "o", df.x, df.y2, "s")
plt.plot(xfit, yfit[0], "-", xfit, yfit[1], "-")
plt.plot(xfit, yfit0[0], "--", xfit, yfit0[1], "--")
plt.grid(True)

1.4.3. lmfit constraints aiming for generality#

I believe a name convention would be more robust than relying on OrderedDict Params object.

[19]:

"S0_1".split("_")[0]

[19]:

'S0'

[20]:

def exception_fcn_handler(func):
    def inner_function(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except TypeError:
            print(
                f"{func.__name__} only takes (1D) vector as argument besides lmfit.Parameters."
            )

    return inner_function


@exception_fcn_handler
def titration_pH(params, pH):
    p = {k.split("_")[0]: v for k, v in params.items()}
    return (p["S0"] + p["S1"] * 10 ** (p["K"] - pH)) / (1 + 10 ** (p["K"] - pH))


def residues(params, x, y, fcn):
    return y - fcn(params, x)


p1 = lmfit.Parameters()
p2 = lmfit.Parameters()
p1.add("K_1", value=7.0, min=2.0, max=12.0)
p2.add("K_2", value=7.0, min=2.0, max=12.0)
p1.add("S0_1", value=df.y1.iloc[0], min=0.0)
p2.add("S0_2", value=df.y2.iloc[0], min=0.0)
p1.add("S1_1", value=df.y1.iloc[-1], min=0.0)
p2.add("S1_2", value=df.y2.iloc[-1])

print(
    residues(p1, np.array(df.x), [1.97, 1.8, 1.7, 0.1, 0.1, 0.16, 0.01], titration_pH)
)


def gobjective(params, xl, yl, fcnl):
    nset = len(xl)
    res = []
    for i in range(nset):
        pi = {k: v for k, v in params.valuesdict().items() if k[-1] == f"{i+1}"}
        res = np.r_[res, residues(pi, xl[i], yl[i], fcnl[i])]
        # res = np.r_[res, yl[i] - fcnl[i](parsl[i], x[i])]
    return res


print(gobjective(p1 + p2, [df.x, df.x], [df.y1, df.y2], [titration_pH, titration_pH]))

[-29854.26823732 -30530.32007939 -32908.60749879 -39523.42660007
 -46381.47878947 -49888.5091843  -50993.25866394]
[  -199.23823732   4667.87992061  11990.69250121  13539.47339993
   7820.42121053   4962.3308157     211.73133606    199.04406603
  -5080.73278499 -10416.86307191  -9270.08900503  -4075.72045662
  -1131.04796128   -211.52498939]

Here single.

[21]:

mini = lmfit.Minimizer(
    residues,
    p1,
    fcn_args=(
        df.x,
        df.y1,
        titration_pH,
    ),
)
res = mini.minimize()

fit = titration_pH(res.params, df.x)
print(lmfit.report_fit(res))
plt.plot(df.x, df.y1, "o", df.x, fit, "--")
ci = lmfit.conf_interval(mini, res, sigmas=[1, 2])
lmfit.printfuncs.report_ci(ci)

[[Fit Statistics]]
    # fitting method   = leastsq
    # function evals   = 25
    # data points      = 7
    # variables        = 3
    chi-square         = 12308015.2
    reduced chi-square = 3077003.79
    Akaike info crit   = 106.658958
    Bayesian info crit = 106.496688
[[Variables]]
    K_1:   8.06961042 +/- 0.14940740 (1.85%) (init = 7)
    S0_1:  26638.8440 +/- 2455.92762 (9.22%) (init = 29657)
    S1_1:  54043.3607 +/- 979.995185 (1.81%) (init = 51205)
[[Correlations]] (unreported correlations are < 0.100)
    C(K_1, S0_1)  = -0.7750
    C(K_1, S1_1)  = -0.4552
    C(S0_1, S1_1) = +0.2046
None
         95.45%    68.27%    _BEST_    68.27%    95.45%
 K_1 :  -0.40197  -0.15948   8.06961  +0.16276  +0.42592
 S0_1:-8376.39586-2895.5681226638.84401+2558.76794+5999.32366
 S1_1:-2734.30835-1098.2218354043.36069+1113.18102+2829.55353

Now global.

[22]:

pg = p1 + p2
pg["K_2"].expr = "K_1"
gmini = lmfit.Minimizer(
    gobjective,
    pg,
    fcn_args=([df.x[:], df.x], [df.y1[:], df.y2], [titration_pH, titration_pH]),
)
gres = gmini.minimize()
print(lmfit.fit_report(gres))

pp1 = {k: v for k, v in gres.params.valuesdict().items() if k.split("_")[1] == f"{1}"}
pp2 = {k: v for k, v in gres.params.valuesdict().items() if k.split("_")[1] == f"{2}"}
xfit = np.linspace(df.x.min(), df.x.max(), 100)
yfit1 = titration_pH(pp1, xfit)
yfit2 = titration_pH(pp2, xfit)
plt.plot(df.x, df.y1, "o", xfit, yfit1, "--")
plt.plot(df.x, df.y2, "s", xfit, yfit2, "--")

[[Fit Statistics]]
    # fitting method   = leastsq
    # function evals   = 37
    # data points      = 14
    # variables        = 5
    chi-square         = 12471473.3
    reduced chi-square = 1385719.25
    Akaike info crit   = 201.798560
    Bayesian info crit = 204.993846
[[Variables]]
    K_1:   8.07255029 +/- 0.07600777 (0.94%) (init = 7)
    S0_1:  26601.3458 +/- 1425.69913 (5.36%) (init = 29657)
    S1_1:  54034.5806 +/- 627.642480 (1.16%) (init = 51205)
    K_2:   8.07255029 +/- 0.07600777 (0.94%) == 'K_1'
    S0_2:  25084.4189 +/- 1337.07982 (5.33%) (init = 22885)
    S1_2:  1473.57871 +/- 616.944649 (41.87%) (init = 1358)
[[Correlations]] (unreported correlations are < 0.100)
    C(K_1, S0_1)  = -0.6816
    C(K_1, S0_2)  = +0.6255
    C(S0_1, S0_2) = -0.4264
    C(K_1, S1_1)  = -0.3611
    C(K_1, S1_2)  = +0.3161
    C(S1_1, S0_2) = -0.2259
    C(S0_1, S1_2) = -0.2155
    C(S1_1, S1_2) = -0.1141

[22]:

[<matplotlib.lines.Line2D at 0x7f52802ebbd0>,
 <matplotlib.lines.Line2D at 0x7f528031ed90>]

[23]:

ci = lmfit.conf_interval(gmini, gres)
print(lmfit.ci_report(ci, with_offset=False, ndigits=2))

      99.73% 95.45% 68.27% _BEST_ 68.27% 95.45% 99.73%
 K_1 :   7.77   7.90   7.99   8.07   8.15   8.25   8.38
 S0_1:20066.1223118.2625069.3726601.3528053.8229696.8331876.24
 S1_1:51504.2152593.4753374.3654034.5854699.1855496.7856630.81
 S0_2:20096.2422163.6223716.0825084.4226523.5528350.2131192.55
 S1_2:-1078.82  36.05 820.171473.582122.782890.883962.77

To plot ci for the 5 parameters.

[24]:

fig, axes = plt.subplots(1, 4, figsize=(24.2, 4.8), sharey=True)
cx, cy, grid = lmfit.conf_interval2d(gmini, gres, "S0_1", "K_1", 25, 25)
ctp = axes[0].contourf(cx, cy, grid, np.linspace(0, 1, 11))
fig.colorbar(ctp, ax=axes[0])
axes[0].set_xlabel("SA1")
axes[0].set_ylabel("pK1")
cx, cy, grid = lmfit.conf_interval2d(gmini, gres, "S0_2", "K_1", 25, 25)
ctp = axes[1].contourf(cx, cy, grid, np.linspace(0, 1, 11))
fig.colorbar(ctp, ax=axes[1])
axes[1].set_xlabel("SA2")
axes[1].set_ylabel("pK1")
cx, cy, grid = lmfit.conf_interval2d(gmini, gres, "S1_1", "K_1", 25, 25)
ctp = axes[2].contourf(cx, cy, grid, np.linspace(0, 1, 11))
fig.colorbar(ctp, ax=axes[2])
axes[2].set_xlabel("SB1")
axes[2].set_ylabel("pK1")
cx, cy, grid = lmfit.conf_interval2d(gmini, gres, "S1_2", "K_1", 25, 25)
ctp = axes[3].contourf(cx, cy, grid, np.linspace(0, 1, 11))
fig.colorbar(ctp, ax=axes[3])
axes[3].set_xlabel("SB2")
axes[3].set_ylabel("pK1")

[24]:

Text(0, 0.5, 'pK1')

[25]:

plt.plot(np.r_[df.x, df.x], gres.residual, "o")

[25]:

[<matplotlib.lines.Line2D at 0x7f527bfdf950>]

1.4.3.1. emcee#

[26]:

gmini.params.add("__lnsigma", value=np.log(0.1), min=np.log(0.001), max=np.log(20000))
gresMC = lmfit.minimize(
    gobjective,
    method="emcee",
    nan_policy="omit",
    params=gmini.params,
    args=([df.x, df.x], [df.y1, df.y2], [titration_pH, titration_pH]),
)

100%|██████████| 1000/1000 [02:48<00:00,  5.95it/s]

The chain is shorter than 50 times the integrated autocorrelation time for 6 parameter(s). Use this estimate with caution and run a longer chain!
N/50 = 20;
tau: [36.71359231 44.74326834 40.73146614 39.35657715 43.99983353 67.46082702]

This next block comes from: https://lmfit.github.io/lmfit-py/examples/example_emcee_Model_interface.html?highlight=emcee

[27]:

plt.plot(gresMC.acceptance_fraction, "o")
plt.xlabel("walker")
plt.ylabel("acceptance frac")

[27]:

Text(0, 0.5, 'acceptance frac')

[28]:

import corner

tr = [v for v in gresMC.params.valuesdict().values()]
emcee_plot = corner.corner(gresMC.flatchain, labels=gresMC.var_names, truths=tr[:-1])

WARNING:root:Pandas support in corner is deprecated; use ArviZ directly
WARNING:root:Too few points to create valid contours

[29]:

lmfit.report_fit(gresMC.params)

[[Variables]]
    K_1:        8.07254988 +/- 6.3733e-05 (0.00%) (init = 8.07255)
    S0_1:       26601.3509 +/- 1.20402118 (0.00%) (init = 26601.35)
    S1_1:       54034.5743 +/- 0.56681339 (0.00%) (init = 54034.58)
    K_2:        8.07254988 == 'K_1'
    S0_2:       25084.4236 +/- 1.14204011 (0.00%) (init = 25084.42)
    S1_2:       1473.59159 +/- 0.51596582 (0.04%) (init = 1473.579)
    __lnsigma: -1.63343046 +/- 4.83356099 (295.91%) (init = -2.302585)
[[Correlations]] (unreported correlations are < 0.100)
    C(K_1, S0_1)  = -0.4722
    C(K_1, S0_2)  = +0.4618
    C(S0_1, S0_2) = -0.3722
    C(K_1, S1_1)  = -0.2414
    C(K_1, S1_2)  = +0.2090
    C(S0_1, S1_2) = -0.1930
    C(S1_1, S0_2) = -0.1591

[30]:

highest_prob = np.argmax(gresMC.lnprob)
hp_loc = np.unravel_index(highest_prob, gresMC.lnprob.shape)
mle_soln = gresMC.chain[hp_loc]
for i, par in enumerate(pg):
    pg[par].value = mle_soln[i]

header = "\nMaximum Likelihood Estimation from emcee"
line = "-------------------------------------------------"
format_string = "{:<5s} {:>11s} {:>11s} {:>11s}"

print(f"{header}\n{line}")
print(format_string.format("Parameter", "MLE Value", "Median Value", "Uncertainty"))

for name, param in pg.items():
    mle_value = f"{param.value:.5f}"
    median_value = f"{gresMC.params[name].value:.5f}"
    uncertainty = (
        "None"
        if gresMC.params[name].stderr is None
        else f"{gresMC.params[name].stderr:.5f}"
    )
    print(format_string.format(name, mle_value, median_value, uncertainty))


Maximum Likelihood Estimation from emcee
-------------------------------------------------
Parameter   MLE Value Median Value Uncertainty
K_1       8.07255     8.07255     0.00006
S0_1  26601.18265 26601.35094     1.20402
S1_1  54034.56651 54034.57435     0.56681
K_2       8.07255     8.07255        None
S0_2   1473.64690 25084.42361     1.14204
S1_2     -5.59259  1473.59159     0.51597

[31]:

print("\nError estimates from emcee:")
print("------------------------------------------------------")
print("Parameter  -2sigma  -1sigma   median  +1sigma  +2sigma")

format_string = "  {:5s}   {:8.4f} {:8.4f} {:8.4f} {:8.4f} {:8.4f}"

for name in pg.keys():
    if name in gresMC.flatchain:
        quantiles = np.percentile(
            gresMC.flatchain[name], [2.275, 15.865, 50, 84.135, 97.275]
        )
        median = quantiles[2]
        errors = quantiles - median
        print(format_string.format(name, *errors))
    else:
        print(f"Key '{name}' not found in gresMC.flatchain.")


Error estimates from emcee:
------------------------------------------------------
Parameter  -2sigma  -1sigma   median  +1sigma  +2sigma
  K_1      -0.0001  -0.0001   0.0000   0.0001   0.0001
  S0_1     -2.4692  -1.2159   0.0000   1.1926   2.2931
  S1_1     -1.1518  -0.5598   0.0000   0.5741   1.1041
Key 'K_2' not found in gresMC.flatchain.
  S0_2     -2.2575  -1.1402   0.0000   1.1445   2.2271
  S1_2     -1.0474  -0.5197   0.0000   0.5125   1.0101

[32]:

emcee_params = gmini.params.copy()
emcee_params.add("__lnsigma", value=np.log(0.1), min=np.log(0.001), max=np.log(2.0))

mi = lmfit.Minimizer(
    gobjective,
    emcee_params,
    fcn_args=([df.x, df.x], [df.y1, df.y2], [titration_pH, titration_pH]),
)

res_emcee = mi.minimize(method="emcee", steps=500, burn=50, thin=20, is_weighted=False)

lmfit.report_fit(res_emcee)

100%|██████████| 500/500 [01:28<00:00,  5.64it/s]

The chain is shorter than 50 times the integrated autocorrelation time for 6 parameter(s). Use this estimate with caution and run a longer chain!
N/50 = 10;
tau: [41.43608937 38.26073716 41.64620364 41.45277181 43.73839237 55.06682383]
[[Fit Statistics]]
    # fitting method   = emcee
    # function evals   = 50000
    # data points      = 14
    # variables        = 6
    chi-square         = 3119313.76
    reduced chi-square = 389914.221
    Akaike info crit   = 184.396928
    Bayesian info crit = 188.231272
[[Variables]]
    K_1:        8.07377845 +/- 0.08659606 (1.07%) (init = 8.07255)
    S0_1:       26594.2765 +/- 2344.99613 (8.82%) (init = 26601.35)
    S1_1:       54026.3984 +/- 563.263532 (1.04%) (init = 54034.58)
    K_2:        8.07377845 == 'K_1'
    S0_2:       25089.9684 +/- 948.437394 (3.78%) (init = 25084.42)
    S1_2:       1493.60468 +/- 242.550345 (16.24%) (init = 1473.579)
    __lnsigma:  0.69298263 +/- 0.04693269 (6.77%) (init = -2.302585)
[[Correlations]] (unreported correlations are < 0.100)
    C(K_1, S0_1)       = -0.9674
    C(S0_1, S1_2)      = -0.9653
    C(K_1, S1_1)       = -0.9510
    C(S0_1, S1_1)      = +0.9506
    C(S1_1, S1_2)      = -0.9274
    C(K_1, S1_2)       = +0.9264
    C(K_1, S0_2)       = +0.8727
    C(S1_1, S0_2)      = -0.8094
    C(S0_1, S0_2)      = -0.7885
    C(S0_2, S1_2)      = +0.7313
    C(S1_1, __lnsigma) = +0.2094
    C(S0_1, __lnsigma) = +0.2085
    C(S1_2, __lnsigma) = -0.1999
    C(K_1, __lnsigma)  = -0.1780
    C(S0_2, __lnsigma) = -0.1068

1.4.4. bootstrap con pandas#

[33]:

for i in range(100):
    tdf = pd.DataFrame([(j, i) for i in range(7) for j in range(2)]).sample(
        14, replace=True, ignore_index=False
    )
    df1 = df[["x", "y1"]].iloc[np.array(tdf[tdf[0] == 0][1])]
    df2 = df[["x", "y2"]].iloc[np.array(tdf[tdf[0] == 1][1])]

[34]:

def idx_sample(npoints):
    tidx = []
    for i in range(npoints):
        tidx.append((np.random.randint(2), np.random.randint(7)))
    idx1 = []
    idx2 = []
    for t in tidx:
        if t[0] == 0:
            idx1.append(t[1])
        elif t[0] == 1:
            idx2.append(t[1])
        else:
            raise Exception("Must never occur")
    return idx1, idx2


for i in range(100):
    idx1, idx2 = idx_sample(14)
    df1 = (
        df[["x", "y1"]]
        .iloc[idx1]
        .sort_values(by="x", ascending=False)
        .reset_index(drop=True)
    )
    df2 = (
        df[["x", "y2"]]
        .iloc[idx2]
        .sort_values(by="x", ascending=False)
        .reset_index(drop=True)
    )

[35]:

n_points = len(df)
nboot = 199
np.random.seed(5)
best = lmfit.minimize(
    gobjective,
    pg,
    args=([df.x[1:], df.x], [df.y1[1:], df.y2], [titration_pH, titration_pH]),
)
nb = {k: [] for k in best.params.keys()}

for i in range(nboot):
    idx1, idx2 = idx_sample(13)
    df1 = (
        df[["x", "y1"]]
        .iloc[idx1]
        .sort_values(by="x", ascending=False)
        .reset_index(drop=True)
    )
    df2 = (
        df[["x", "y2"]]
        .iloc[idx2]
        .sort_values(by="x", ascending=False)
        .reset_index(drop=True)
    )
    # boot_idxs = np.random.randint(0, n_points, n_points)
    # df2 = df.iloc[boot_idxs]
    # df2=df2.sort_values(by="x", ascending=False).reset_index(drop=True)
    # # df2.reset_index(drop=True, inplace=True)
    # boot_idxs = np.random.randint(0, n_points, n_points)
    # df3 = df.iloc[boot_idxs]
    # # df3.reset_index(drop=True, inplace=True)
    # df3=df3.sort_values(by="x", ascending=False).reset_index(drop=True)
    try:
        out = lmfit.minimize(
            gobjective,
            best.params,
            args=([df1.x, df2.x], [df1.y1, df2.y2], [titration_pH, titration_pH]),
            calc_covar=False,
            method="leastsq",
            nan_policy="omit",
            scale_covar=False,
        )
        for k, v in out.params.items():
            nb[k].append(v.value)
    except:
        print(df1)
        print(df2)

# print(nb)

[36]:

np.quantile(nb["K_1"], [0.025, 0.5, 0.975])

[36]:

array([7.97738171, 8.07819767, 8.64995708])

[37]:

sb.kdeplot(data=nb, x="K_1", y="S1_2")

[37]:

<Axes: xlabel='K_1', ylabel='S1_2'>

[38]:

# nb.drop("K_2", axis=1, inplace=True)
with sb.axes_style("darkgrid"):
    g = sb.PairGrid(pd.DataFrame(nb), diag_sharey=False, vars=["K_1", "S1_1", "S1_2"])
    g.map_upper(plt.hexbin, bins="log", gridsize=20, cmap="Blues", mincnt=2)
    g.map_lower(sb.kdeplot, cmap="viridis_r", fill=True)
    g.map_diag(sb.histplot, kde=True)

[39]:

sb.violinplot(data=nb, x="K_1", split=True)

[39]:

<Axes: >

[40]:

g = sb.jointplot(
    y="S1_2",
    x="K_1",
    data=nb,
    marker="+",
    s=25,
    marginal_kws=dict(bins=25, fill=False, kde=True),
    color="#2075AA",
    marginal_ticks=True,
    height=5,
    ratio=2,
)
g.plot_joint(sb.kdeplot, color="r", zorder=0, levels=5)

[40]:

<seaborn.axisgrid.JointGrid at 0x7f5281c95b90>

[41]:

g = sb.JointGrid(data=nb, x="K_1", y="S1_2")
g.plot_joint(sb.histplot)
g.plot_marginals(sb.boxplot)

[41]:

<seaborn.axisgrid.JointGrid at 0x7f5282003190>

[42]:

f, (ax_box, ax_hist) = plt.subplots(
    2, sharex=True, gridspec_kw={"height_ratios": (0.25, 0.75)}
)

sb.histplot(data=nb, x="K_1", kde=True, ax=ax_hist)

sb.boxplot(x="K_1", data=nb, whis=[2.5, 97.5], ax=ax_box)
sb.stripplot(x="K_1", data=nb, color=".3", alpha=0.2, ax=ax_box)
ax_box.set(xlabel="")
f.tight_layout()
# ax = sb.violinplot(x="K_1", data=nb, inner=None, color="r")

[43]:

import corner

g = corner.corner(pd.DataFrame(nb)[["K_1", "S1_1", "S1_2"]], labels=list(nb.keys()))

WARNING:root:Pandas support in corner is deprecated; use ArviZ directly

1.4.5. lmfit.Model#

It took 9 vs 5 ms. It is not possible to do global fitting. In the documentation it is stressed the need to convert the output of the residue to be 1D vectors.

[44]:

mod = lmfit.models.ExpressionModel("(SB + SA * 10**(pK-x)) / (1 + 10**(pK-x))")
result = mod.fit(np.array(df.y1), x=np.array(df.x), pK=7, SB=7e3, SA=10000)
print(result.fit_report())

[[Model]]
    Model(_eval)
[[Fit Statistics]]
    # fitting method   = leastsq
    # function evals   = 44
    # data points      = 7
    # variables        = 3
    chi-square         = 12308015.2
    reduced chi-square = 3077003.79
    Akaike info crit   = 106.658958
    Bayesian info crit = 106.496688
    R-squared          = 0.97973543
[[Variables]]
    SB:  26638.9314 +/- 2456.05773 (9.22%) (init = 7000)
    SA:  54043.3812 +/- 979.984193 (1.81%) (init = 10000)
    pK:  8.06960356 +/- 0.14941163 (1.85%) (init = 7)
[[Correlations]] (unreported correlations are < 0.100)
    C(SB, pK) = -0.7750
    C(SA, pK) = -0.4552
    C(SB, SA) = +0.2046

[45]:

plt.plot(df.x, df.y1, "o")
plt.plot(df.x, result.init_fit, "--", label="initial fit")
plt.plot(df.x, result.best_fit, "-", label="best fit")
plt.legend()

[45]:

<matplotlib.legend.Legend at 0x7f527a1d2ad0>

[46]:

print(result.ci_report())

       99.73%    95.45%    68.27%    _BEST_    68.27%    95.45%    99.73%
 SB:-85235.83517-8376.49744-2895.6555126638.93141+2558.68054+5999.21905+12360.51318
 SA:-6192.83614-2734.32819-1098.2423854043.38116+1113.16062+2829.52103+6725.35644
 pK:  -0.98138  -0.40196  -0.15948   8.06960  +0.16277  +0.42592  +1.50919

which is faster but still I failed to find the way to global fitting.

[47]:

def tit_pH(x, S0, S1, K):
    return (S0 + S1 * 10 ** (K - x)) / (1 + 10 ** (K - x))


tit_model1 = lmfit.Model(tit_pH, prefix="ds1_")
tit_model2 = lmfit.Model(tit_pH, prefix="ds2_")
print(f"parameter names: {tit_model1.param_names}")
print(f"parameter names: {tit_model2.param_names}")
print(f"independent variables: {tit_model1.independent_vars}")
print(f"independent variables: {tit_model2.independent_vars}")

tit_model1.set_param_hint("K", value=7.0, min=2.0, max=12.0)
tit_model1.set_param_hint("S0", value=df.y1[0], min=0.0)
tit_model1.set_param_hint("S1", value=df.y1.iloc[-1], min=0.0)
tit_model2.set_param_hint("K", value=7.0, min=2.0, max=12.0)
tit_model2.set_param_hint("S0", value=df.y1[0], min=0.0)
tit_model2.set_param_hint("S1", value=df.y1.iloc[-1], min=0.0)
pars1 = tit_model1.make_params()
pars2 = tit_model2.make_params()
# gmodel = tit_model1 + tit_model2
# result = gmodel.fit(df.y1 + df.y2, pars, x=df.x)
res1 = tit_model1.fit(df.y1, pars1, x=df.x)
res2 = tit_model2.fit(df.y2, pars2, x=df.x)
print(res1.fit_report())
print(res2.fit_report())

parameter names: ['ds1_S0', 'ds1_S1', 'ds1_K']
parameter names: ['ds2_S0', 'ds2_S1', 'ds2_K']
independent variables: ['x']
independent variables: ['x']
[[Model]]
    Model(tit_pH, prefix='ds1_')
[[Fit Statistics]]
    # fitting method   = leastsq
    # function evals   = 25
    # data points      = 7
    # variables        = 3
    chi-square         = 12308015.2
    reduced chi-square = 3077003.79
    Akaike info crit   = 106.658958
    Bayesian info crit = 106.496688
    R-squared          = 0.97973543
[[Variables]]
    ds1_S0:  26638.8440 +/- 2455.92762 (9.22%) (init = 29657)
    ds1_S1:  54043.3607 +/- 979.995185 (1.81%) (init = 51205)
    ds1_K:   8.06961042 +/- 0.14940740 (1.85%) (init = 7)
[[Correlations]] (unreported correlations are < 0.100)
    C(ds1_S0, ds1_K)  = -0.7750
    C(ds1_S1, ds1_K)  = -0.4552
    C(ds1_S0, ds1_S1) = +0.2046
[[Model]]
    Model(tit_pH, prefix='ds2_')
[[Fit Statistics]]
    # fitting method   = leastsq
    # function evals   = 33
    # data points      = 7
    # variables        = 3
    chi-square         = 159980.530
    reduced chi-square = 39995.1326
    Akaike info crit   = 76.2582808
    Bayesian info crit = 76.0960112
    R-squared          = 0.99963719
[[Variables]]
    ds2_S0:  25135.9942 +/- 282.133911 (1.12%) (init = 29657)
    ds2_S1:  1485.53168 +/- 111.549888 (7.51%) (init = 51205)
    ds2_K:   8.07721983 +/- 0.01980096 (0.25%) (init = 7)
[[Correlations]] (unreported correlations are < 0.100)
    C(ds2_S0, ds2_K)  = +0.7768
    C(ds2_S1, ds2_K)  = +0.4545
    C(ds2_S0, ds2_S1) = +0.2051

[48]:

xfit_delta = (df.x.max() - df.x.min()) / 100
xfit = np.arange(df.x.min() - xfit_delta, df.x.max() + xfit_delta, xfit_delta)
dely1 = res1.eval_uncertainty(x=xfit) * 1
dely2 = res2.eval_uncertainty(x=xfit) * 1
best_fit1 = res1.eval(x=xfit)
best_fit2 = res2.eval(x=xfit)
plt.plot(df.x, df.y1, "o")
plt.plot(df.x, df.y2, "o")
plt.plot(xfit, best_fit1, "-.")
plt.plot(xfit, best_fit2, "-.")
plt.fill_between(xfit, best_fit1 - dely1, best_fit1 + dely1, color="#FEDCBA", alpha=0.5)
plt.fill_between(xfit, best_fit2 - dely2, best_fit2 + dely2, color="#FEDCBA", alpha=0.5)

[48]:

<matplotlib.collections.PolyCollection at 0x7f527a0fc590>

Please mind the difference in the uncertainty between the 2 label blocks.

[49]:

def tit_pH2(x, S0_1, S0_2, S1_1, S1_2, K):
    y1 = (S0_1 + S1_1 * 10 ** (K - x)) / (1 + 10 ** (K - x))
    y2 = (S0_2 + S1_2 * 10 ** (K - x)) / (1 + 10 ** (K - x))
    # return y1, y2
    return np.r_[y1, y2]


tit_model = lmfit.Model(tit_pH2)
tit_model.set_param_hint("K", value=7.0, min=2.0, max=12.0)
tit_model.set_param_hint("S0_1", value=df.y1[0], min=0.0)
tit_model.set_param_hint("S0_2", value=df.y2[0], min=0.0)
tit_model.set_param_hint("S1_1", value=df.y1.iloc[-1], min=0.0)
tit_model.set_param_hint("S1_2", value=df.y2.iloc[-1], min=0.0)
pars = tit_model.make_params()
# res = tit_model.fit([df.y1, df.y2], pars, x=df.x)
res = tit_model.fit(np.r_[df.y1, df.y2], pars, x=df.x)
print(res.fit_report())

[[Model]]
    Model(tit_pH2)
[[Fit Statistics]]
    # fitting method   = leastsq
    # function evals   = 37
    # data points      = 14
    # variables        = 5
    chi-square         = 12471473.3
    reduced chi-square = 1385719.25
    Akaike info crit   = 201.798560
    Bayesian info crit = 204.993846
    R-squared          = 0.99794717
[[Variables]]
    S0_1:  26601.3458 +/- 1425.69913 (5.36%) (init = 29657)
    S0_2:  25084.4189 +/- 1337.07982 (5.33%) (init = 22885)
    S1_1:  54034.5806 +/- 627.642479 (1.16%) (init = 51205)
    S1_2:  1473.57871 +/- 616.944649 (41.87%) (init = 1358)
    K:     8.07255029 +/- 0.07600777 (0.94%) (init = 7)
[[Correlations]] (unreported correlations are < 0.100)
    C(S0_1, K)    = -0.6816
    C(S0_2, K)    = +0.6255
    C(S0_1, S0_2) = -0.4264
    C(S1_1, K)    = -0.3611
    C(S1_2, K)    = +0.3161
    C(S0_2, S1_1) = -0.2259
    C(S0_1, S1_2) = -0.2155
    C(S1_1, S1_2) = -0.1141

[50]:

dely = res.eval_uncertainty(x=xfit)
# res.plot() # this return error because of the global fit

[51]:

def fit_pH(fp):
    df = pd.read_csv(fp)

    def tit_pH(x, SA, SB, pK):
        return (SB + SA * 10 ** (pK - x)) / (1 + 10 ** (pK - x))

    mod = lmfit.Model(tit_pH)
    pars = mod.make_params(SA=10000, SB=7e3, pK=7)
    result = mod.fit(df.y2, pars, x=df.x)
    return result, df.y2, df.x, mod


# r,y,x,model = fit_pH("/home/dati/ibf/IBF/Database/Random mutag results/Liasan-analyses/2016-05-19/2014-02-20/pH/dat/C12.dat")
r, y, x, model = fit_pH("../../tests/data/H04.dat")
xfit = np.linspace(x.min(), x.max(), 50)
dely = r.eval_uncertainty(x=xfit) * 1
best_fit = r.eval(x=xfit)
plt.plot(x, y, "o")
plt.plot(xfit, best_fit, "-.")
plt.fill_between(xfit, best_fit - dely, best_fit + dely, color="#FEDCBA", alpha=0.5)
r.conf_interval(sigmas=[2])
print(r.ci_report(with_offset=False, ndigits=2))

    99.73% 95.45% 68.27% _BEST_ 68.27% 95.45% 99.73%
 SA:2338.404511.625450.156052.536642.137512.329321.96
 SB:33406.9634609.8235170.9935544.4435920.0736492.8937756.24
 pK:   6.47   6.60   6.66   6.70   6.74   6.80   6.93

[52]:

g = r.plot()

[53]:

print(r.ci_report())

       99.73%    95.45%    68.27%    _BEST_    68.27%    95.45%    99.73%
 SA:-3714.13150-1540.91238-602.378536052.53269+589.59734+1459.78928+3269.42485
 SB:-2137.47758-934.62678-373.4502035544.44185+375.62906+948.44608+2211.79840
 pK:  -0.23398  -0.10021  -0.03976   6.70123  +0.03971  +0.09989  +0.23227

[54]:

emcee_kws = dict(steps=2000, burn=500, thin=2, is_weighted=False, progress=False)
emcee_params = r.params.copy()
emcee_params.add("__lnsigma", value=np.log(0.1), min=np.log(0.001), max=np.log(2000.0))
result_emcee = model.fit(
    data=y,
    x=x,
    params=emcee_params,
    method="emcee",
    nan_policy="omit",
    fit_kws=emcee_kws,
)

lmfit.report_fit(result_emcee)

The chain is shorter than 50 times the integrated autocorrelation time for 4 parameter(s). Use this estimate with caution and run a longer chain!
N/50 = 40;
tau: [44.76109263 47.08681846 42.75125412 89.09478397]
[[Fit Statistics]]
    # fitting method   = emcee
    # function evals   = 200000
    # data points      = 7
    # variables        = 4
    chi-square         = 3.40256279
    reduced chi-square = 1.13418760
    Akaike info crit   = 2.95033132
    Bayesian info crit = 2.73397192
    R-squared          = 1.00000000
[[Variables]]
    SA:         6045.63837 +/- 595.804798 (9.86%) (init = 6052.533)
    SB:         35540.8754 +/- 365.813058 (1.03%) (init = 35544.44)
    pK:         6.70082176 +/- 0.03946928 (0.59%) (init = 6.701226)
    __lnsigma:  6.29181634 +/- 0.38021005 (6.04%) (init = -2.302585)
[[Correlations]] (unreported correlations are < 0.100)
    C(SA, pK) = +0.7193
    C(SB, pK) = +0.5145
    C(SA, SB) = +0.2075

[55]:

result_emcee.plot_fit()

[55]:

<Axes: title={'center': 'Model(tit_pH)'}, xlabel='x', ylabel='y'>

[56]:

emcee_corner = corner.corner(
    result_emcee.flatchain,
    labels=result_emcee.var_names,
    truths=list(result_emcee.params.valuesdict().values()),
)

WARNING:root:Pandas support in corner is deprecated; use ArviZ directly

[57]:

highest_prob = np.argmax(result_emcee.lnprob)
hp_loc = np.unravel_index(highest_prob, result_emcee.lnprob.shape)
mle_soln = result_emcee.chain[hp_loc]
print("\nMaximum Likelihood Estimation (MLE):")
print("----------------------------------")
for ix, param in enumerate(emcee_params):
    print(f"{param}: {mle_soln[ix]:.3f}")

quantiles = np.percentile(result_emcee.flatchain["pK"], [2.28, 15.9, 50, 84.2, 97.7])
print(f"\n\n1 sigma spread = {0.5 * (quantiles[3] - quantiles[1]):.3f}")
print(f"2 sigma spread = {0.5 * (quantiles[4] - quantiles[0]):.3f}")


Maximum Likelihood Estimation (MLE):
----------------------------------
SA: 6081.043
SB: 35529.945
pK: 6.703
__lnsigma: 5.930


1 sigma spread = 0.040
2 sigma spread = 0.096

1.5. Model: example 2P Cl–ratio#

[60]:

def fit_Rcl(fp):
    df = pd.read_table(fp)

    def R_Cl(cl, R0, R1, Kd):
        return (R1 * cl + R0 * Kd) / (Kd + cl)

    mod = lmfit.Model(R_Cl)
    pars = mod.make_params(R0=0.8, R1=0.05, Kd=10)
    result = mod.fit(df.R, pars, cl=df.cl)
    return result, df.R, df.cl, mod


r, y, x, model = fit_Rcl("../../tests/data/ratio2P.txt")
xfit = np.linspace(x.min(), x.max(), 50)
dely = r.eval_uncertainty(cl=xfit) * 3
best_fit = r.eval(cl=xfit)
plt.plot(x, y, "o")
plt.grid()
plt.plot(xfit, best_fit, "-.")
plt.fill_between(xfit, best_fit - dely, best_fit + dely, color="#FEDCBA", alpha=0.5)
r.conf_interval(sigmas=[2])
print(r.ci_report(with_offset=False, ndigits=2))

    99.73% 95.45% 68.27% _BEST_ 68.27% 95.45% 99.73%
 R0:   0.49   0.58   0.60   0.61   0.62   0.64   0.73
 R1:  -0.30  -0.01   0.03   0.04   0.06   0.09   0.20
 Kd:   2.95  10.09  12.51  13.66  14.91  18.49  59.97

[67]:

emcee_kws = dict(is_weighted=False, progress=False)
emcee_params = r.params.copy()
emcee_params.add(
    "__lnsigma", value=np.log(0.1), min=np.log(0.000001), max=np.log(2000.0)
)
result_emcee = model.fit(
    data=y,
    cl=x,
    params=emcee_params,
    method="emcee",
    nan_policy="omit",
    fit_kws=emcee_kws,
)

lmfit.report_fit(result_emcee)

The chain is shorter than 50 times the integrated autocorrelation time for 4 parameter(s). Use this estimate with caution and run a longer chain!
N/50 = 20;
tau: [39.14220672 38.36658557 38.7322156  67.05237463]
[[Fit Statistics]]
    # fitting method   = emcee
    # function evals   = 100000
    # data points      = 5
    # variables        = 4
    chi-square         = 1.08017724
    reduced chi-square = 1.08017724
    Akaike info crit   = 0.33843612
    Bayesian info crit = -1.22381223
    R-squared          = -6.60071574
[[Variables]]
    R0:         0.60626860 +/- 0.00979297 (1.62%) (init = 0.6071065)
    R1:         0.04307811 +/- 0.01548680 (35.95%) (init = 0.04390399)
    Kd:         13.7802460 +/- 1.41767925 (10.29%) (init = 13.66125)
    __lnsigma: -4.76389554 +/- 0.79262984 (16.64%) (init = -2.302585)
[[Correlations]] (unreported correlations are < 0.100)
    C(R1, Kd)        = -0.8461
    C(R0, Kd)        = -0.5866
    C(R0, R1)        = +0.4901
    C(Kd, __lnsigma) = +0.3430
    C(R1, __lnsigma) = -0.3278
    C(R0, __lnsigma) = -0.2640

[68]:

emcee_corner = corner.corner(
    result_emcee.flatchain,
    labels=result_emcee.var_names,
    truths=list(result_emcee.params.valuesdict().values()),
)

WARNING:root:Pandas support in corner is deprecated; use ArviZ directly
WARNING:root:Too few points to create valid contours
WARNING:root:Too few points to create valid contours
WARNING:root:Too few points to create valid contours

[69]:

highest_prob = np.argmax(result_emcee.lnprob)
hp_loc = np.unravel_index(highest_prob, result_emcee.lnprob.shape)
mle_soln = result_emcee.chain[hp_loc]
print("\nMaximum Likelihood Estimation (MLE):")
print("----------------------------------")
for ix, param in enumerate(emcee_params):
    print(f"{param}: {mle_soln[ix]:.3f}")

quantiles = np.percentile(result_emcee.flatchain["Kd"], [2.28, 15.9, 50, 84.2, 97.7])
print(f"\n\n1 sigma spread = {0.5 * (quantiles[3] - quantiles[1]):.3f}")
print(f"2 sigma spread = {0.5 * (quantiles[4] - quantiles[0]):.3f}")


Maximum Likelihood Estimation (MLE):
----------------------------------
R0: 0.607
R1: 0.044
Kd: 13.649
__lnsigma: -5.596


1 sigma spread = 1.421
2 sigma spread = 7.719