1. Fitting#

Requires:

  • r-4.1.3-1

1.1. Conventions and best approaches#

  • S0 Signal for unbound state

  • S1 Signal for bound state

  • K equilibrium constant (Kd or pKa)

  • order data from unbound to bound (e.g. cl: 0–>150 mM; pH 9–>5)

  • lmfit.Model has very convenient results plot functionalities and the unique possibility to estimate upper lower fitting curves.

  • R nls and nlsboot seems very convenient (Q-Q plt) and fast; nls can perform global fit but nlstools can not.

1.2. Initial imports#

import numpy as np
import scipy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import rpy2
from rpy2.robjects import r
from rpy2.robjects.packages import importr
from rpy2.robjects import globalenv
from rpy2.robjects import pandas2ri
import lmfit

pandas2ri.activate()
%load_ext rpy2.ipython

MASS = importr('MASS')
#r('library(MASS)')
The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython

1.3. Single Cl titration.#

df = pd.read_table("../../tests/data/copyIP.txt")
sb.scatterplot(data=df, x="cl", y="F", hue=df.cl*df.F, palette="crest", s=200, alpha=.8, legend=False)
../_images/f01.png

1.3.1. Using R#

d <- read.delim("../../tests/data/copyIP.txt")
fit = nls(F ~ (S0 + S1 * cl / Kd)/ (1 + cl / Kd), start = list(S0=7e7, S1=0, Kd=12), data=d)
summary(fit)
Formula: F ~ (S0 + S1 * cl/Kd)/(1 + cl/Kd)

Parameters:
    Estimate Std. Error t value Pr(>|t|)
S0 1.341e+07  8.713e+04 153.894 5.08e-12 ***
S1 5.635e+05  1.064e+05   5.296  0.00184 **
Kd 5.832e+01  2.247e+00  25.957 2.16e-07 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 118200 on 6 degrees of freedom

Number of iterations to convergence: 6
Achieved convergence tolerance: 1.608e-06
confint(fit)
Waiting for profiling to be done...
           2.5%        97.5%
S0 1.319762e+07 1.362273e+07
S1 3.009078e+05 8.199737e+05
Kd 5.313128e+01 6.407096e+01
library(nlstools)
set.seed(4)
nb = nlsBoot(fit, niter=999)
summary(nb)
------
Bootstrap statistics
       Estimate   Std. error
S0 1.341194e+07 70877.210905
S1 5.662159e+05 88287.461199
Kd 5.827538e+01     1.834132

------
Median of bootstrap estimates and percentile confidence intervals
         Median         2.5%        97.5%
S0 1.341055e+07 1.328220e+07 1.354591e+07
S1 5.662406e+05 4.108650e+05 7.369380e+05
Kd 5.826980e+01 5.477262e+01 6.188986e+01
plot(nb)
../_images/r_bs.png

1.3.2. using rpy2#

globalenv['Rdf'] = df

fit = rpy2.robjects.r('nls(F ~ (S0 + S1 * cl / Kd)/ (1 + cl / Kd), start = list(S0=7e7, S1=0, Kd=12), data=Rdf) ')
globalenv['rfit'] = fit
print(r.summary(fit))
print(r.confint(fit))
%R print("")
%R print(confint(rfit))
R[write to console]: Waiting for profiling to be done...

Formula: F ~ (S0 + S1 * cl/Kd)/(1 + cl/Kd)

Parameters:
    Estimate Std. Error t value Pr(>|t|)
S0 1.341e+07  8.713e+04 153.894 5.08e-12 ***
S1 5.635e+05  1.064e+05   5.296  0.00184 **
Kd 5.832e+01  2.247e+00  25.957 2.16e-07 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 118200 on 6 degrees of freedom

Number of iterations to convergence: 6
Achieved convergence tolerance: 1.608e-06


[[1.31976181e+07 1.36227292e+07]
 [3.00907829e+05 8.19973684e+05]
 [5.31312807e+01 6.40709639e+01]]
[1] ""
R[write to console]: Waiting for profiling to be done...
           2.5%        97.5%
S0 1.319762e+07 1.362273e+07
S1 3.009078e+05 8.199737e+05
Kd 5.313128e+01 6.407096e+01
array([[1.31976181e+07, 1.36227292e+07],
       [3.00907829e+05, 8.19973684e+05],
       [5.31312807e+01, 6.40709639e+01]])

With older versions Rpy2 output looked nicer

print(MASS.confint_nls(fit, 'Kd'))
print(rpy2.robjects.r('summary(rfit)'))
R[write to console]: Waiting for profiling to be done...
[53.13128065 64.07096394]

Formula: F ~ (S0 + S1 * cl/Kd)/(1 + cl/Kd)

Parameters:
    Estimate Std. Error t value Pr(>|t|)
S0 1.341e+07  8.713e+04 153.894 5.08e-12 ***
S1 5.635e+05  1.064e+05   5.296  0.00184 **
Kd 5.832e+01  2.247e+00  25.957 2.16e-07 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 118200 on 6 degrees of freedom

Number of iterations to convergence: 6
Achieved convergence tolerance: 1.608e-06
nlstools = importr('nlstools')
base = importr('base')
base.set_seed(4)
nb = nlstools.nlsBoot(fit, niter=999)
globalenv['nb'] = nb
globalenv['fit'] = fit
%%R
plot(nb)
summary(nb)
------
Bootstrap statistics
       Estimate   Std. error
S0 1.341194e+07 70877.210905
S1 5.662159e+05 88287.461199
Kd 5.827538e+01     1.834132

------
Median of bootstrap estimates and percentile confidence intervals
         Median         2.5%        97.5%
S0 1.341055e+07 1.328220e+07 1.354591e+07
S1 5.662406e+05 4.108650e+05 7.369380e+05
Kd 5.826980e+01 5.477262e+01 6.188986e+01
../_images/rpy_bs.png

1.3.3. lmfit#

import lmfit

def residual(pars, x, y=None):
    S0 =  pars['S0']
    S1 =  pars['S1']
    Kd = pars['Kd']
    model = (S0 + S1 * x / Kd) / (1 + x / Kd)
    if y is None:
        return model
    return (y - model)

params = lmfit.Parameters()
params.add('S0', value=df.F[0])
params.add('S1', value=100)
params.add('Kd', value=50, vary=True)

out = lmfit.minimize(residual, params, args=(df.cl, df.F,))

xdelta = (df.cl.max() - df.cl.min()) / 500
xfit = np.arange(df.cl.min() - xdelta, df.cl.max() + xdelta, xdelta)
yfit = residual(out.params, xfit)
print(lmfit.fit_report(out.params))
plt.plot(df.cl, df.F, "o", xfit, yfit, "-")
[[Variables]]
    S0:  13408867.7 +/- 87130.4207 (0.65%) (init = 1.33579e+07)
    S1:  563536.896 +/- 106411.773 (18.88%) (init = 100)
    Kd:  58.3187813 +/- 2.24670302 (3.85%) (init = 50)
[[Correlations]] (unreported correlations are < 0.100)
    C(S1, Kd) = -0.712
    C(S0, Kd) = -0.656
    C(S0, S1) = 0.275
../_images/lmfit1.png
import lmfit
def residuals(p):
    S0 =  p['S0']
    S1 =  p['S1']
    Kd = p['Kd']
    model = (S0 + S1 * df.cl / Kd) / (1 + df.cl / Kd)
    return (model - df.F)

mini = lmfit.Minimizer(residuals, params)
res = mini.minimize()
ci, tr = lmfit.conf_interval(mini, res, sigmas=[.68, .95], trace=True)
print(lmfit.ci_report(ci, with_offset=False, ndigits=2))
print(lmfit.fit_report(res, show_correl=False, sort_pars=True))
    95.00% 68.00% _BEST_ 68.00% 95.00%
 S0:13197616.3413314946.3513408867.6813503300.3813622729.13
 S1:300911.47447991.63563536.93677869.66819977.61
 Kd:  53.13  55.96  58.32  60.79  64.07
[[Fit Statistics]]
    # fitting method   = leastsq
    # function evals   = 17
    # data points      = 9
    # variables        = 3
    chi-square         = 8.3839e+10
    reduced chi-square = 1.3973e+10
    Akaike info crit   = 212.594471
    Bayesian info crit = 213.186145
[[Variables]]
    Kd:  58.3187808 +/- 2.24670301 (3.85%) (init = 50)
    S0:  13408867.7 +/- 87130.4216 (0.65%) (init = 1.33579e+07)
    S1:  563536.932 +/- 106411.771 (18.88%) (init = 100)
names = res.params.keys()
i = 0
gs = plt.GridSpec(4, 4)
sx = {}
sy = {}
for fixed in names:
    j = 0
    for free in names:
        if j in sx and i in sy:
            ax = plt.subplot(gs[i, j], sharex=sx[j], sharey=sy[i])
        elif i in sy:
            ax = plt.subplot(gs[i, j], sharey=sy[i])
            sx[j] = ax
        elif j in sx:
            ax = plt.subplot(gs[i, j], sharex=sx[j])
            sy[i] = ax
        else:
            ax = plt.subplot(gs[i, j])
            sy[i] = ax
            sx[j] = ax
        if i < 3:
            plt.setp(ax.get_xticklabels(), visible=True)
        else:
            ax.set_xlabel(free)

        if j > 0:
            plt.setp(ax.get_yticklabels(), visible=False)
        else:
            ax.set_ylabel(fixed)

        rest = tr[fixed]
        prob = rest['prob']
        f = prob < 0.96

        x, y = rest[free], rest[fixed]
        ax.scatter(x[f], y[f], c=1-prob[f], s=25*(1-prob[f]+0.5))
        ax.autoscale(1, 1)
        j += 1
    i += 1
../_images/lmfit2.png
names = list(res.params.keys())

plt.figure()
for i in range(3):
    for j in range(3):
        indx = 9-j*3-i
        ax = plt.subplot(3, 3, indx)
        ax.ticklabel_format(style='sci', scilimits=(-2, 2), axis='y')

        # set-up labels and tick marks
        ax.tick_params(labelleft=False, labelbottom=False)
        if indx in (1, 4, 7):
            plt.ylabel(names[j])
            ax.tick_params(labelleft=True)
        if indx == 1:
            ax.tick_params(labelleft=True)
        if indx in (7, 8, 9):
            plt.xlabel(names[i])
            ax.tick_params(labelbottom=True)
            [label.set_rotation(45) for label in ax.get_xticklabels()]

        if i != j:
            x, y, m = lmfit.conf_interval2d(mini, res, names[i], names[j], 20, 20)
            plt.contourf(x, y, m, np.linspace(0, 1, 10))

            x = tr[names[i]][names[i]]
            y = tr[names[i]][names[j]]
            pr = tr[names[i]]['prob']
            s = np.argsort(x)
            plt.scatter(x[s], y[s], c=pr[s], s=30, lw=1)

        else:
            x = tr[names[i]][names[i]]
            y = tr[names[i]]['prob']

            t, s = np.unique(x, True)
            f = scipy.interpolate.interp1d(t, y[s], 'slinear')
            xn = np.linspace(x.min(), x.max(), 50)
            plt.plot(xn, f(xn), lw=1)
            plt.ylabel('prob')
            ax.tick_params(labelleft=True)

plt.tight_layout()
../_images/lmfit3.png
lmfit.report_fit(out.params, min_correl=0.25)

ci, trace = lmfit.conf_interval(mini, res, sigmas=[1, 2], trace=True)
lmfit.printfuncs.report_ci(ci)

fig, axes = plt.subplots(2, 2, figsize=(12.8, 9.6), sharey=True)
cx1, cy1, prob = trace['S0']['S0'], trace['S0']['Kd'], trace['S0']['prob']
cx2, cy2, prob2 = trace['S1']['S1'], trace['S1']['Kd'], trace['S1']['prob']

axes[0][0].scatter(cx1, cy1, c=prob, s=30)
axes[0][0].set_xlabel('S0')
axes[0][0].set_ylabel('Kd')

axes[0][1].scatter(cx2, cy2, c=prob2, s=30)
axes[0][1].set_xlabel('S1')

cx, cy, grid = lmfit.conf_interval2d(mini, res, 'S0', 'Kd', 30, 30)
ctp = axes[1][0].contourf(cx, cy, grid, np.linspace(0, 1, 11))
fig.colorbar(ctp, ax=axes[1][0])
axes[1][0].set_xlabel('S0')
axes[1][0].set_ylabel('Kd')

cx, cy, grid = lmfit.conf_interval2d(mini, res, 'S1', 'Kd', 30, 30)
ctp = axes[1][1].contourf(cx, cy, grid, np.linspace(0, 1, 11))
fig.colorbar(ctp, ax=axes[1][1])
axes[1][1].set_xlabel('S1')
axes[1][1].set_ylabel('Kd')
x, y, grid = lmfit.conf_interval2d(mini, res, 'S0','S1', 30, 30)
plt.contourf(x, y, grid, np.linspace(0,1,11))
plt.xlabel('S0')
plt.colorbar()
plt.ylabel('S1')
../_images/lmfit5.png
[[Variables]]
    S0:  13408867.7 +/- 87130.4207 (0.65%) (init = 1.33579e+07)
    S1:  563536.896 +/- 106411.773 (18.88%) (init = 100)
    Kd:  58.3187813 +/- 2.24670302 (3.85%) (init = 50)
[[Correlations]] (unreported correlations are < 0.250)
    C(S1, Kd) = -0.712
    C(S0, Kd) = -0.656
    C(S0, S1) = 0.275
       95.45%    68.27%    _BEST_    68.27%    95.45%
 S0:-217226.57750-94491.3408813408867.68157+95008.90246+219989.07757
 S1:-270193.18488-116251.92509563536.93239+115024.55042+263650.95181
 Kd:  -5.32812  -2.37783  58.31878  +2.48963  +5.92424
../_images/lmfit4.png

1.4. Notes#

You could implement global fitting using scipy.leastq but will sometime fail in bootstrapping. lmfit resulted much more robust

def fit_pH_global(fz, x, dy1, dy2):
    """Fit 2 dataset (x, y1, y2) with a single protonation site model
    """
    y1 = np.array(dy1)
    y2 = np.array(dy2)

    def ssq(p, x, y1, y2):
        return np.r_[y1 - fz(p[0], p[1:3], x), y2 - fz(p[0], p[3:5], x)]
    p0 = np.r_[x[2], y1[0], y1[-1], y2[0], y2[-1]]
    p, cov, info, msg, success = optimize.leastsq(ssq, p0, args=(x, y1, y2),
                                                  full_output=True, xtol=1e-11)
    res = namedtuple("Result", "success msg df chisqr K sK SA_1 sSA_1 \
                     SB_1 sSB_1 SA_2 sSA_2 SB_2 sSB_2")
    res.msg = msg
    res.success = success
    if 1 <= success <= 4:
        chisq = sum(info['fvec'] * info['fvec'])
        res.df = len(y1) + len(y2) - len(p)
        res.chisqr = chisq / res.df
        res.K = p[0]
        #res.sK = np.sqrt(cov[0][0] * res.chisqr)
        res.SA_1 = p[1]
        #res.sSA_1 = np.sqrt(cov[1][1] * res.chisqr)
        res.SB_1 = p[2]
        #res.sSB_1 = np.sqrt(cov[2][2] * res.chisqr)
        res.SA_2 = p[3]
        #res.sSA_2 = np.sqrt(cov[3][3] * res.chisqr)
        res.SB_2 = p[4]
        #res.sSB_2 = np.sqrt(cov[4][4] * res.chisqr)
    return res

result = fit_pH_global(fz, df.x, df.y1, df.y2)