Source code for fennomix_mhc.tda_fmm

from typing import Any, Optional

import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import gamma as dgamma

# Constants
BIC_Max: float = 1e100
Div_Zero: float = 1e-50
Max_Iter: int = 30



[docs]
def gauss_pdf(
    X: np.ndarray | list[float] | float, u: float, sigma: float
) -> np.ndarray:
    """Calculate Gaussian probability density function values for input array X.

    Args:
        X: Input array of shape (n,) representing scores.
        u: Mean (mu) of the Gaussian distribution.
        sigma: Standard deviation (sigma) of the Gaussian distribution.

    Returns:
        Array of PDF values with same shape as X.
    """
    return 1 / (np.sqrt(2 * np.pi) * sigma) * np.exp(-0.5 * np.square((X - u) / sigma))




[docs]
def gamma_pdf(
    X: np.ndarray | list[float] | float, u: float, sigma: float
) -> np.ndarray:
    """Calculate Gamma probability density function values using mean and std.

    The shape and scale parameters are derived from mean (u) and std (sigma).

    Args:
        X: Input array of shape (n,) representing scores.
        u: Mean of the distribution.
        sigma: Standard deviation of the distribution.

    Returns:
        Array of Gamma PDF values with same shape as X.
    """
    a = (u / sigma) ** 2
    scale = sigma**2 / u
    return dgamma.pdf(X, a=a, scale=scale)




[docs]
class TDA_fmm:
    """Finite Mixture Model (FMM) for Target-Decoy Analysis (TDA).

    This class estimates score distributions using a mixture of Gaussians.
    It supports modeling both target and decoy distributions, where the decoy
    model can be incorporated as an external component in the target model.

    Attributes:
        n_components: Number of Gaussian components in the mixture.
        external_model: Optional fitted decoy model (for target modeling).
        max_iter: Maximum number of EM iterations.
        main_pdf: PDF function used for the first component.
        helper_pdf: PDF function used for other components.
        weights: Learned mixture weights (pi_k).
        mu: Learned means for each component.
        sigma: Learned standard deviations for each component.
    """


[docs]
    def __init__(
        self, n_components: int, external_model: Optional["TDA_fmm"] = None
    ) -> None:
        """Initializes the TDA_fmm model.

        Args:
            n_components: Number of Gaussian components in the mixture.
            external_model: Pre-fitted decoy model. If None, models decoy;
                           if provided, models target with decoy as a component.
        """
        self.n_components: int = n_components
        self.external_model: "TDA_fmm" | None = external_model
        self.max_iter: int = Max_Iter
        if external_model is None:
            self.main_pdf: Any = (
                gauss_pdf  # Callable[[ArrayLike, float, float], np.ndarray]
            )
        else:
            self.main_pdf: Any = gauss_pdf
        self.helper_pdf: Any = gauss_pdf

        # Initialize parameters; set to None until fit()
        self.weights: np.ndarray | None = None
        self.mu: np.ndarray | None = None
        self.sigma: np.ndarray | None = None


    def __call__(self, X: np.ndarray | list[float]) -> np.ndarray:
        return self.pdf_mix(X)


[docs]
    def pep(
        self,
        X: np.ndarray | list[float],
        external_pdf: np.ndarray | None = None,
    ) -> np.ndarray:
        """Estimates Posterior Error Probabilities (PEP).

        PEP = pi0 * f_decoy(x) / f_mixture(x)

        Args:
            X: Input scores.
            external_pdf: Optional precomputed PDF values from external model.
                          If None and external_model exists, it will be computed.

        Returns:
            Array of PEP values for each score in X. Returns zeros if model not fitted.
        """
        if self.weights is None or self.external_model is None:
            return np.zeros(len(X))
        if external_pdf is None:
            external_pdf = self.external_model.pdf(X)
        return self.weights[-1] * external_pdf / self.pdf_mix(X, external_pdf)



[docs]
    def get_pi0(self) -> float:
        """Returns the estimated proportion of decoy (null) components in the mixture.

        Returns:
            pi0 value (between 0 and 1). Returns 0 if model not fitted or no external model.
        """
        if self.weights is None or self.external_model is None:
            return 0.0
        else:
            return float(self.weights[-1])



[docs]
    def pdf(self, X: np.ndarray | list[float]) -> np.ndarray:
        """Computes the PDF of the main mixture components (excluding external model).

        Args:
            X: Input scores of shape (n,).

        Returns:
            PDF values of shape (n,). Returns zeros if model not fitted.
        """
        if self.weights is None:
            return np.zeros(len(X))
        X_arr: np.ndarray = np.array(X)
        n_samples: int = X_arr.shape[0]
        pdf: np.ndarray = np.zeros((self.n_components, n_samples))
        for i in range(0, self.n_components):
            if i == 0:
                pdf[i, :] = self.main_pdf(X_arr, u=self.mu[i], sigma=self.sigma[i])
            else:
                pdf[i, :] = self.helper_pdf(X_arr, u=self.mu[i], sigma=self.sigma[i])
        return np.dot(self.weights[: self.n_components], pdf)



[docs]
    def pdf_mix(
        self,
        X: np.ndarray | list[float],
        external_pdf: np.ndarray | None = None,
    ) -> np.ndarray:
        """Computes the full mixture PDF, including external model if present.

        f_mixture(x) = pi0 * f_decoy(x) + (1-pi0) * f_target(x)

        Args:
            X: Input scores.
            external_pdf: Optional precomputed PDF values from external model.

        Returns:
            Mixture PDF values. Returns zeros if model not fitted.
        """
        if self.weights is None:
            return np.zeros(len(X))
        X_arr: np.ndarray = np.array(X)
        if self.external_model is not None:
            if external_pdf is None:
                external_pdf = self.external_model.pdf(X_arr)
            pdf0: np.ndarray = external_pdf * self.get_pi0()
            pdf1: np.ndarray = self.pdf(X_arr) * (1 - self.get_pi0())
            return pdf0 + pdf1
        else:
            return self.pdf(X_arr)



[docs]
    def loglik_BIC(self, X: np.ndarray | list[float]) -> tuple[float, float]:
        """Computes log-likelihood and Bayesian Information Criterion (BIC).

        BIC = -2 * loglik + num_params * log(n)

        Args:
            X: Input scores.

        Returns:
            A tuple of (log-likelihood, BIC). Returns (0, 0) if model not fitted.
        """
        if self.weights is None:
            return 0.0, 0.0
        _has_external: int = 1 if self.external_model is not None else 0
        loglik: float = float(np.sum(np.log(self.pdf(X))))
        n: int = len(X)
        BIC: float = -2 * loglik + (self.n_components + _has_external) * np.log(n)
        return loglik, BIC



[docs]
    def plot(
        self,
        title: str,
        plot_scores: np.ndarray | list[float],
        false_scores: np.ndarray | list[float] | None = None,
    ) -> None:
        """Plots the fitted mixture model against histogram of scores.

        If an external model exists and false_scores are provided, plots:
          - Decoy model (external)
          - Target histogram + mixture fit
          - Separated true and false components

        Otherwise, plots only the decoy model fit.

        Args:
            title: Title prefix for plots.
            plot_scores: Scores to plot (e.g., target scores).
            false_scores: Optional decoy scores for comparison.
        """
        binsize: int = 40
        lw: float = 1.0

        if self.external_model is not None and false_scores is not None:
            self.external_model.plot(title, false_scores)
            scores: np.ndarray = np.array(plot_scores)
            binsize *= 2

            step: float = 0.001
            plt.figure()
            pp = plt.subplot()

            n, bins, patches = pp.hist(
                scores, binsize, density=1, facecolor="green", alpha=0.75
            )
            bins_arr: np.ndarray = np.arange(scores.min(), scores.max(), step)
            dis: np.ndarray = self.pdf_mix(bins_arr)
            ratio: float = np.mean(n) / np.mean(dis)
            dis = dis * ratio
            pp.plot(bins_arr, dis, "r--", linewidth=lw)
            pp.set_title(title + " target")

            # plot false and true distribution separately
            plt.figure()
            pp = plt.subplot()
            dis_false: np.ndarray = self.external_model.pdf(bins_arr)
            dis_true: np.ndarray = self.pdf(bins_arr)
            pp.hist(scores, binsize, density=1, facecolor="green", alpha=0.75)
            dis_false = dis_false * ratio * self.get_pi0()
            dis_true = dis_true * ratio * (1 - self.get_pi0())
            pp.plot(bins_arr, dis_true, "b--", linewidth=lw)
            pp.plot(bins_arr, dis_false, "r--", linewidth=lw)
            pp.set_title(
                title
                + rf" target mixture ($\pi_0$={self.get_pi0():.2f}, $\pi_1$={1 - self.get_pi0():.2f})"
            )

        else:
            scores: np.ndarray = np.array(plot_scores)

            step: float = 0.1
            plt.figure()
            pp = plt.subplot()

            n, bins, patches = pp.hist(
                scores, binsize, density=1, facecolor="blue", alpha=0.75
            )
            bins_arr: np.ndarray = np.arange(scores.min(), scores.max(), step)
            dis: np.ndarray = self.pdf(bins_arr)
            dis = dis * np.mean(n) / np.mean(dis)
            pp.plot(bins_arr, dis, "r--", linewidth=lw)
            pp.set_title(title + " decoy")



[docs]
    def fit(self, X: np.ndarray | list[float]) -> None:
        """Fits the FMM model using Expectation-Maximization (EM) algorithm.

        Args:
            X: Input scores to fit the model on.
        """
        if len(X) < 10:
            self.weights = None
            return
        X_arr: np.ndarray = np.array(X)
        n_components: int = self.n_components

        # Initialize mu and sigma
        self.mu = np.min(X_arr) + (np.max(X_arr) - np.min(X_arr)) / (
            n_components + 1
        ) * np.array(range(1, n_components + 1))
        self.sigma = np.ones(n_components) * np.var(X_arr)

        if self.external_model is not None:
            d0: np.ndarray = self.external_model.pdf(X_arr)
            _has_external: int = 1
        else:
            _has_external = 0

        self.weights = np.ones(n_components + _has_external) / (
            n_components + _has_external
        )

        post_prob: np.ndarray = np.zeros((n_components + _has_external, X_arr.shape[0]))

        for __iter in range(self.max_iter):
            dens: np.ndarray = np.zeros((n_components + _has_external, X_arr.shape[0]))

            if self.external_model is not None:
                dens[-1, :] = d0

            for i in range(0, n_components):
                if i == 0:
                    dens[i, :] = self.main_pdf(X_arr, u=self.mu[i], sigma=self.sigma[i])
                else:
                    dens[i, :] = self.helper_pdf(
                        X_arr, u=self.mu[i], sigma=self.sigma[i]
                    )

            total_prob: np.ndarray = np.dot(self.weights, dens)
            total_prob[total_prob <= 0] = Div_Zero  # Avoid division by zero

            for i in range(n_components + _has_external):
                post_prob[i, :] = self.weights[i] * dens[i, :] / total_prob

            sum_prob: np.ndarray = np.sum(post_prob, axis=1)
            sum_prob[sum_prob == 0] = 1e-12

            new_weights: np.ndarray = sum_prob / np.sum(sum_prob)

            new_mu: np.ndarray = (
                np.dot(post_prob[: self.mu.shape[0], :], X_arr)
                / sum_prob[: self.mu.shape[0]]
            )
            new_sigma: np.ndarray = self.sigma.copy()
            for i in range(n_components):
                new_sigma[i] = np.sqrt(
                    np.dot(post_prob[i, :], np.square(X_arr - self.mu[i])) / sum_prob[i]
                )

            if (
                np.any(np.isnan(new_sigma))
                or np.any(np.isnan(new_mu))
                or np.any(np.isinf(new_sigma))
                or np.any(np.isinf(new_mu))
                or np.any(np.array(new_sigma) <= Div_Zero)
                or np.any(np.array(new_mu) <= Div_Zero)
            ):
                break

            self.weights = new_weights
            self.mu = new_mu
            self.sigma = new_sigma





[docs]
class DecoyModel(TDA_fmm):
    """A simplified model for fitting decoy score distributions.

    Uses a single Gaussian, optionally filtering outliers using sigma threshold.
    """


[docs]
    def __init__(
        self, gaussian_outlier_sigma: float | None, *args: Any, **kwargs: Any
    ) -> None:
        """Initializes the decoy model.

        Args:
            gaussian_outlier_sigma: If provided, scores below (mu - sigma * gaussian_outlier_sigma)
                                    are filtered before fitting.
            *args, **kwargs: Ignored, for compatibility.
        """
        super().__init__(n_components=1)
        self.external_model: TDA_fmm | None = None
        self.weights: np.ndarray = np.array([1.0])
        self.gaussian_outlier_sigma: float | None = gaussian_outlier_sigma
        self.mu: float | None = None
        self.sigma: float | None = None



[docs]
    def fit(self, X: np.ndarray | list[float]) -> None:
        """Fits a single Gaussian to the decoy scores.

        Optionally filters left-tail outliers before fitting.

        Args:
            X: Decoy scores.
        """
        X_arr: np.ndarray = np.array(X)
        self.mu = float(np.mean(X_arr))
        self.sigma = float(np.std(X_arr))

        if self.gaussian_outlier_sigma is not None:
            threshold: float = self.mu - self.gaussian_outlier_sigma * self.sigma
            X_filtered: np.ndarray = X_arr[X_arr > threshold]
            if len(X_filtered) > 0:
                self.mu = float(np.mean(X_filtered))
                self.sigma = float(np.std(X_filtered))



[docs]
    def pdf(self, X: np.ndarray | list[float]) -> np.ndarray:
        """Computes Gaussian PDF for given scores.

        Args:
            X: Input scores.

        Returns:
            PDF values.
        """
        X_arr: np.ndarray = np.array(X)
        return gauss_pdf(X_arr, self.mu, self.sigma)





[docs]
def select_best_fmm(
    target_scores: np.ndarray | list[float],
    decoy_fmm: "DecoyModel",
    _max_component_: int = 3,
    verbose: bool = True,
) -> TDA_fmm:
    """Selects the best TDA_fmm model by BIC criterion.

    Fits models with 1 to `_max_component_` components and selects the one with lowest BIC.

    Args:
        target_scores: Scores to fit the target model on.
        decoy_fmm: Pre-fitted decoy model.
        _max_component_: Maximum number of components to try.
        verbose: Whether to print progress.

    Returns:
        Best-fitted TDA_fmm model (target model).
    """
    best_target_BIC: float = BIC_Max
    best_fmm: TDA_fmm | None = None
    for n_component in range(1, _max_component_ + 1):
        target_fmm = TDA_fmm(n_component, external_model=decoy_fmm)
        target_fmm.fit(target_scores)
        log_lik, BIC = target_fmm.loglik_BIC(target_scores)
        if best_target_BIC > BIC:
            best_target_BIC = BIC
            best_fmm = target_fmm
        if verbose:
            print(
                f"[Target FMM] G={n_component:d}, BIC={BIC:f}, best BIC={best_target_BIC:f}"
            )
    assert best_fmm is not None, "No valid model was selected"
    return best_fmm