Source code for fennomix_mhc.tda_fmm

from typing import Any, Optional

import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import gamma as dgamma

# Constants
BIC_Max: float = 1e100
Div_Zero: float = 1e-50
Max_Iter: int = 30


[docs] def gauss_pdf( X: np.ndarray | list[float] | float, u: float, sigma: float ) -> np.ndarray: """Calculate Gaussian probability density function values for input array X. Args: X: Input array of shape (n,) representing scores. u: Mean (mu) of the Gaussian distribution. sigma: Standard deviation (sigma) of the Gaussian distribution. Returns: Array of PDF values with same shape as X. """ return 1 / (np.sqrt(2 * np.pi) * sigma) * np.exp(-0.5 * np.square((X - u) / sigma))
[docs] def gamma_pdf( X: np.ndarray | list[float] | float, u: float, sigma: float ) -> np.ndarray: """Calculate Gamma probability density function values using mean and std. The shape and scale parameters are derived from mean (u) and std (sigma). Args: X: Input array of shape (n,) representing scores. u: Mean of the distribution. sigma: Standard deviation of the distribution. Returns: Array of Gamma PDF values with same shape as X. """ a = (u / sigma) ** 2 scale = sigma**2 / u return dgamma.pdf(X, a=a, scale=scale)
[docs] class TDA_fmm: """Finite Mixture Model (FMM) for Target-Decoy Analysis (TDA). This class estimates score distributions using a mixture of Gaussians. It supports modeling both target and decoy distributions, where the decoy model can be incorporated as an external component in the target model. Attributes: n_components: Number of Gaussian components in the mixture. external_model: Optional fitted decoy model (for target modeling). max_iter: Maximum number of EM iterations. main_pdf: PDF function used for the first component. helper_pdf: PDF function used for other components. weights: Learned mixture weights (pi_k). mu: Learned means for each component. sigma: Learned standard deviations for each component. """
[docs] def __init__( self, n_components: int, external_model: Optional["TDA_fmm"] = None ) -> None: """Initializes the TDA_fmm model. Args: n_components: Number of Gaussian components in the mixture. external_model: Pre-fitted decoy model. If None, models decoy; if provided, models target with decoy as a component. """ self.n_components: int = n_components self.external_model: "TDA_fmm" | None = external_model self.max_iter: int = Max_Iter if external_model is None: self.main_pdf: Any = ( gauss_pdf # Callable[[ArrayLike, float, float], np.ndarray] ) else: self.main_pdf: Any = gauss_pdf self.helper_pdf: Any = gauss_pdf # Initialize parameters; set to None until fit() self.weights: np.ndarray | None = None self.mu: np.ndarray | None = None self.sigma: np.ndarray | None = None
def __call__(self, X: np.ndarray | list[float]) -> np.ndarray: return self.pdf_mix(X)
[docs] def pep( self, X: np.ndarray | list[float], external_pdf: np.ndarray | None = None, ) -> np.ndarray: """Estimates Posterior Error Probabilities (PEP). PEP = pi0 * f_decoy(x) / f_mixture(x) Args: X: Input scores. external_pdf: Optional precomputed PDF values from external model. If None and external_model exists, it will be computed. Returns: Array of PEP values for each score in X. Returns zeros if model not fitted. """ if self.weights is None or self.external_model is None: return np.zeros(len(X)) if external_pdf is None: external_pdf = self.external_model.pdf(X) return self.weights[-1] * external_pdf / self.pdf_mix(X, external_pdf)
[docs] def get_pi0(self) -> float: """Returns the estimated proportion of decoy (null) components in the mixture. Returns: pi0 value (between 0 and 1). Returns 0 if model not fitted or no external model. """ if self.weights is None or self.external_model is None: return 0.0 else: return float(self.weights[-1])
[docs] def pdf(self, X: np.ndarray | list[float]) -> np.ndarray: """Computes the PDF of the main mixture components (excluding external model). Args: X: Input scores of shape (n,). Returns: PDF values of shape (n,). Returns zeros if model not fitted. """ if self.weights is None: return np.zeros(len(X)) X_arr: np.ndarray = np.array(X) n_samples: int = X_arr.shape[0] pdf: np.ndarray = np.zeros((self.n_components, n_samples)) for i in range(0, self.n_components): if i == 0: pdf[i, :] = self.main_pdf(X_arr, u=self.mu[i], sigma=self.sigma[i]) else: pdf[i, :] = self.helper_pdf(X_arr, u=self.mu[i], sigma=self.sigma[i]) return np.dot(self.weights[: self.n_components], pdf)
[docs] def pdf_mix( self, X: np.ndarray | list[float], external_pdf: np.ndarray | None = None, ) -> np.ndarray: """Computes the full mixture PDF, including external model if present. f_mixture(x) = pi0 * f_decoy(x) + (1-pi0) * f_target(x) Args: X: Input scores. external_pdf: Optional precomputed PDF values from external model. Returns: Mixture PDF values. Returns zeros if model not fitted. """ if self.weights is None: return np.zeros(len(X)) X_arr: np.ndarray = np.array(X) if self.external_model is not None: if external_pdf is None: external_pdf = self.external_model.pdf(X_arr) pdf0: np.ndarray = external_pdf * self.get_pi0() pdf1: np.ndarray = self.pdf(X_arr) * (1 - self.get_pi0()) return pdf0 + pdf1 else: return self.pdf(X_arr)
[docs] def loglik_BIC(self, X: np.ndarray | list[float]) -> tuple[float, float]: """Computes log-likelihood and Bayesian Information Criterion (BIC). BIC = -2 * loglik + num_params * log(n) Args: X: Input scores. Returns: A tuple of (log-likelihood, BIC). Returns (0, 0) if model not fitted. """ if self.weights is None: return 0.0, 0.0 _has_external: int = 1 if self.external_model is not None else 0 loglik: float = float(np.sum(np.log(self.pdf(X)))) n: int = len(X) BIC: float = -2 * loglik + (self.n_components + _has_external) * np.log(n) return loglik, BIC
[docs] def plot( self, title: str, plot_scores: np.ndarray | list[float], false_scores: np.ndarray | list[float] | None = None, ) -> None: """Plots the fitted mixture model against histogram of scores. If an external model exists and false_scores are provided, plots: - Decoy model (external) - Target histogram + mixture fit - Separated true and false components Otherwise, plots only the decoy model fit. Args: title: Title prefix for plots. plot_scores: Scores to plot (e.g., target scores). false_scores: Optional decoy scores for comparison. """ binsize: int = 40 lw: float = 1.0 if self.external_model is not None and false_scores is not None: self.external_model.plot(title, false_scores) scores: np.ndarray = np.array(plot_scores) binsize *= 2 step: float = 0.001 plt.figure() pp = plt.subplot() n, bins, patches = pp.hist( scores, binsize, density=1, facecolor="green", alpha=0.75 ) bins_arr: np.ndarray = np.arange(scores.min(), scores.max(), step) dis: np.ndarray = self.pdf_mix(bins_arr) ratio: float = np.mean(n) / np.mean(dis) dis = dis * ratio pp.plot(bins_arr, dis, "r--", linewidth=lw) pp.set_title(title + " target") # plot false and true distribution separately plt.figure() pp = plt.subplot() dis_false: np.ndarray = self.external_model.pdf(bins_arr) dis_true: np.ndarray = self.pdf(bins_arr) pp.hist(scores, binsize, density=1, facecolor="green", alpha=0.75) dis_false = dis_false * ratio * self.get_pi0() dis_true = dis_true * ratio * (1 - self.get_pi0()) pp.plot(bins_arr, dis_true, "b--", linewidth=lw) pp.plot(bins_arr, dis_false, "r--", linewidth=lw) pp.set_title( title + rf" target mixture ($\pi_0$={self.get_pi0():.2f}, $\pi_1$={1 - self.get_pi0():.2f})" ) else: scores: np.ndarray = np.array(plot_scores) step: float = 0.1 plt.figure() pp = plt.subplot() n, bins, patches = pp.hist( scores, binsize, density=1, facecolor="blue", alpha=0.75 ) bins_arr: np.ndarray = np.arange(scores.min(), scores.max(), step) dis: np.ndarray = self.pdf(bins_arr) dis = dis * np.mean(n) / np.mean(dis) pp.plot(bins_arr, dis, "r--", linewidth=lw) pp.set_title(title + " decoy")
[docs] def fit(self, X: np.ndarray | list[float]) -> None: """Fits the FMM model using Expectation-Maximization (EM) algorithm. Args: X: Input scores to fit the model on. """ if len(X) < 10: self.weights = None return X_arr: np.ndarray = np.array(X) n_components: int = self.n_components # Initialize mu and sigma self.mu = np.min(X_arr) + (np.max(X_arr) - np.min(X_arr)) / ( n_components + 1 ) * np.array(range(1, n_components + 1)) self.sigma = np.ones(n_components) * np.var(X_arr) if self.external_model is not None: d0: np.ndarray = self.external_model.pdf(X_arr) _has_external: int = 1 else: _has_external = 0 self.weights = np.ones(n_components + _has_external) / ( n_components + _has_external ) post_prob: np.ndarray = np.zeros((n_components + _has_external, X_arr.shape[0])) for __iter in range(self.max_iter): dens: np.ndarray = np.zeros((n_components + _has_external, X_arr.shape[0])) if self.external_model is not None: dens[-1, :] = d0 for i in range(0, n_components): if i == 0: dens[i, :] = self.main_pdf(X_arr, u=self.mu[i], sigma=self.sigma[i]) else: dens[i, :] = self.helper_pdf( X_arr, u=self.mu[i], sigma=self.sigma[i] ) total_prob: np.ndarray = np.dot(self.weights, dens) total_prob[total_prob <= 0] = Div_Zero # Avoid division by zero for i in range(n_components + _has_external): post_prob[i, :] = self.weights[i] * dens[i, :] / total_prob sum_prob: np.ndarray = np.sum(post_prob, axis=1) sum_prob[sum_prob == 0] = 1e-12 new_weights: np.ndarray = sum_prob / np.sum(sum_prob) new_mu: np.ndarray = ( np.dot(post_prob[: self.mu.shape[0], :], X_arr) / sum_prob[: self.mu.shape[0]] ) new_sigma: np.ndarray = self.sigma.copy() for i in range(n_components): new_sigma[i] = np.sqrt( np.dot(post_prob[i, :], np.square(X_arr - self.mu[i])) / sum_prob[i] ) if ( np.any(np.isnan(new_sigma)) or np.any(np.isnan(new_mu)) or np.any(np.isinf(new_sigma)) or np.any(np.isinf(new_mu)) or np.any(np.array(new_sigma) <= Div_Zero) or np.any(np.array(new_mu) <= Div_Zero) ): break self.weights = new_weights self.mu = new_mu self.sigma = new_sigma
[docs] class DecoyModel(TDA_fmm): """A simplified model for fitting decoy score distributions. Uses a single Gaussian, optionally filtering outliers using sigma threshold. """
[docs] def __init__( self, gaussian_outlier_sigma: float | None, *args: Any, **kwargs: Any ) -> None: """Initializes the decoy model. Args: gaussian_outlier_sigma: If provided, scores below (mu - sigma * gaussian_outlier_sigma) are filtered before fitting. *args, **kwargs: Ignored, for compatibility. """ super().__init__(n_components=1) self.external_model: TDA_fmm | None = None self.weights: np.ndarray = np.array([1.0]) self.gaussian_outlier_sigma: float | None = gaussian_outlier_sigma self.mu: float | None = None self.sigma: float | None = None
[docs] def fit(self, X: np.ndarray | list[float]) -> None: """Fits a single Gaussian to the decoy scores. Optionally filters left-tail outliers before fitting. Args: X: Decoy scores. """ X_arr: np.ndarray = np.array(X) self.mu = float(np.mean(X_arr)) self.sigma = float(np.std(X_arr)) if self.gaussian_outlier_sigma is not None: threshold: float = self.mu - self.gaussian_outlier_sigma * self.sigma X_filtered: np.ndarray = X_arr[X_arr > threshold] if len(X_filtered) > 0: self.mu = float(np.mean(X_filtered)) self.sigma = float(np.std(X_filtered))
[docs] def pdf(self, X: np.ndarray | list[float]) -> np.ndarray: """Computes Gaussian PDF for given scores. Args: X: Input scores. Returns: PDF values. """ X_arr: np.ndarray = np.array(X) return gauss_pdf(X_arr, self.mu, self.sigma)
[docs] def select_best_fmm( target_scores: np.ndarray | list[float], decoy_fmm: "DecoyModel", _max_component_: int = 3, verbose: bool = True, ) -> TDA_fmm: """Selects the best TDA_fmm model by BIC criterion. Fits models with 1 to `_max_component_` components and selects the one with lowest BIC. Args: target_scores: Scores to fit the target model on. decoy_fmm: Pre-fitted decoy model. _max_component_: Maximum number of components to try. verbose: Whether to print progress. Returns: Best-fitted TDA_fmm model (target model). """ best_target_BIC: float = BIC_Max best_fmm: TDA_fmm | None = None for n_component in range(1, _max_component_ + 1): target_fmm = TDA_fmm(n_component, external_model=decoy_fmm) target_fmm.fit(target_scores) log_lik, BIC = target_fmm.loglik_BIC(target_scores) if best_target_BIC > BIC: best_target_BIC = BIC best_fmm = target_fmm if verbose: print( f"[Target FMM] G={n_component:d}, BIC={BIC:f}, best BIC={best_target_BIC:f}" ) assert best_fmm is not None, "No valid model was selected" return best_fmm