Source code for fennomix_mhc.mhc_utils

"""Utility helpers for handling peptide and protein sequences."""

from __future__ import annotations

import os
from collections.abc import Sequence

import numpy as np
import pandas as pd
from alphabase.protein.fasta import load_all_proteins
from alphabase.protein.lcp_digest import get_substring_indices



[docs]
def load_peptide_df_from_mixmhcpred(mixmhcpred_dir: str, rank: int = 2) -> pd.DataFrame:
    """Load and filter peptide predictions from MixMHCpred output files.

    This function reads all TSV result files from a directory generated by MixMHCpred,
    filters peptides based on the '%Rank_bestAllele' column, and returns a unified
    DataFrame containing the peptide sequences and their corresponding alleles.

    Args:
        mixmhcpred_dir: Path to the directory containing MixMHCpred output `.tsv` files.
        rank: Maximum allowed value for `%Rank_bestAllele`. Only peptides with rank
            less than or equal to this value are included. Default is 2.

    Returns:
        A DataFrame with two columns:
        - 'sequence': The peptide amino acid sequence.
        - 'allele': The corresponding MHC allele name derived from the filename.

    Raises:
        FileNotFoundError: If the specified directory does not exist or contains no files.
        pd.errors.EmptyDataError: If no valid data is found in the TSV files.
    """
    df_list: list[pd.DataFrame] = []
    for fname in os.listdir(mixmhcpred_dir):
        fpath: str = os.path.join(mixmhcpred_dir, fname)
        df: pd.DataFrame = pd.read_table(fpath, skiprows=11)
        df = df.query(f"`%Rank_bestAllele`<={rank}").copy()
        df["sequence"] = df["Peptide"]
        df = df[["sequence"]]
        df["allele"] = fname[:-4]
        df_list.append(df)
    return pd.concat(df_list, ignore_index=True)




[docs]
class NonSpecificDigest:
    """Generate peptides from a protein sequence without specific cleavage."""


[docs]
    def __init__(
        self,
        protein_data: pd.DataFrame | str | list[str],
        min_peptide_len: int = 8,
        max_peptide_len: int = 14,
    ) -> None:
        """Initialize the digestion object with protein data.

        Concatenates all protein sequences with '$' as delimiters and precomputes
        start and stop indices for all possible peptides within the specified length range.

        Args:
            protein_data: Input protein data, which can be one of the following:
                - A pandas DataFrame with a 'sequence' column.
                - A path to a FASTA file.
                - A list of paths to FASTA files.
            min_peptide_len: Minimum length of peptides to generate. Must be >= 1.
                Default is 8.
            max_peptide_len: Maximum length of peptides to generate. Must be >= min_peptide_len.
                Default is 14.

        Raises:
            ValueError: If `min_peptide_len` > `max_peptide_len`, or if no sequences are found.
            TypeError: If `protein_data` is not a DataFrame, string, or list of strings.
        """
        if not isinstance(protein_data, (pd.DataFrame | str | list)):
            raise TypeError(
                "protein_data must be a DataFrame, string, or list of strings"
            )

        if isinstance(protein_data, pd.DataFrame):
            if "sequence" not in protein_data.columns:
                raise ValueError("DataFrame must contain a 'sequence' column")
            self.cat_protein_sequence: str = (
                "$" + "$".join(protein_data.sequence.values) + "$"
            )
        else:
            if isinstance(protein_data, str):
                protein_paths: list[str] = [protein_data]
            else:
                protein_paths = protein_data
            protein_dict: dict[str, dict[str, str]] = load_all_proteins(protein_paths)
            if not protein_dict:
                raise ValueError(
                    "No protein sequences found in the provided FASTA files."
                )
            self.cat_protein_sequence = (
                "$"
                + "$".join([prot["sequence"] for prot in protein_dict.values()])
                + "$"
            )

        self.digest_starts: np.ndarray
        self.digest_stops: np.ndarray
        self.digest_starts, self.digest_stops = get_substring_indices(
            self.cat_protein_sequence, min_peptide_len, max_peptide_len
        )



[docs]
    def get_random_pept_df(self, n: int = 5000) -> pd.DataFrame:
        """Sample a random set of peptides from the precomputed digest.

        Args:
            n: Number of peptides to sample. If `n` exceeds the number of available
                peptides, sampling is done with replacement. Default is 5000.

        Returns:
            A DataFrame with two columns:
            - 'sequence': Randomly sampled peptide sequences.
            - 'allele': A constant value 'random' for all rows.
        """
        total_peptides: int = len(self.digest_starts)
        if total_peptides == 0:
            return pd.DataFrame(columns=["sequence", "allele"])

        idxes: np.ndarray = np.random.randint(0, total_peptides, size=n)
        sequences: list[str] = [
            self.cat_protein_sequence[start:stop]
            for start, stop in zip(
                self.digest_starts[idxes], self.digest_stops[idxes], strict=False
            )
        ]
        df: pd.DataFrame = pd.DataFrame(sequences, columns=["sequence"])
        df["allele"] = "random"
        return df



[docs]
    def get_peptide_seqs_from_idxes(
        self, idxes: Sequence[int] | np.ndarray
    ) -> list[str]:
        """Retrieve peptide sequences by their digestion index.

        Args:
            idxes: A sequence (e.g., list, tuple) or NumPy array of integer indices
                corresponding to positions in the precomputed digest.

        Returns:
            A list of peptide sequences corresponding to the given indices.

        Raises:
            IndexError: If any index in `idxes` is out of bounds.
        """
        try:
            return [
                self.cat_protein_sequence[start:stop]
                for start, stop in zip(
                    self.digest_starts[idxes], self.digest_stops[idxes], strict=False
                )
            ]
        except IndexError as e:
            raise IndexError(f"One or more indices are out of bounds: {e}") from e