Source code for fennomix_mhc.mhc_utils

"""Utility helpers for handling peptide and protein sequences."""

from __future__ import annotations

import os
from collections.abc import Sequence

import numpy as np
import pandas as pd
from alphabase.protein.fasta import load_all_proteins
from alphabase.protein.lcp_digest import get_substring_indices


[docs] def load_peptide_df_from_mixmhcpred(mixmhcpred_dir: str, rank: int = 2) -> pd.DataFrame: """Load and filter peptide predictions from MixMHCpred output files. This function reads all TSV result files from a directory generated by MixMHCpred, filters peptides based on the '%Rank_bestAllele' column, and returns a unified DataFrame containing the peptide sequences and their corresponding alleles. Args: mixmhcpred_dir: Path to the directory containing MixMHCpred output `.tsv` files. rank: Maximum allowed value for `%Rank_bestAllele`. Only peptides with rank less than or equal to this value are included. Default is 2. Returns: A DataFrame with two columns: - 'sequence': The peptide amino acid sequence. - 'allele': The corresponding MHC allele name derived from the filename. Raises: FileNotFoundError: If the specified directory does not exist or contains no files. pd.errors.EmptyDataError: If no valid data is found in the TSV files. """ df_list: list[pd.DataFrame] = [] for fname in os.listdir(mixmhcpred_dir): fpath: str = os.path.join(mixmhcpred_dir, fname) df: pd.DataFrame = pd.read_table(fpath, skiprows=11) df = df.query(f"`%Rank_bestAllele`<={rank}").copy() df["sequence"] = df["Peptide"] df = df[["sequence"]] df["allele"] = fname[:-4] df_list.append(df) return pd.concat(df_list, ignore_index=True)
[docs] class NonSpecificDigest: """Generate peptides from a protein sequence without specific cleavage."""
[docs] def __init__( self, protein_data: pd.DataFrame | str | list[str], min_peptide_len: int = 8, max_peptide_len: int = 14, ) -> None: """Initialize the digestion object with protein data. Concatenates all protein sequences with '$' as delimiters and precomputes start and stop indices for all possible peptides within the specified length range. Args: protein_data: Input protein data, which can be one of the following: - A pandas DataFrame with a 'sequence' column. - A path to a FASTA file. - A list of paths to FASTA files. min_peptide_len: Minimum length of peptides to generate. Must be >= 1. Default is 8. max_peptide_len: Maximum length of peptides to generate. Must be >= min_peptide_len. Default is 14. Raises: ValueError: If `min_peptide_len` > `max_peptide_len`, or if no sequences are found. TypeError: If `protein_data` is not a DataFrame, string, or list of strings. """ if not isinstance(protein_data, (pd.DataFrame | str | list)): raise TypeError( "protein_data must be a DataFrame, string, or list of strings" ) if isinstance(protein_data, pd.DataFrame): if "sequence" not in protein_data.columns: raise ValueError("DataFrame must contain a 'sequence' column") self.cat_protein_sequence: str = ( "$" + "$".join(protein_data.sequence.values) + "$" ) else: if isinstance(protein_data, str): protein_paths: list[str] = [protein_data] else: protein_paths = protein_data protein_dict: dict[str, dict[str, str]] = load_all_proteins(protein_paths) if not protein_dict: raise ValueError( "No protein sequences found in the provided FASTA files." ) self.cat_protein_sequence = ( "$" + "$".join([prot["sequence"] for prot in protein_dict.values()]) + "$" ) self.digest_starts: np.ndarray self.digest_stops: np.ndarray self.digest_starts, self.digest_stops = get_substring_indices( self.cat_protein_sequence, min_peptide_len, max_peptide_len )
[docs] def get_random_pept_df(self, n: int = 5000) -> pd.DataFrame: """Sample a random set of peptides from the precomputed digest. Args: n: Number of peptides to sample. If `n` exceeds the number of available peptides, sampling is done with replacement. Default is 5000. Returns: A DataFrame with two columns: - 'sequence': Randomly sampled peptide sequences. - 'allele': A constant value 'random' for all rows. """ total_peptides: int = len(self.digest_starts) if total_peptides == 0: return pd.DataFrame(columns=["sequence", "allele"]) idxes: np.ndarray = np.random.randint(0, total_peptides, size=n) sequences: list[str] = [ self.cat_protein_sequence[start:stop] for start, stop in zip( self.digest_starts[idxes], self.digest_stops[idxes], strict=False ) ] df: pd.DataFrame = pd.DataFrame(sequences, columns=["sequence"]) df["allele"] = "random" return df
[docs] def get_peptide_seqs_from_idxes( self, idxes: Sequence[int] | np.ndarray ) -> list[str]: """Retrieve peptide sequences by their digestion index. Args: idxes: A sequence (e.g., list, tuple) or NumPy array of integer indices corresponding to positions in the precomputed digest. Returns: A list of peptide sequences corresponding to the given indices. Raises: IndexError: If any index in `idxes` is out of bounds. """ try: return [ self.cat_protein_sequence[start:stop] for start, stop in zip( self.digest_starts[idxes], self.digest_stops[idxes], strict=False ) ] except IndexError as e: raise IndexError(f"One or more indices are out of bounds: {e}") from e