Source code for fennomix_mhc.mhc_utils
"""Utility helpers for handling peptide and protein sequences."""
from __future__ import annotations
import os
from collections.abc import Sequence
import numpy as np
import pandas as pd
from alphabase.protein.fasta import load_all_proteins
from alphabase.protein.lcp_digest import get_substring_indices
[docs]
def load_peptide_df_from_mixmhcpred(mixmhcpred_dir: str, rank: int = 2) -> pd.DataFrame:
"""Load and filter peptide predictions from MixMHCpred output files.
This function reads all TSV result files from a directory generated by MixMHCpred,
filters peptides based on the '%Rank_bestAllele' column, and returns a unified
DataFrame containing the peptide sequences and their corresponding alleles.
Args:
mixmhcpred_dir: Path to the directory containing MixMHCpred output `.tsv` files.
rank: Maximum allowed value for `%Rank_bestAllele`. Only peptides with rank
less than or equal to this value are included. Default is 2.
Returns:
A DataFrame with two columns:
- 'sequence': The peptide amino acid sequence.
- 'allele': The corresponding MHC allele name derived from the filename.
Raises:
FileNotFoundError: If the specified directory does not exist or contains no files.
pd.errors.EmptyDataError: If no valid data is found in the TSV files.
"""
df_list: list[pd.DataFrame] = []
for fname in os.listdir(mixmhcpred_dir):
fpath: str = os.path.join(mixmhcpred_dir, fname)
df: pd.DataFrame = pd.read_table(fpath, skiprows=11)
df = df.query(f"`%Rank_bestAllele`<={rank}").copy()
df["sequence"] = df["Peptide"]
df = df[["sequence"]]
df["allele"] = fname[:-4]
df_list.append(df)
return pd.concat(df_list, ignore_index=True)
[docs]
class NonSpecificDigest:
"""Generate peptides from a protein sequence without specific cleavage."""
[docs]
def __init__(
self,
protein_data: pd.DataFrame | str | list[str],
min_peptide_len: int = 8,
max_peptide_len: int = 14,
) -> None:
"""Initialize the digestion object with protein data.
Concatenates all protein sequences with '$' as delimiters and precomputes
start and stop indices for all possible peptides within the specified length range.
Args:
protein_data: Input protein data, which can be one of the following:
- A pandas DataFrame with a 'sequence' column.
- A path to a FASTA file.
- A list of paths to FASTA files.
min_peptide_len: Minimum length of peptides to generate. Must be >= 1.
Default is 8.
max_peptide_len: Maximum length of peptides to generate. Must be >= min_peptide_len.
Default is 14.
Raises:
ValueError: If `min_peptide_len` > `max_peptide_len`, or if no sequences are found.
TypeError: If `protein_data` is not a DataFrame, string, or list of strings.
"""
if not isinstance(protein_data, (pd.DataFrame | str | list)):
raise TypeError(
"protein_data must be a DataFrame, string, or list of strings"
)
if isinstance(protein_data, pd.DataFrame):
if "sequence" not in protein_data.columns:
raise ValueError("DataFrame must contain a 'sequence' column")
self.cat_protein_sequence: str = (
"$" + "$".join(protein_data.sequence.values) + "$"
)
else:
if isinstance(protein_data, str):
protein_paths: list[str] = [protein_data]
else:
protein_paths = protein_data
protein_dict: dict[str, dict[str, str]] = load_all_proteins(protein_paths)
if not protein_dict:
raise ValueError(
"No protein sequences found in the provided FASTA files."
)
self.cat_protein_sequence = (
"$"
+ "$".join([prot["sequence"] for prot in protein_dict.values()])
+ "$"
)
self.digest_starts: np.ndarray
self.digest_stops: np.ndarray
self.digest_starts, self.digest_stops = get_substring_indices(
self.cat_protein_sequence, min_peptide_len, max_peptide_len
)
[docs]
def get_random_pept_df(self, n: int = 5000) -> pd.DataFrame:
"""Sample a random set of peptides from the precomputed digest.
Args:
n: Number of peptides to sample. If `n` exceeds the number of available
peptides, sampling is done with replacement. Default is 5000.
Returns:
A DataFrame with two columns:
- 'sequence': Randomly sampled peptide sequences.
- 'allele': A constant value 'random' for all rows.
"""
total_peptides: int = len(self.digest_starts)
if total_peptides == 0:
return pd.DataFrame(columns=["sequence", "allele"])
idxes: np.ndarray = np.random.randint(0, total_peptides, size=n)
sequences: list[str] = [
self.cat_protein_sequence[start:stop]
for start, stop in zip(
self.digest_starts[idxes], self.digest_stops[idxes], strict=False
)
]
df: pd.DataFrame = pd.DataFrame(sequences, columns=["sequence"])
df["allele"] = "random"
return df
[docs]
def get_peptide_seqs_from_idxes(
self, idxes: Sequence[int] | np.ndarray
) -> list[str]:
"""Retrieve peptide sequences by their digestion index.
Args:
idxes: A sequence (e.g., list, tuple) or NumPy array of integer indices
corresponding to positions in the precomputed digest.
Returns:
A list of peptide sequences corresponding to the given indices.
Raises:
IndexError: If any index in `idxes` is out of bounds.
"""
try:
return [
self.cat_protein_sequence[start:stop]
for start, stop in zip(
self.digest_starts[idxes], self.digest_stops[idxes], strict=False
)
]
except IndexError as e:
raise IndexError(f"One or more indices are out of bounds: {e}") from e