Source code for fennomix_mhc.plotting_utils

import logomaker as lm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import umap
from sklearn.manifold import MDS, TSNE

Turbo256 = (
    "#30123b",
    "#311542",
    "#32184a",
    "#341b51",
    "#351e58",
    "#36215f",
    "#372365",
    "#38266c",
    "#392972",
    "#3a2c79",
    "#3b2f7f",
    "#3c3285",
    "#3c358b",
    "#3d3791",
    "#3e3a96",
    "#3f3d9c",
    "#4040a1",
    "#4043a6",
    "#4145ab",
    "#4148b0",
    "#424bb5",
    "#434eba",
    "#4350be",
    "#4353c2",
    "#4456c7",
    "#4458cb",
    "#455bce",
    "#455ed2",
    "#4560d6",
    "#4563d9",
    "#4666dd",
    "#4668e0",
    "#466be3",
    "#466de6",
    "#4670e8",
    "#4673eb",
    "#4675ed",
    "#4678f0",
    "#467af2",
    "#467df4",
    "#467ff6",
    "#4682f8",
    "#4584f9",
    "#4587fb",
    "#4589fc",
    "#448cfd",
    "#438efd",
    "#4291fe",
    "#4193fe",
    "#4096fe",
    "#3f98fe",
    "#3e9bfe",
    "#3c9dfd",
    "#3ba0fc",
    "#39a2fc",
    "#38a5fb",
    "#36a8f9",
    "#34aaf8",
    "#33acf6",
    "#31aff5",
    "#2fb1f3",
    "#2db4f1",
    "#2bb6ef",
    "#2ab9ed",
    "#28bbeb",
    "#26bde9",
    "#25c0e6",
    "#23c2e4",
    "#21c4e1",
    "#20c6df",
    "#1ec9dc",
    "#1dcbda",
    "#1ccdd7",
    "#1bcfd4",
    "#1ad1d2",
    "#19d3cf",
    "#18d5cc",
    "#18d7ca",
    "#17d9c7",
    "#17dac4",
    "#17dcc2",
    "#17debf",
    "#18e0bd",
    "#18e1ba",
    "#19e3b8",
    "#1ae4b6",
    "#1be5b4",
    "#1de7b1",
    "#1ee8af",
    "#20e9ac",
    "#22eba9",
    "#24eca6",
    "#27eda3",
    "#29eea0",
    "#2cef9d",
    "#2ff09a",
    "#32f197",
    "#35f394",
    "#38f491",
    "#3bf48d",
    "#3ff58a",
    "#42f687",
    "#46f783",
    "#4af880",
    "#4df97c",
    "#51f979",
    "#55fa76",
    "#59fb72",
    "#5dfb6f",
    "#61fc6c",
    "#65fc68",
    "#69fd65",
    "#6dfd62",
    "#71fd5f",
    "#74fe5c",
    "#78fe59",
    "#7cfe56",
    "#80fe53",
    "#84fe50",
    "#87fe4d",
    "#8bfe4b",
    "#8efe48",
    "#92fe46",
    "#95fe44",
    "#98fe42",
    "#9bfd40",
    "#9efd3e",
    "#a1fc3d",
    "#a4fc3b",
    "#a6fb3a",
    "#a9fb39",
    "#acfa37",
    "#aef937",
    "#b1f836",
    "#b3f835",
    "#b6f735",
    "#b9f534",
    "#bbf434",
    "#bef334",
    "#c0f233",
    "#c3f133",
    "#c5ef33",
    "#c8ee33",
    "#caed33",
    "#cdeb34",
    "#cfea34",
    "#d1e834",
    "#d4e735",
    "#d6e535",
    "#d8e335",
    "#dae236",
    "#dde036",
    "#dfde36",
    "#e1dc37",
    "#e3da37",
    "#e5d838",
    "#e7d738",
    "#e8d538",
    "#ead339",
    "#ecd139",
    "#edcf39",
    "#efcd39",
    "#f0cb3a",
    "#f2c83a",
    "#f3c63a",
    "#f4c43a",
    "#f6c23a",
    "#f7c039",
    "#f8be39",
    "#f9bc39",
    "#f9ba38",
    "#fab737",
    "#fbb537",
    "#fbb336",
    "#fcb035",
    "#fcae34",
    "#fdab33",
    "#fda932",
    "#fda631",
    "#fda330",
    "#fea12f",
    "#fe9e2e",
    "#fe9b2d",
    "#fe982c",
    "#fd952b",
    "#fd9229",
    "#fd8f28",
    "#fd8c27",
    "#fc8926",
    "#fc8624",
    "#fb8323",
    "#fb8022",
    "#fa7d20",
    "#fa7a1f",
    "#f9771e",
    "#f8741c",
    "#f7711b",
    "#f76e1a",
    "#f66b18",
    "#f56817",
    "#f46516",
    "#f36315",
    "#f26014",
    "#f15d13",
    "#ef5a11",
    "#ee5810",
    "#ed550f",
    "#ec520e",
    "#ea500d",
    "#e94d0d",
    "#e84b0c",
    "#e6490b",
    "#e5460a",
    "#e3440a",
    "#e24209",
    "#e04008",
    "#de3e08",
    "#dd3c07",
    "#db3a07",
    "#d93806",
    "#d73606",
    "#d63405",
    "#d43205",
    "#d23005",
    "#d02f04",
    "#ce2d04",
    "#cb2b03",
    "#c92903",
    "#c72803",
    "#c52602",
    "#c32402",
    "#c02302",
    "#be2102",
    "#bb1f01",
    "#b91e01",
    "#b61c01",
    "#b41b01",
    "#b11901",
    "#ae1801",
    "#ac1601",
    "#a91501",
    "#a61401",
    "#a31201",
    "#a01101",
    "#9d1001",
    "#9a0e01",
    "#970d01",
    "#940c01",
    "#910b01",
    "#8e0a01",
    "#8b0901",
    "#870801",
    "#840701",
    "#810602",
    "#7d0502",
    "#7a0402",
)



[docs]
def fit_hla_umap_reducer(hla_embeds, random_state=1337):
    """Fits a UMAP reducer on HLA embeddings.

    Args:
        hla_embeds: A 2D numpy array of shape (n_samples, n_features) containing HLA embeddings.
        random_state: Random seed for reproducibility. Default is 1337.

    Returns:
        A fitted UMAP reducer object.
    """
    hla_reducer = umap.UMAP(random_state=random_state)
    hla_reducer.fit(hla_embeds)
    return hla_reducer




[docs]
def transform_embeds_to_umap_df(hla_reducer, embeds, alleles):
    """Transforms embeddings using a pre-fitted UMAP reducer into a 2D DataFrame.

    Args:
        hla_reducer: A fitted UMAP reducer object.
        embeds: A 2D numpy array of shape (n_samples, n_features) to be transformed.
        alleles: A sequence of allele names corresponding to each embedding.

    Returns:
        A pandas DataFrame with columns ['UMAP1', 'UMAP2', 'allele'].
    """
    embedding = hla_reducer.transform(embeds)
    df = pd.DataFrame(embedding, columns=("UMAP1", "UMAP2"))
    df["allele"] = alleles
    return df




[docs]
def transform_matrix_to_mds_df(matrix, labels, seed):
    """Applies MDS transformation on a precomputed distance matrix.

    Args:
        matrix: A 2D square distance matrix of shape (n_samples, n_samples).
        labels: A sequence of labels for each sample.
        seed: Random state for reproducibility.

    Returns:
        A pandas DataFrame with columns ['MDS1', 'MDS2', 'label'].
    """
    mds = MDS(
        n_components=2,
        random_state=seed,
        dissimilarity="precomputed",
        normalized_stress="auto",
    )
    embedding = mds.fit_transform(matrix)
    df = pd.DataFrame(embedding, columns=("MDS1", "MDS2"))
    df["label"] = labels
    return df




[docs]
def transform_embeds_to_tSNE_df(embeds, labels, seed):
    """Applies t-SNE transformation on high-dimensional embeddings.

    Args:
        embeds: A 2D numpy array of shape (n_samples, n_features).
        labels: A sequence of labels for each sample.
        seed: Random state for reproducibility.

    Returns:
        A pandas DataFrame with columns ['t-SNE 1', 't-SNE 2', 'label'].
    """
    tsne = TSNE(
        n_components=2, random_state=seed, perplexity=30, learning_rate=200, n_iter=1000
    )
    embedding = tsne.fit_transform(embeds)
    df = pd.DataFrame(embedding, columns=("t-SNE 1", "t-SNE 2"))
    df["label"] = labels
    return df




[docs]
def plot_umap_df(
    df: pd.DataFrame,
    color_col,
    hover_col,
    size=1,
    jump_color=True,
    image_width=700,
    image_height=600,
    save_as="",
):
    """Creates an interactive UMAP scatter plot using Plotly.

    Args:
        df: Input DataFrame containing UMAP1, UMAP2, and metadata columns.
        color_col: Column name to use for coloring points.
        hover_col: Column name to display in hover tooltip.
        size: Marker size.
        jump_color: Whether to use spaced-out colors from Turbo256 palette.
        image_width: Width of the output image in pixels.
        image_height: Height of the output image in pixels.
        save_as: Optional file path to save the image (e.g., "plot.png").

    Returns:
        A Plotly Figure object.
    """
    factors = df[color_col].drop_duplicates().to_list()
    color_mapping = {}
    for i, factor in enumerate(factors):
        if jump_color:
            color_mapping[factor] = Turbo256[i * 23 % len(Turbo256)]
        else:
            color_mapping[factor] = Turbo256[
                int(i * 1.0 * len(Turbo256) / len(factors))
            ]
    fig = px.scatter(
        df,
        x="UMAP1",
        y="UMAP2",
        color=color_col,
        hover_data=[hover_col],
        color_discrete_map=color_mapping,
        width=image_width,
        height=image_height,
        template="plotly_white",
    )
    fig.update_traces(marker=dict(size=size))
    fig.update_layout(
        legend=dict(
            itemsizing="constant",
            orientation="h",
            x=1,
            xanchor="right",
            yanchor="bottom",
            y=-1.0 / 11 * (len(factors) // 7 + 1),
        ),
        xaxis=dict(
            showgrid=False,
            visible=False,
            showticklabels=False,
        ),
        yaxis=dict(
            showgrid=False,
            visible=False,
            showticklabels=False,
        ),
    )
    if save_as:
        fig.write_image(save_as)
    # fig.show()
    return fig




[docs]
def plot_motif_multi_mer(
    df,
    allele_col,
    allele,
    kmers,
    axes=None,
    logo_scale=20,
    fig_width_per_kmer=4,
    fig_height=3,
):
    """Plots sequence logos for multiple k-mers (lengths) of a given allele.

    Args:
        df: Input DataFrame with 'sequence' and allele columns.
        allele_col: Name of the column containing allele identifiers.
        allele: Specific allele to plot.
        kmers: List of peptide lengths (k-mer sizes) to visualize.
        axes: Optional matplotlib axes to plot on. If None, creates new subplots.
        logo_scale: Scaling factor for information content in logos.
        fig_width_per_kmer: Width per subplot in inches.
        fig_height: Height of the entire figure in inches.

    Returns:
        A list of Logomaker Logo objects.
    """
    df["nAA"] = df.sequence.str.len()
    df = df.drop_duplicates(
        [allele_col, "sequence"]
        + (["mods", "mod_sites"] if "mods" in df.columns else [])
    )
    if axes is None:
        fig, axes = plt.subplots(
            1,
            len(kmers),
            figsize=(len(kmers) * fig_width_per_kmer, fig_height),
            sharey="row",
        )
    logo_plots = []
    max_y_vals = []

    if not isinstance(axes, np.ndarray):
        axes = [axes]

    for i, ax in enumerate([axes] if len(kmers) == 1 else axes):
        logo_plots.append(
            plot_motif(df, allele_col, allele, kmers[i], ax=ax, logo_scale=logo_scale)
        )
        max_y_vals.append(np.ceil(ax.get_ylim()[1] * 5 + 1) * 0.2)

    max_y = np.max(max_y_vals)
    for i, logos in enumerate(logo_plots):
        logos.ax.text(
            3,
            max_y + 0.1,
            f"n={len(df[(df[allele_col] == allele) & (df.nAA == kmers[i])])}",
        )
    adjust_axes(logo_plots, max_y)
    return logo_plots




[docs]
def plot_motif(df, allele_col, allele, kmer, ax=None, logo_scale=20):
    """Plots a sequence logo for a specific allele and k-mer length.

    Args:
        df: Input DataFrame with sequences and allele info.
        allele_col: Column name for allele.
        allele: Allele name to filter.
        kmer: Length of peptide to analyze.
        ax: Matplotlib axis to plot on. If None, uses current axis.
        logo_scale: Scaling factor for motif information content.

    Returns:
        A Logomaker Logo object.
    """
    motif_df = count_motif_bits(df, allele_col, allele, kmer, logo_scale=logo_scale)
    logo_plot = lm.Logo(
        motif_df,
        font_name="DejaVu Sans",
        color_scheme="chemistry",
        ax=ax,
    )
    logo_plot.style_xticks(anchor=0, spacing=1, rotation=0)
    return logo_plot




[docs]
def count_motif_bits(df, allele_col, allele, kmer, logo_scale=20):
    """Counts amino acid frequencies at each position and computes information content.

    Args:
        df: Input DataFrame with sequences.
        allele_col: Column name for allele.
        allele: Allele to filter.
        kmer: Peptide length to consider.
        logo_scale: Scaling factor for log-odds (controls logo height).

    Returns:
        A DataFrame of information content per position (rows) and amino acid (columns).
    """
    df = df[(df[allele_col] == allele) & (df.nAA == kmer)]
    # print(f"allele={allele}, kmer={kmer}, n={len(df)}")
    data = np.zeros((kmer, 26), dtype=float)
    for seq in df.sequence.values:
        data[(np.arange(kmer), np.array(seq, "c").view(np.int8) - ord("A"))] += 1
    data /= len(df)
    df = pd.DataFrame(data=data, columns=list("ABCDEFGHIJKLMNOPQRSTUVWXYZ"))

    df = df[list("ACDEFGHIKLMNPQRSTVWY")]
    df = df.apply(lambda p: p * np.log2(p * logo_scale + 1e-100))
    # df.values[df.isin([np.nan, np.inf, -np.inf])] = np.min(
    #     np.ma.masked_invalid(df.sum(axis=1))
    # )
    df.values[df.isin([np.nan, np.inf, -np.inf])] = 0
    return df




[docs]
def adjust_axes(logo_plots, max_y):
    """Adjusts y-axis limits of logo plots to a common maximum.

    Args:
        logo_plots: List of Logomaker Logo objects (or nested lists).
        max_y: Target maximum y-value for all plots.
    """
    for logos in logo_plots:
        if isinstance(logos, list):
            logos[0].ax.set_ylim(top=max_y)
            # for logo in logos:
            #     if max_y < logo.ax.get_ylim()[1]:
            #         max_y = logo.ax.get_ylim()[1]
            #     else:
            #         logo.ax.set_ylim(top=np.ceil(max_y*5+1)*0.2)
        else:
            logos.ax.set_ylim(top=max_y)
            break




[docs]
def select_optimal_a_cover(a_to_b_map, uncovered_b, coverage_threshold, max_a_elements):
    """Greedy selection of 'a' elements to maximize coverage of 'b' elements.

    Implements a greedy set cover algorithm: selects 'a' such that their mapped 'b'
    set covers the most uncovered elements iteratively.

    Args:
        a_to_b_map: Mapping from elements of set A to sets of elements in B.
        uncovered_b: Initial set of uncovered elements in B.
        coverage_threshold: Target fraction of B to cover (e.g., 0.9 for 90%).
        max_a_elements: Maximum number of 'a' elements to select.

    Returns:
        A set of selected 'a' elements.
    """
    total_b = len(uncovered_b)
    selected_a = set()
    current_coverage = 0.0
    selected_count = 0

    while current_coverage < coverage_threshold and selected_count < max_a_elements:
        best_a = None
        best_coverage_count = 0

        for a, b_set in a_to_b_map.items():
            if a in selected_a:
                continue

            cover_count = sum(1 for b in b_set if b in uncovered_b)

            if cover_count > best_coverage_count:
                best_a = a
                best_coverage_count = cover_count

        if best_a is not None:
            selected_a.add(best_a)
            selected_count += 1

            for b in a_to_b_map[best_a]:
                if b in uncovered_b:
                    uncovered_b.remove(b)

            current_coverage = 1.0 - len(uncovered_b) / total_b
        else:
            break

    return selected_a