Source code for metachat.preprocessing._importData

import io
import pkgutil
import anndata
import numpy as np
import pandas as pd
from typing import Optional


[docs]
def MetaChatDB(
    species: str = "mouse"
) -> pd.DataFrame:
    """
    Load the metabolite-sensor (MS) pair table shipped with the package.

    Parameters
    ----------
    species
        Species whose table should be loaded. The value is spliced into the
        packaged resource name ``_data/MetaChatDB/MetaChatDB_<species>.tsv``.
        Choose between 'mouse' and 'human'.

    Returns
    -------
    df_metasen : pandas.DataFrame
        The tab-separated resource parsed as-is. It is expected to hold six
        columns: Metabolite, Sensor, Metabolite.Pathway, Sensor.Pathway,
        Metabolite.Names and Long.Range.Channel.

    """
    # Assemble the resource path relative to this module's package directory.
    resource = "".join(("_data/MetaChatDB/MetaChatDB_", species, ".tsv"))
    raw_bytes = pkgutil.get_data(__name__, resource)

    # pkgutil hands back raw bytes; pandas needs a file-like object.
    with io.BytesIO(raw_bytes) as buffer:
        df_metasen = pd.read_csv(buffer, sep='\t')

    return df_metasen


def scFEA_annotation(
) -> pd.DataFrame:
    """
    Load the scFEA metabolite annotation table bundled with the package.

    Returns
    -------
    met_annota : pandas.DataFrame
        Contents of the packaged ``_data/scFEA/metabo2module.csv`` file,
        parsed as comma-separated values.
    """
    raw_bytes = pkgutil.get_data(__name__, "_data/scFEA/metabo2module.csv")

    # pkgutil hands back raw bytes; pandas needs a file-like object.
    with io.BytesIO(raw_bytes) as buffer:
        met_annota = pd.read_csv(buffer, sep=',')

    return met_annota

def compass_annotation(
) -> pd.DataFrame:
    """
    Load the COMPASS metabolite metadata table bundled with the package.

    Returns
    -------
    met_annota : pandas.DataFrame
        Contents of the packaged ``_data/Compass/met_md.csv`` file, parsed
        as comma-separated values.
    """
    raw_bytes = pkgutil.get_data(__name__, "_data/Compass/met_md.csv")

    # pkgutil hands back raw bytes; pandas needs a file-like object.
    with io.BytesIO(raw_bytes) as buffer:
        met_annota = pd.read_csv(buffer, sep=',')

    return met_annota


[docs]
def generate_adata_met_scFEA(
    data_path: str
):
    """
    Build a metabolite AnnData object from an scFEA result table.

    Parameters
    ----------
    data_path : str
        Path to the metabolite data file (CSV format). The first column is
        used as the row index.

    Returns
    -------
    adata_met : anndata.AnnData
        AnnData wrapping the table, with columns relabelled by the
        ``HMDB.ID`` column of the packaged scFEA annotation and negative
        values replaced by zero.

    Notes
    -----
    Columns are relabelled positionally, so the input table must have as
    many columns as the annotation has rows, in the same order.
    """
    flux_table = pd.read_csv(data_path, index_col=0)

    # Swap the file's own column labels for HMDB identifiers (by position).
    hmdb_ids = scFEA_annotation()['HMDB.ID']
    flux_table.columns = hmdb_ids

    # Negative entries are floored at zero before wrapping.
    is_negative = flux_table.lt(0)
    flux_table[is_negative] = 0

    return anndata.AnnData(flux_table)



[docs]
def generate_adata_met_compass(
    compass_output: str,
    score: str = "Balance",
    norm: str = "rank",
    agg: str = "topk_mean",
    topk: int = 5,
    **score_kwargs
):
    """
    Build a metabolite AnnData object from COMPASS reaction-level output.

    Each cytoplasmic (``[c]``) metabolite whose annotation ID starts with
    ``HMDB`` is scored by
    :func:`_compass_utils.compute_metabolite_scores`, which derives
    metabolite-level production / consumption scores from the COMPASS
    reaction penalties.  The selected score column is collected per
    metabolite, negative values are set to zero, and the metabolites are
    relabelled with their HMDB IDs.

    Reaction metadata (``rxn_md.csv``) and metabolite annotation
    (``met_md.csv``) are loaded from the package's built-in data.

    Parameters
    ----------
    compass_output : str
        Path to the COMPASS output file (tab-separated, first column used
        as the row index, one column per cell).
    score : str
        Score column to extract for each metabolite.  One of ``"Prod"``,
        ``"Cons"``, or ``"Balance"`` (default).
        ``"Balance"`` is production minus consumption.
    norm : str
        Per-reaction normalisation across cells before aggregation.
        ``"rank"`` (default) uses percentile rank; ``"minmax"`` uses
        min-max scaling.
    agg : str
        Aggregation method across reactions.  ``"topk_mean"`` (default)
        averages the top-*k* reactions per cell; ``"max"`` takes the
        maximum.
    topk : int
        Number of top reactions used when ``agg="topk_mean"``. Default 5.
    **score_kwargs
        Additional keyword arguments forwarded to
        :func:`_compass_utils.compute_metabolite_scores`, e.g.
        ``exclude_transport=False``.

    Returns
    -------
    adata_met : anndata.AnnData
        AnnData with shape *(cells × metabolites)*.
        ``obs`` index = cell IDs; ``var`` index = HMDB IDs.  When several
        metabolites share an HMDB ID only the first one is kept.
    """
    from ._compass_utils import compute_metabolite_scores

    penalties = pd.read_csv(compass_output, sep="\t", index_col=0)

    rxn_md_df = pd.read_csv(
        io.BytesIO(pkgutil.get_data(__name__, "_data/Compass/rxn_md.csv")),
        index_col=0,
    )

    # Keep only cytoplasmic metabolites that carry an HMDB identifier.
    annotation = compass_annotation()
    in_cytoplasm = annotation["compartment"] == "[c]"
    has_hmdb_id = annotation["ID"].str.startswith("HMDB", na=False)
    cyto_mets = annotation[in_cytoplasm & has_hmdb_id].copy()

    scoring_options = dict(norm=norm, agg=agg, topk=topk, **score_kwargs)

    # met (e.g. "h[c]") -> score Series
    per_metabolite = {}
    for met_name, met_id in zip(cyto_mets["metName"], cyto_mets["met"]):
        scores_df, _details = compute_metabolite_scores(
            rxn_md_df, penalties, f"{met_name} [c]", **scoring_options
        )
        per_metabolite[met_id] = scores_df[score]

    # Rows follow the cell order of the COMPASS matrix columns.
    met_mat = pd.DataFrame(per_metabolite, index=penalties.columns)
    is_negative = met_mat.lt(0)
    met_mat[is_negative] = 0

    # Map met IDs to HMDB IDs; drop duplicate HMDB columns (keep first)
    hmdb_lookup = cyto_mets.set_index("met")["ID"]
    met_mat.columns = met_mat.columns.map(hmdb_lookup)
    first_occurrence = ~met_mat.columns.duplicated(keep="first")
    met_mat = met_mat.loc[:, first_occurrence]

    return anndata.AnnData(met_mat)


def generate_adata_met_compass_condition(
    compass_outputs: list,
    score: str = "Balance",
    norm: str = "rank",
    agg: str = "topk_mean",
    topk: int = 5,
    **score_kwargs
):
    """
    Generate metabolite matrices for multiple COMPASS conditions with joint ranking.

    All reaction penalty matrices are concatenated before any normalisation
    so that per-reaction rank (or min-max) scaling is computed across all cells
    from all conditions simultaneously.  This makes scores comparable across
    conditions.  Results are then split back and returned as a list of AnnData
    objects in the same order as the input paths.

    Parameters
    ----------
    compass_outputs : list of str
        Paths to the COMPASS reaction penalties TSV files, one per condition.
        Must contain at least one path.
    score : str
        Score column to extract.  One of ``"Prod"``, ``"Cons"``, or
        ``"Balance"`` (default).
    norm : str
        Per-reaction normalisation across cells.  ``"rank"`` (default)
        or ``"minmax"``.
    agg : str
        Aggregation method.  ``"topk_mean"`` (default) or ``"max"``.
    topk : int
        Number of top reactions used when ``agg="topk_mean"``. Default 5.
    **score_kwargs
        Additional keyword arguments forwarded to
        :func:`_compass_utils.compute_metabolite_scores`.

    Returns
    -------
    list of anndata.AnnData
        One AnnData per condition *(cells × metabolites)*, in the same order
        as ``compass_outputs``.  Negative scores are set to zero and, when
        several metabolites share an HMDB ID, only the first is kept.

    Raises
    ------
    ValueError
        If ``compass_outputs`` is empty, or if no metabolite could be scored.

    Warns
    -----
    UserWarning
        If some reactions are missing from at least one file and were
        therefore dropped by the inner join.

    Notes
    -----
    Cell IDs do not need to be unique across conditions — duplicate barcodes
    are handled internally with a temporary prefix.  Only reactions present
    in every file (inner join) are used for scoring.
    """
    import warnings

    # Validate before doing any work so an empty input fails with a clear
    # message instead of an IndexError further down.
    compass_outputs = list(compass_outputs)
    if not compass_outputs:
        raise ValueError(
            "compass_outputs is empty; provide at least one COMPASS "
            "reaction penalties file."
        )

    from ._compass_utils import compute_metabolite_scores

    mats = [pd.read_csv(p, sep="\t", index_col=0) for p in compass_outputs]
    cells_per_cond = [m.columns.tolist() for m in mats]

    # Add a temporary numeric prefix to make column names globally unique,
    # so that identical barcodes across conditions don't collide after concat.
    prefixed_mats = []
    for i, mat in enumerate(mats):
        renamed = mat.copy()
        renamed.columns = [f"_c{i}_{c}" for c in mat.columns]
        prefixed_mats.append(renamed)

    # Joint matrix — rank is computed across all cells from all conditions
    compass_rxn_mat = pd.concat(prefixed_mats, axis=1, join="inner")

    # Count dropped reactions against the union over ALL files; using only
    # the first file as baseline would miss reactions unique to later files.
    all_rxns = set().union(*(m.index for m in mats))
    n_dropped = len(all_rxns) - len(set(compass_rxn_mat.index))
    if n_dropped > 0:
        warnings.warn(
            f"{n_dropped} reaction(s) not shared across all files "
            "were dropped (inner join).",
            stacklevel=2,  # attribute the warning to the caller's line
        )

    rxn_md_data = pkgutil.get_data(__name__, "_data/Compass/rxn_md.csv")
    rxn_md_df = pd.read_csv(io.BytesIO(rxn_md_data), index_col=0)

    # Keep only cytoplasmic metabolites that carry an HMDB identifier.
    met_md = compass_annotation()
    met_md_c = met_md[
        (met_md["compartment"] == "[c]") &
        met_md["ID"].str.startswith("HMDB", na=False)
    ].copy()

    cells = compass_rxn_mat.columns
    records = {}  # met (e.g. "h[c]") -> score Series

    for met_name, met_id in zip(met_md_c["metName"], met_md_c["met"]):
        scores_df, _ = compute_metabolite_scores(
            rxn_md_df, compass_rxn_mat, f"{met_name} [c]",
            norm=norm, agg=agg, topk=topk, **score_kwargs
        )

        records[met_id] = scores_df[score]

    if not records:
        raise ValueError(
            "No cytoplasmic metabolites could be scored. "
            "Verify that the input files and built-in rxn_md are compatible."
        )

    met_mat = pd.DataFrame(records, index=cells)
    met_mat[met_mat < 0] = 0

    # Map met IDs to HMDB IDs; drop duplicate HMDB columns (keep first)
    met_to_hmdb = met_md_c.set_index("met")["ID"]
    met_mat.columns = met_mat.columns.map(met_to_hmdb)
    met_mat = met_mat.loc[:, ~met_mat.columns.duplicated(keep="first")]

    # Split by prefixed column names, then restore original barcodes as index
    result = []
    for orig_cells, renamed_mat in zip(cells_per_cond, prefixed_mats):
        chunk = met_mat.loc[renamed_mat.columns].copy()
        chunk.index = orig_cells
        result.append(anndata.AnnData(chunk))
    return result



[docs]
def generate_adata_met_mebocost(
    data_path: str
):
    """
    Build a metabolite AnnData object from a MEBOCOST result table.

    Parameters
    ----------
    data_path : str
        Path to the metabolite data file (CSV format). The first column is
        used as the row index.

    Returns
    -------
    adata_met : anndata.AnnData
        AnnData built from the transposed table, so rows of the file become
        variables and columns of the file become observations. Negative
        values are replaced by zero.
    """
    raw_table = pd.read_csv(data_path, index_col=0)

    # Negative entries are floored at zero before wrapping.
    is_negative = raw_table.lt(0)
    raw_table[is_negative] = 0

    # Transpose so that the file's columns end up as AnnData observations.
    return anndata.AnnData(raw_table.transpose())