# Source code for metachat.preprocessing._identifyLRC

# ============================================================
import numpy as np
import pandas as pd

from pydpc import Cluster

import matplotlib as mpl
import matplotlib.pyplot as plt

import anndata
# ============================================================


# [docs]
def LRC_unfiltered(
    adata: anndata.AnnData,
    LRC_name: str = None,
    LRC_source: str = "marker",
    obs_name: str = None,  
    quantile: float = 90.0,
    copy: bool = False
):
    """
    Identify unfiltered candidate LRC (long-range channel) spots based on the quantile of a marker feature.

    Observations whose value in ``adata.obs[obs_name]`` is strictly greater than the
    ``quantile``-th percentile of that column are flagged as candidates. The flags are
    stored in ``adata.obs['LRC_<LRC_name>_<LRC_source>_unfiltered']`` as a categorical
    column with value 1 for candidates and 0 otherwise.

    Parameters
    ----------
    adata : anndata.AnnData
        Annotated data matrix with shape ``n_obs × n_var``.
    LRC_name : str
        The name of the long-range channel (e.g., ``'Blood'`` or ``'CSF'``). Required.
    LRC_source : str, default='marker'
        The type of feature used for selection (e.g., ``'marker'``, ``'score'``).
        This will be included in the generated column name.
    obs_name : str
        The key in ``adata.obs`` containing the numeric feature used for quantile
        selection. Required.
    quantile : float, default=90.0
        The percentile threshold (0–100), passed to :func:`numpy.percentile`.
        Example: 90.0 means select all points above the 90th percentile.
    copy : bool, default=False
        If True, the input object is left untouched and a modified copy is returned.
        Otherwise the input object is modified in place and None is returned.

    Returns
    -------
    adata : anndata.AnnData or None
        If ``copy=True``, a copy of the AnnData with the new column
        ``'LRC_<LRC_name>_<LRC_source>_unfiltered'`` in ``.obs``.
        Otherwise, modifies in place and returns None.

    Raises
    ------
    AssertionError
        If ``LRC_name`` or ``obs_name`` is None.
    KeyError
        If ``obs_name`` is not a column of ``adata.obs``.

    Notes
    -----
    The resulting column is stored as a pandas ``Categorical``. Only the values that
    actually occur become categories, so a column without any candidate has the single
    category 0.
    """

    # ==== Validate inputs ====
    assert LRC_name is not None, "Please provide an LRC_name."
    assert obs_name is not None, "Please provide an obs_name."

    # ==== Identify candidate cells ====
    feature_values = adata.obs[obs_name].to_numpy().ravel()
    threshold = np.percentile(feature_values, q=quantile)
    # Strict comparison: values equal to the threshold are not candidates
    candidate_flags = pd.Categorical((feature_values > threshold).astype(int))

    # ==== Store results ====
    # Copy only after the labels were computed successfully, and before any write,
    # so that copy=True never touches the caller's object.
    if copy:
        adata = adata.copy()
    key_name = f"LRC_{LRC_name}_{LRC_source}_unfiltered"
    adata.obs[key_name] = candidate_flags

    print(f"Cells above the {quantile}% have been selected as candidates and stored in 'adata.obs['LRC_{LRC_name}_{LRC_source}_unfiltered']'.")

    return adata if copy else None



# [docs]
def _draw_decision_graph(ax, cluster, outlier_cutoff):
    """Draw the density-delta scatter of ``cluster`` on ``ax`` together with the cutoff lines."""
    ax.scatter(cluster.density, cluster.delta, s=10)
    # Solid red lines: the density / delta cutoffs stored on the cluster object
    ax.plot([cluster.min_density, cluster.density.max()], [cluster.min_delta, cluster.min_delta], linewidth=2, color="red")
    ax.plot([cluster.min_density, cluster.min_density], [cluster.min_delta, cluster.delta.max()], linewidth=2, color="red")
    # Dashed red line: density at or below which a point is treated as an outlier
    ax.plot([outlier_cutoff, outlier_cutoff], [0, cluster.delta.max()], linewidth=2, color="red", linestyle='--')
    ax.set_xlabel(r"density")
    ax.set_ylabel(r"delta / a.u.")
    ax.set_box_aspect(1)


def LRC_cluster(
    adata: anndata.AnnData, 
    LRC_name: str = None,
    LRC_source: str = "marker",
    spatial_index: str = "spatial",
    density_cutoff: float = 10.0,
    delta_cutoff: float = 10.0,
    outlier_cutoff: float = 2.0, 
    fraction: float = 0.02,
    plot_savepath: str = None
):
    """
    Perform local density clustering on unfiltered LRC candidate points.

    The spatial coordinates of the observations flagged in
    ``adata.obs['LRC_<LRC_name>_<LRC_source>_unfiltered']`` are clustered with
    ``pydpc.Cluster`` (density–delta based clustering). Points whose density is at or
    below ``outlier_cutoff`` are marked as outliers. A diagnostic figure is drawn for
    2D and 3D coordinates.

    Parameters
    ----------
    adata : anndata.AnnData
        Annotated data matrix (``n_obs × n_var``) containing spatial coordinates.
    LRC_name : str
        Name of the long-range channel (e.g. ``'Blood'`` or ``'CSF'``). Required.
    LRC_source : str, default='marker'
        Type of source feature used for identifying LRC candidates (included in the key name).
    spatial_index : str, default='spatial'
        Key in ``adata.obsm`` storing spatial coordinates for clustering.
    density_cutoff : float, default=10.0
        Threshold for selecting cluster centers based on local density
        (first argument of ``Cluster.assign``).
    delta_cutoff : float, default=10.0
        Threshold for selecting cluster centers based on delta distance
        (second argument of ``Cluster.assign``).
    outlier_cutoff : float, default=2.0
        Points with ``density <= outlier_cutoff`` are flagged as outliers.
    fraction : float, default=0.02
        Passed to ``pydpc.Cluster`` as its ``fraction`` argument.
    plot_savepath : str, optional
        Path to save the clustering diagnostic plots (e.g., ``'results/LRC_cluster.png'``).
        If None, the plot will be displayed with ``plt.show()``.

    Returns
    -------
    LRC_cluster : pydpc.dpc.Cluster
        The cluster object with attributes such as ``density``, ``delta`` and
        ``membership``, plus an ``outlier`` boolean array added by this function.
        It can be used as input for :func:`mc.pp.LRC_filtered`.

    Raises
    ------
    AssertionError
        If ``LRC_name`` is None.
    KeyError
        If the unfiltered candidate column is missing from ``adata.obs``.
    ValueError
        If no observation is flagged as a candidate.

    Notes
    -----
    The function requires that :func:`mc.pp.LRC_unfiltered` has been run beforehand,
    which stores unfiltered LRC candidates in ``adata.obs['LRC_<LRC_name>_<LRC_source>_unfiltered']``.

    For 2D coordinates the figure shows the density–delta graph next to the spatial
    cluster assignment (outliers in grey); for 3D coordinates only the density–delta
    graph is drawn.
    """

    # ==== Validate inputs ====
    assert LRC_name is not None, "Please provide an LRC name."
    key = f"LRC_{LRC_name}_{LRC_source}_unfiltered"
    if key not in adata.obs.keys():
        raise KeyError("Please run the mc.pp.LRC_unfiltered function first.")

    # ==== Extract spatial coordinates ====
    candidate_mask = adata.obs[key].astype(bool).to_numpy()
    coords = adata.obsm[spatial_index]
    # Densify containers that expose toarray() (e.g. sparse matrices); everything
    # else is converted with np.asarray, so plain ndarrays work as well.
    if hasattr(coords, "toarray"):
        coords = coords.toarray()
    points = np.ascontiguousarray(np.asarray(coords)[candidate_mask], dtype=np.float64)
    if points.shape[0] == 0:
        raise ValueError(
            f"No candidate points found in adata.obs['{key}']; nothing to cluster."
        )

    # ==== Run local density clustering ====
    cluster = Cluster(points, fraction, autoplot=False)
    # Presumably assign() also consults this attribute before plotting, so it is
    # set explicitly as well — kept from the original implementation.
    cluster.autoplot = False
    cluster.assign(density_cutoff, delta_cutoff)

    # ==== Identify outliers ====
    # Fresh boolean array: does not alias (and therefore does not overwrite)
    # cluster.border_member.
    cluster.outlier = cluster.density <= outlier_cutoff

    # ==== Plot results ====
    n_dims = points.shape[1]
    if n_dims == 2:
        fig, ax = plt.subplots(1, 2, figsize=(10, 5))
        _draw_decision_graph(ax[0], cluster, outlier_cutoff)

        # Spatial distribution: clustered points coloured by membership, outliers in grey
        kept = ~cluster.outlier
        ax[1].scatter(points[kept, 0], points[kept, 1], s=5, c=cluster.membership[kept], cmap=mpl.cm.tab10)
        ax[1].scatter(points[cluster.outlier, 0], points[cluster.outlier, 1], s=5, c="grey")
        ax[1].invert_yaxis()
        ax[1].set_box_aspect(1)
    elif n_dims == 3:
        fig, ax = plt.subplots(figsize=(5, 5))
        _draw_decision_graph(ax, cluster, outlier_cutoff)

    # ==== Save & Return ====
    if plot_savepath is not None:
        plt.savefig(plot_savepath)
        print(f"Plot saved to: {plot_savepath}")
    else:
        plt.show()

    # Return the cluster object
    return cluster



# [docs]
def LRC_filtered(
    adata: anndata.AnnData, 
    LRC_name: str = None,
    LRC_cluster = None,
    LRC_source: str = "marker",
    copy: bool = False
):
    """
    Assign final LRC (long-range channel) clusters after local density clustering.

    This function uses the cluster assignment results from :func:`mc.pp.LRC_cluster`
    to label candidate LRC points and remove outliers. The output is stored in
    ``adata.obs['LRC_<LRC_name>_<LRC_source>_filtered']``.

    Parameters
    ----------
    adata : anndata.AnnData
        Annotated data matrix (``n_obs × n_var``).
    LRC_name : str
        Name of the long-range channel (e.g. ``'Blood'`` or ``'CSF'``). Required.
    LRC_cluster : pydpc.dpc.Cluster
        The clustering object returned by :func:`mc.pp.LRC_cluster`. Its
        ``membership`` and ``outlier`` attributes are read. Required.
    LRC_source : str, default='marker'
        Type of feature used for LRC identification (included in the key name).
    copy : bool, default=False
        If True, the input object is left untouched and a modified copy is returned.
        Otherwise, modify in place and return None.

    Returns
    -------
    adata : anndata.AnnData or None
        If ``copy=True``, a copy of the AnnData object with a new categorical column
        ``'LRC_<LRC_name>_<LRC_source>_filtered'`` in ``.obs``.
        Cluster numbers indicate LRC cluster IDs (starting from 1),
        while 0 indicates non-LRC or outlier points.
        Returns None if ``copy=False``.

    Raises
    ------
    AssertionError
        If ``LRC_name`` or ``LRC_cluster`` is None.
    KeyError
        If the unfiltered candidate column is missing from ``adata.obs``.
    ValueError
        If the number of candidates in ``adata`` differs from the number of points
        in ``LRC_cluster``.

    Notes
    -----
    This function should be run **after** both :func:`mc.pp.LRC_unfiltered` and :func:`mc.pp.LRC_cluster`. 
    """

    # ==== Validate inputs ====
    assert LRC_name is not None, "Please provide an LRC name."
    assert LRC_cluster is not None, "Please provide LRC_cluster."
    key = f"LRC_{LRC_name}_{LRC_source}_unfiltered"
    if key not in adata.obs.keys():
        raise KeyError(
            "Please run the 'mc.pp.LRC_unfiltered' and 'mc.pp.LRC_cluster' function first"
        )

    # ==== Compute filtered cluster ====
    # Shift membership by one so that 0 is free to mean "non-LRC or outlier"
    cluster_ids = LRC_cluster.membership + 1
    cluster_ids[LRC_cluster.outlier] = 0

    # Build the labels in a private NumPy array and assign the column once.
    # Chained assignment such as ``adata.obs[col][mask] = ...`` writes to a temporary
    # under pandas Copy-on-Write and would silently leave the column unchanged.
    labels = adata.obs[key].astype(int).to_numpy().copy()
    is_candidate = labels == 1
    n_candidates = int(is_candidate.sum())
    if n_candidates != len(cluster_ids):
        raise ValueError(
            f"adata.obs['{key}'] flags {n_candidates} candidate points but LRC_cluster "
            f"contains {len(cluster_ids)}; re-run mc.pp.LRC_cluster on this AnnData."
        )
    labels[is_candidate] = cluster_ids

    # ==== Store results ====
    # Copy before writing so that copy=True never touches the caller's object
    if copy:
        adata = adata.copy()
    key_filtered = f"LRC_{LRC_name}_{LRC_source}_filtered"
    adata.obs[key_filtered] = pd.Categorical(labels)

    print(
        f"Candidate points for {LRC_name} LRC are clustered and outliers are removed. "
        f"LRC points are stored in 'adata.obs['LRC_{LRC_name}_{LRC_source}_filtered']'."
    )

    return adata if copy else None