Source code for deepmol.unsupervised.umap

from typing import Tuple

import numpy as np
import umap

from deepmol.datasets import Dataset
from deepmol.unsupervised.base_unsupervised import UnsupervisedLearn
import plotly.express as px


class UMAP(UnsupervisedLearn):
    """
    Class to perform Uniform Manifold Approximation and Projection (UMAP).

    Wrapper around umap package.
    (https://github.com/lmcinnes/umap)
    """

    def __init__(self, parametric: bool = True, **kwargs):
        """
        Initialize UMAP.

        Parameters
        ----------
        parametric : bool
            If True, wrap ``umap.parametric_umap.ParametricUMAP``; otherwise wrap ``umap.UMAP``.
        kwargs:
            Additional keyword arguments forwarded unchanged to the selected UMAP class
            (see https://github.com/lmcinnes/umap). Includes:
            n_neighbors : int
                The size of local neighborhood.
            n_components : int
                The dimension of the space to embed into.
            metric : str
                The metric to use for the computation.
            n_epochs : int
                The number of training epochs to use when optimizing the low dimensional embedding.
            learning_rate : float
                The initial learning rate for the embedding optimization.
            low_memory : bool
                If True, use a more memory efficient nearest neighbor implementation.
            random_state : int
                The random seed to use.
        """
        super().__init__()
        if parametric:
            self.umap = umap.parametric_umap.ParametricUMAP(**kwargs)
        else:
            self.umap = umap.UMAP(**kwargs)

    def _run_unsupervised(self, dataset: Dataset, **kwargs) -> Tuple[np.ndarray, np.ndarray]:
        """
        Fit UMAP on dataset.X and return the resulting embedding.

        The dataset is stored on the instance so that ``plot`` can later read its
        mode, labels and label names.

        Parameters
        ----------
        dataset : Dataset
            The dataset to run the unsupervised learning on.
        kwargs:
            Accepted for interface compatibility; not used by this method.

        Returns
        -------
        x_new : np.ndarray
            The new features (the embedding of dataset.X).
        feature_names : np.ndarray
            The names of the new features (``UMAP_0``, ``UMAP_1``, ...).
        """
        self.dataset = dataset
        x_new = self.umap.fit_transform(dataset.X)
        feature_names = np.array([f'UMAP_{i}' for i in range(x_new.shape[1])])
        return x_new, feature_names

    def plot(self, x_new: np.ndarray, path: str = None, **kwargs) -> None:
        """
        Plot the UMAP embedding.

        Two components are drawn as a 2D scatter plot, three components as a 3D
        scatter plot, and any other number of components as a scatter matrix.
        Points are colored by the labels of the dataset stored by the last
        fit / run, so one of those must have been called before plotting.

        Parameters
        ----------
        x_new : np.ndarray
            The new features.
        path : str
            The path to save the plot. If None, the plot is only shown.
        kwargs:
            Additional keyword arguments for the plot, forwarded to the plotly express function.
        """
        n_components = x_new.shape[1]
        self.logger.info(f'{n_components} Components UMAP: ')

        if self.dataset.mode == 'classification':
            # String labels make plotly use a discrete color scale rather than a continuous one.
            y = [str(i) for i in self.dataset.y]
        else:
            y = self.dataset.y

        # Columns of a bare array are addressed by position, so the label keys are '0', '1', ...
        labels = {str(i): f'UMAP {i + 1}' for i in range(n_components)}
        labels['color'] = self.dataset.label_names[0]

        if n_components == 2:
            fig = px.scatter(x_new, x=0, y=1, color=y, labels=labels, **kwargs)
        elif n_components == 3:
            # kwargs are forwarded here as well, consistently with the other two branches.
            fig = px.scatter_3d(x_new, x=0, y=1, z=2, color=y, labels=labels, **kwargs)
        else:
            fig = px.scatter_matrix(x_new,
                                    color=y,
                                    dimensions=range(n_components),
                                    labels=labels,
                                    **kwargs)
            fig.update_traces(diagonal_visible=False)
        fig.show()
        if path is not None:
            fig.write_image(path)

    def _fit(self, dataset: Dataset) -> 'UMAP':
        """
        Fit the model with dataset.X.

        Parameters
        ----------
        dataset: Dataset
            The dataset to perform unsupervised learning.

        Returns
        -------
        self: UMAP
            The fitted model.
        """
        self.dataset = dataset
        self.umap.fit(dataset.X)
        return self

    def _transform(self, dataset: Dataset) -> Dataset:
        """
        Apply dimensionality reduction on dataset.X.

        The dataset is modified in place: its features are replaced by the UMAP
        embedding and its feature names by ``UMAP_0``, ``UMAP_1``, ...

        Parameters
        ----------
        dataset: Dataset
            The dataset to perform unsupervised learning.

        Returns
        -------
        dataset: Dataset
            The transformed dataset (the same object that was passed in).
        """
        embedding = self.umap.transform(dataset.X)
        dataset._X = embedding
        # Name the features from the embedding itself rather than re-reading dataset.X.
        dataset.feature_names = np.array([f'UMAP_{i}' for i in range(embedding.shape[1])])
        return dataset