Source code for deepmol.unsupervised.umap

from typing import Tuple

import numpy as np
import umap

from deepmol.datasets import Dataset
from deepmol.unsupervised.base_unsupervised import UnsupervisedLearn
import plotly.express as px


class UMAP(UnsupervisedLearn):
    """
    Class to perform Uniform Manifold Approximation and Projection (UMAP).

    Wrapper around umap package. (https://github.com/lmcinnes/umap)
    """

    def __init__(self, parametric: bool = True, **kwargs):
        """
        Initialize UMAP.

        Parameters
        ----------
        parametric : bool
            If True, use parametric UMAP.
        kwargs:
            Additional keyword arguments for the UMAP class
            (see https://github.com/lmcinnes/umap). Includes:

            n_neighbors : int
                The size of local neighborhood.
            n_components : int
                The dimension of the space to embed into.
            metric : str
                The metric to use for the computation.
            n_epochs : int
                The number of training epochs to use when optimizing the low
                dimensional embedding.
            learning_rate : float
                The initial learning rate for the embedding optimization.
            low_memory : bool
                If True, use a more memory efficient nearest neighbor
                implementation.
            random_state : int
                The random seed to use.

        Raises
        ------
        ImportError
            If ``parametric`` is True and ``umap.parametric_umap`` cannot be
            imported.
        """
        super().__init__()
        if parametric:
            # Import the submodule explicitly: a package attribute such as
            # ``umap.parametric_umap`` only exists once that submodule has been
            # imported, so relying on ``import umap`` alone can fail with an
            # unhelpful AttributeError instead of a clear ImportError.
            from umap.parametric_umap import ParametricUMAP

            self.umap = ParametricUMAP(**kwargs)
        else:
            self.umap = umap.UMAP(**kwargs)

    @staticmethod
    def _component_names(n_components: int) -> np.ndarray:
        """
        Build the feature names ``UMAP_0 ... UMAP_{n-1}`` for an embedding.

        Parameters
        ----------
        n_components : int
            Number of columns of the embedding.

        Returns
        -------
        feature_names : np.ndarray
            Array with one name per embedding column.
        """
        return np.array([f'UMAP_{i}' for i in range(n_components)])

    def _run_unsupervised(self, dataset: Dataset, **kwargs) -> Tuple[np.ndarray, np.ndarray]:
        """
        Fit UMAP on dataset.X and return the resulting embedding.

        The dataset is stored on the instance so that `plot` can later read its
        mode, labels and label names.

        Parameters
        ----------
        dataset : Dataset
            The dataset to run the unsupervised learning on.
        kwargs:
            Accepted for interface compatibility; not used by this method.

        Returns
        -------
        x_new : np.ndarray
            The new features.
        feature_names : np.ndarray
            The names of the new features.
        """
        self.dataset = dataset
        x_new = self.umap.fit_transform(dataset.X)
        return x_new, self._component_names(x_new.shape[1])

    def plot(self, x_new: np.ndarray, path: str = None, **kwargs) -> None:
        """
        Plot the UMAP embedding.

        Two components are drawn as a 2D scatter plot, three components as a 3D
        scatter plot, and any other number of components as a scatter matrix.
        Points are coloured by the labels of the dataset stored by the last
        fit, so the model must have been fitted before calling this method.

        Parameters
        ----------
        x_new : np.ndarray
            The new features.
        path : str
            The path to save the plot. If None, the plot is only shown.
        kwargs:
            Additional keyword arguments for the plot. They are forwarded to
            the plotly express function used for every number of components.
        """
        n_components = x_new.shape[1]
        self.logger.info(f'{n_components} Components UMAP: ')

        # Class labels are converted to strings so that they are treated as
        # discrete categories rather than as a continuous colour scale.
        if self.dataset.mode == 'classification':
            y = [str(i) for i in self.dataset.y]
        else:
            y = self.dataset.y

        # Axis titles are keyed by the column index (as a string) of x_new.
        labels = {str(i): f"UMAP {i + 1}" for i in range(n_components)}
        labels['color'] = self.dataset.label_names[0]

        if n_components == 2:
            fig = px.scatter(x_new, x=0, y=1, color=y, labels=labels, **kwargs)
        elif n_components == 3:
            # kwargs are forwarded here as well, consistently with the other
            # two branches.
            fig = px.scatter_3d(x_new, x=0, y=1, z=2, color=y, labels=labels, **kwargs)
        else:
            fig = px.scatter_matrix(x_new, color=y, dimensions=range(n_components), labels=labels, **kwargs)
            # Hide the diagonal panels of the scatter matrix.
            fig.update_traces(diagonal_visible=False)

        fig.show()
        if path is not None:
            fig.write_image(path)

    def _fit(self, dataset: Dataset) -> 'UMAP':
        """
        Fit the model with dataset.X.

        Parameters
        ----------
        dataset: Dataset
            The dataset to perform unsupervised learning.

        Returns
        -------
        self: UMAP
            The fitted model.
        """
        self.dataset = dataset
        self.umap.fit(dataset.X)
        return self

    def _transform(self, dataset: Dataset) -> Dataset:
        """
        Apply dimensionality reduction on dataset.X.

        The dataset is modified in place: its features are replaced by the
        embedding and its feature names by ``UMAP_<i>``.

        Parameters
        ----------
        dataset: Dataset
            The dataset to perform unsupervised learning.

        Returns
        -------
        dataset: Dataset
            The transformed dataset (the same object that was passed in).
        """
        embedding = self.umap.transform(dataset.X)
        dataset._X = embedding
        # Name the features from the embedding itself rather than re-reading
        # dataset.X after it has been overwritten.
        dataset.feature_names = self._component_names(embedding.shape[1])
        return dataset