Source code for deepmol.datasets.datasets

import uuid
import warnings
from abc import ABC, abstractmethod
from copy import copy, deepcopy
from typing import Union, List, Tuple

import numpy as np
import pandas as pd
from rdkit.Chem import Mol, SDWriter

from deepmol.loggers.logger import Logger
from deepmol.datasets._utils import merge_arrays, merge_arrays_of_arrays
from deepmol.utils.cached_properties import deepmol_cached_property
from deepmol.utils.decorators import inplace_decorator
from deepmol.utils.utils import smiles_to_mol, mol_to_smiles


[docs]class Dataset(ABC): """ Abstract base class for datasets Subclasses need to implement their own methods based on this class. """ def __init__(self): self.logger = Logger()
[docs] def clear_cached_properties(self): """ Clears the cached properties of the class. """ for name in dir(type(self)): if isinstance(getattr(type(self), name), deepmol_cached_property): vars(self).pop(name, None)
@abstractmethod def __len__(self) -> int: """ Get the length of the dataset. It returns the number of molecules in the dataset. """ @property @abstractmethod def smiles(self) -> np.ndarray: """ Get the smiles in the dataset. Returns ------- mols : np.ndarray Molecule smiles in the dataset. """ @smiles.setter @abstractmethod def smiles(self, value: Union[List[str], np.ndarray]) -> None: """ Set the molecules in the dataset. Parameters ---------- value: Union[List[str], np.ndarray] The molecules to set in the dataset. """ @property @abstractmethod def mols(self) -> np.ndarray: """ Get the molecules in the dataset. Returns ------- mols : np.ndarray Molecules in the dataset. """ @property @abstractmethod def removed_elements(self) -> np.ndarray: """ Get the molecules in the dataset. Returns ------- mols : np.ndarray Removed molecules in the dataset. """ @removed_elements.setter @abstractmethod def removed_elements(self, value: Union[List[str], np.ndarray]) -> None: """ Set the molecules in the dataset. Parameters ---------- value: Union[List[str], np.ndarray] The removed elements in the dataset. """ @mols.setter @abstractmethod def mols(self, value: Union[List[str], np.ndarray]) -> None: """ Set the molecules in the dataset. Parameters ---------- value: Union[List[str], np.ndarray] The molecules to set in the dataset. """ @property @abstractmethod def X(self) -> np.ndarray: """ Get the features in the dataset. Returns ------- X: np.ndarray The features in the dataset. """ @property @abstractmethod def y(self) -> np.ndarray: """ Get the labels in the dataset. Returns ------- y: np.ndarray The labels in the dataset. """ @y.setter @abstractmethod def y(self, value: Union[List, np.ndarray]) -> None: """ Set the labels in the dataset. Parameters ---------- value: Union[List, np.ndarray] The labels to set in the dataset. """ @property @abstractmethod def ids(self) -> np.ndarray: """ Get the ids in the dataset. Returns ------- ids: np.ndarray The ids in the dataset. """ @ids.setter @abstractmethod def ids(self, value: Union[List, np.ndarray]) -> None: """ Set the ids in the dataset. Parameters ---------- value: Union[List[str], np.ndarray] The ids to set in the dataset. """ @property @abstractmethod def feature_names(self) -> np.ndarray: """ Get the feature labels of the molecules in the dataset. Returns ------- feature_names: np.ndarray Feature names of the molecules. """ @feature_names.setter @abstractmethod def feature_names(self, value: Union[List, np.ndarray]) -> None: """ Set the feature labels of the molecules in the dataset. Parameters ---------- value: Union[List, np.ndarray] Feature names of the molecules. """ @property @abstractmethod def label_names(self) -> np.ndarray: """ Get the labels names of the dataset. If you have a single task this will be a list of length 1 with the name of the label. If you have a multi-task dataset this will be a list of length n_tasks with the names of the labels. Returns ------- label_names: np.ndarray Label names of the molecules. """ @label_names.setter @abstractmethod def label_names(self, value: Union[List, np.ndarray]) -> None: """ Set the labels names of the dataset. If you have a single task this will be a list of length 1 with the name of the label. If you have a multi-task dataset this will be a list of length n_tasks with the names of the labels. Parameters ---------- value: Union[List, np.ndarray] Label names of the molecules. """ @property @abstractmethod def n_tasks(self) -> int: """ Get the number of tasks in the dataset. Returns ------- n_tasks: int The number of tasks in the dataset. """ @n_tasks.setter @abstractmethod def n_tasks(self, value: int) -> None: """ Set the number of tasks in the dataset. Parameters ---------- value: int The number of tasks in the dataset. """ @property @abstractmethod def mode(self) -> Union[str, List[str]]: """ Get the mode of the dataset. Returns ------- mode: Union[str, List[str]] The mode of the dataset. """ @mode.setter def mode(self, value: Union[str, List[str]]) -> None: """ Set the mode of the dataset. Parameters ---------- value: Union[str, List[str]] The mode of the dataset. """
[docs] @abstractmethod def get_shape(self) -> tuple: """ Get the shape of molecules, features and labels in the dataset. Returns ------- shape: tuple The shape of molecules, features and labels. """
[docs] @abstractmethod def remove_nan(self, axis: int = 0) -> None: """ Remove the nan values from the dataset. Parameters ---------- axis: int The axis to remove the nan values. """
[docs] @abstractmethod def remove_elements(self, indexes: List) -> None: """ Remove the elements from the dataset. Parameters ---------- indexes: List[int] The indexes of the elements to remove. """
[docs] @abstractmethod def select_features_by_index(self, indexes: List[int]) -> 'Dataset': """ Select the features from the dataset. Parameters ---------- indexes: List[int] The indexes of the features to select. """
[docs] @abstractmethod def select_features_by_name(self, names: List[str]) -> None: """ Select features with specific names from the dataset Parameters ---------- names: List[str] The names of the features to select from the dataset. """
[docs] @abstractmethod def select(self, indexes: List[int], axis: int = 0) -> None: """ Select the elements from the dataset. Parameters ---------- indexes: List[int] The indexes of the elements to select. axis: int The axis to select the elements. """
[docs] @abstractmethod def select_to_split(self, indexes: Union[np.ndarray, List[int]]) -> 'Dataset': """ Select the elements from the dataset to split. Parameters ---------- indexes: Union[np.ndarray, List[int]] The indexes of the elements to select. """
[docs]class SmilesDataset(Dataset): """ A Dataset defined by in-memory numpy arrays. This subclass of 'Dataset' stores arrays for smiles strings, Mol objects, features X, labels y, and molecule ids in memory as numpy arrays. """ def __init__(self, smiles: Union[np.ndarray, List[str]], mols: Union[np.ndarray, List[Mol]] = None, ids: Union[List, np.ndarray] = None, X: Union[List, np.ndarray] = None, feature_names: Union[List, np.ndarray] = None, y: Union[List, np.ndarray] = None, label_names: Union[List, np.ndarray] = None, mode: Union[str, List[str]] = 'auto') -> None: """ Initialize a dataset from SMILES strings. Parameters ---------- smiles: Union[np.ndarray, List[str]] SMILES strings of the molecules. mols: Union[np.ndarray, List[Mol]] RDKit Mol objects of the molecules. ids: Union[List, np.ndarray] IDs of the molecules. X: Union[List, np.ndarray] Features of the molecules. feature_names: Union[List, np.ndarray] Names of the features. y: Union[List, np.ndarray] Labels of the molecules. label_names: Union[List, np.ndarray] Names of the labels. If you have a single task this will be a list of length 1 with the name of the label. If you have a multi-task dataset this will be a list of length n_tasks with the names of the labels. mode: Union[str, List[str]] The mode of the dataset. If 'auto', the mode is inferred from the labels. If 'classification', the dataset is treated as a classification dataset. If 'regression', the dataset is treated as a regression dataset. If list, the dataset is treated as a multi-task dataset. """ super().__init__() self._smiles = np.array(smiles) self._ids = np.array([str(i) for i in ids]) if ids is not None \ else np.array([str(uuid.uuid4().hex) for _ in range(len(smiles))]) self._original_ids = copy(self._ids) self._X = np.array(X) if X is not None else None self._y = np.array(y) if y is not None else None self._mols = np.array(mols) if mols is not None else np.array([smiles_to_mol(s) for s in self._smiles]) invalid = [self._ids[i] for i, m in enumerate(self._mols) if m is None] self._removed_elements = [] self.remove_elements(invalid, inplace=True) self._feature_names = np.array(feature_names) if feature_names is not None else None self._label_names = np.array(label_names) if label_names is not None else None self._validate_params() self._n_tasks = len(self._label_names) if self._label_names is not None else 0 self._mode = mode if mode != 'auto' else self._infer_mode() self.logger = Logger()
[docs] @classmethod def from_mols(cls, mols: Union[np.ndarray, List[Mol]], ids: Union[List, np.ndarray] = None, X: Union[List, np.ndarray] = None, feature_names: Union[List, np.ndarray] = None, y: Union[List, np.ndarray] = None, label_names: Union[List, np.ndarray] = None, mode: str = 'auto') -> 'SmilesDataset': """ Initialize a dataset from RDKit Mol objects. Parameters ---------- mols: Union[np.ndarray, List[Mol]] RDKit Mol objects of the molecules. ids: Union[List, np.ndarray] IDs of the molecules. X: Union[List, np.ndarray] Features of the molecules. feature_names: Union[List, np.ndarray] Names of the features. y: Union[List, np.ndarray] Labels of the molecules. label_names: Union[List, np.ndarray] Names of the labels. If you have a single task this will be a list of length 1 with the name of the label. If you have a multi-task dataset this will be a list of length n_tasks with the names of the labels. mode: str The mode of the dataset. If 'auto', the mode is inferred from the labels. If 'classification', the dataset is treated as a classification dataset. If 'regression', the dataset is treated as a regression dataset. If 'multitask', the dataset is treated as a multitask dataset. Returns ------- SmilesDataset The dataset instance. """ smiles = np.array([mol_to_smiles(m) for m in mols]) return cls(smiles, mols, ids, X, feature_names, y, label_names, mode)
def __len__(self) -> int: """ Get the number of molecules in the dataset. Returns ------- int Number of molecules in the dataset. """ return len(self._smiles) def _validate_params(self) -> None: """ Validates the parameters of the dataset. """ if len(self._smiles) != len(self._ids): raise ValueError('Length of smiles and ids must be the same.') if self._X is not None and len(self._smiles) != len(self._X): raise ValueError('Length of smiles and X must be the same.') if self._y is not None and len(self._smiles) != len(self._y): raise ValueError('Length of smiles and y must be the same.') if self._feature_names is not None and self._X is not None: if len(self._X.shape) == 1: if len(self._feature_names) != 1: raise ValueError('Length of feature_names and X must be the same.') elif len(self._X.shape) == 2: if len(self._feature_names) != self._X.shape[1]: raise ValueError('Length of feature_names and X must be the same.') if self._feature_names is None and self._X is not None: if len(self._X.shape) == 1: self._feature_names = np.array(['feature_0']) elif len(self._X.shape) == 2: self._feature_names = np.array([f'feature_{i}' for i in range(self._X.shape[1])]) if self._label_names is not None and self._y is not None: if len(self._y.shape) == 1: if len(self._label_names) != 1: raise ValueError('Length of label_names and y must be the same.') elif len(self._y.shape) == 2: if len(self._label_names) != self._y.shape[1]: raise ValueError('Length of label_names and y must be the same.') if self._label_names is None and self._y is not None: if len(self._y.shape) == 1: self._label_names = np.array(['y']) elif len(self._y.shape) == 2: self._label_names = np.array([f'y_{i}' for i in range(self._y.shape[1])]) def _reset(self, smiles: Union[np.ndarray, List[str]]) -> None: """ Resets the dataset. Changes the smiles and updates the mols, ids, X and y. Parameters ---------- smiles: Union[np.ndarray, List[str]] SMILES strings of the new molecules. """ super().__init__() self._smiles = np.array(smiles) self._ids = np.array([str(uuid.uuid4().hex) for _ in range(len(smiles))]) self._original_ids = copy(self._ids) self._X = None self._y = None self._n_tasks = None self._removed_elements = [] self._mols = np.array([smiles_to_mol(s) for s in self._smiles]) self.remove_elements([self._ids[i] for i, m in enumerate(self._mols) if m is None], inplace=True) self._feature_names = None self._label_names = None self.mode = None def _infer_mode(self) -> Union[str, None, List[str]]: """ Infers the mode of the dataset. Returns ------- str The inferred mode. """ if self._y is None: return None if len(self._y.shape) > 1: self.logger.info("Assuming multitask since y has more than one dimension. If otherwise, explicitly set the " "mode to 'classification' or 'regression'!") labels_per_task = [] for label in range(self._y.shape[1]): label_i = self._y[:, label] classes = np.all(np.isclose(label_i, np.round(label_i), equal_nan=True)) if classes: labels_per_task.append('classification') else: labels_per_task.append('regression') return labels_per_task classes = np.all(np.isclose(self.y, np.round(self.y), equal_nan=True)) if not classes: self.logger.info("Assuming regression since there are more than 10 unique y values. If otherwise, " "explicitly set the mode to 'classification'!") return 'regression' else: self.logger.info("Assuming classification since there are less than 10 unique y values. If otherwise, " "explicitly set the mode to 'regression'!") return 'classification' @property def smiles(self) -> np.ndarray: """ Get the SMILES strings of the molecules in the dataset. Returns ------- np.ndarray SMILES strings of the molecules in the dataset. """ return self._smiles @smiles.setter def smiles(self, smiles: Union[np.ndarray, List[str]]) -> None: """ Set the SMILES strings of the molecules in the dataset. Parameters ---------- smiles: Union[np.ndarray, List[str]] SMILES strings of the molecules. """ warnings.warn('The RDKit Mol objects of the dataset will be updated, IDs updated and X and y deleted.') self._reset(smiles) @property def mols(self) -> np.ndarray: """ Get the RDKit Mol objects of the molecules in the dataset. Returns ------- np.ndarray RDKit molecules of the molecules in the dataset. """ return self._mols @property def feature_names(self) -> np.ndarray: """ Get the feature labels of the molecules in the dataset. Returns ------- np.ndarray Feature names of the molecules in the dataset. """ return self._feature_names @feature_names.setter def feature_names(self, feature_names: Union[List, np.ndarray]) -> None: """ Set the feature labels of the molecules in the dataset. Parameters ---------- feature_names: Union[List, np.ndarray] Feature names of the molecules. """ if self._X is None: raise ValueError('The features must be set before setting the feature names.') if len(self._X.shape) == 1: if len(feature_names) != 1: raise ValueError('The number of feature names must be equal to the number of features.') elif len(self._X.shape) == 2: if len(feature_names) != len(self._X[0]): raise ValueError('The number of feature names must be equal to the number of features.') elif len(self._X.shape) == 3: if len(feature_names) != len(self._X[0][0]): raise ValueError('The number of feature names must be equal to the number of features.') elif len(self._X.shape) == 4: if len(feature_names) != len(self._X[0][0]): # SmileImageFeat raise ValueError('The number of feature names must be equal to the number of features.') else: raise ValueError('The number of dimensions of X must be 1, 2 or 3.') if len(feature_names) != len(set(feature_names)): raise ValueError('The feature names must be unique.') self._feature_names = np.array([str(fn) for fn in feature_names]) @property def label_names(self) -> np.ndarray: """ Get the label names of the molecules in the dataset. If you have a single task this will be a list of length 1 with the name of the label. If you have a multi-task dataset this will be a list of length n_tasks with the names of the labels. Returns ------- np.ndarray Label names in the dataset. """ return self._label_names @label_names.setter def label_names(self, label_names: Union[List, np.ndarray]) -> None: """ Set the label names of the dataset. If you have a single task this will be a list of length 1 with the name of the label. If you have a multi-task dataset this will be a list of length n_tasks with the names of the labels. Parameters ---------- label_names: Union[List, np.ndarray] Label names of the dataset. """ if self._y is None: raise ValueError('The labels must be set before setting the label names.') if len(self._y.shape) == 1: if len(label_names) != 1: raise ValueError('The number of label names must be equal to the number of labels.') else: if len(label_names) != len(self._y[0]): raise ValueError('The number of label names must be equal to the number of labels.') if len(label_names) != len(set(label_names)): raise ValueError('The label names must be unique.') self._label_names = np.array([str(ln) for ln in label_names]) @deepmol_cached_property def X(self) -> np.ndarray: """ Get the features of the molecules in the dataset. Returns ------- np.ndarray Features of the molecules in the dataset. """ return self._X @property def y(self) -> np.ndarray: """ Get the labels of the molecules in the dataset. Returns ------- np.ndarray Labels of the molecules in the dataset. """ return self._y @property def ids(self) -> np.ndarray: """ Get the IDs of the molecules in the dataset. Returns ------- np.ndarray IDs of the molecules in the dataset. """ return self._ids @ids.setter def ids(self, ids: Union[List, np.ndarray]) -> None: """ Set the IDs of the molecules in the dataset. Parameters ---------- ids: Union[List, np.ndarray] IDs of the molecules. """ if len(ids) != len(self._smiles): raise ValueError('The number of IDs must be equal to the number of molecules.') if len(ids) != len(np.unique(ids)): raise ValueError('The IDs must be unique.') self._ids = np.array([str(idx) for idx in ids]) @property def n_tasks(self) -> int: """ Get the number of tasks in the dataset. Returns ------- n_tasks: int The number of tasks in the dataset. """ return self._n_tasks @property def mode(self) -> Union[str, List[str]]: """ Get the mode of the dataset. Returns ------- mode: Union[str, List[str]] The mode of the dataset. """ return self._mode @mode.setter def mode(self, mode: Union[str, List[str]]) -> None: """ Set the mode of the dataset. Parameters ---------- mode: Union[str, List[str]] The mode of the dataset. """ if not isinstance(mode, list): if mode not in ['classification', 'regression', 'multilabel', None]: raise ValueError('The mode must be either "classification" or "regression".') else: for m in mode: if m not in ['classification', 'regression', 'multilabel', None]: raise ValueError('The mode must be either "classification" or "regression".') self._mode = mode
[docs] def get_shape(self) -> Tuple[Tuple, Union[Tuple, None], Union[Tuple, None]]: """ Get the shape of the dataset. Returns three tuples, giving the shape of the smiles, X and y arrays. Returns ------- smiles_shape: Tuple The shape of the mols array. X_shape: Union[Tuple, None] The shape of the X array. y_shape: Union[Tuple, None] The shape of the y array. """ smiles_shape = self._smiles.shape self.logger.info(f'Mols_shape: {smiles_shape}') x_shape = self._X.shape if self._X is not None else None self.logger.info(f'Features_shape: {x_shape}') y_shape = self._y.shape if self._y is not None else None self.logger.info(f'Labels_shape: {y_shape}') return smiles_shape, x_shape, y_shape
@inplace_decorator def remove_duplicates(self) -> None: """ Remove molecules with duplicated features from the dataset. Parameters ---------- inplace: bool, optional (default False) If True, the dataset will be modified in place. """ if self._X is not None: if np.isnan(np.stack(self._X)).any(): warnings.warn('The dataset contains NaNs. Molecules with NaNs will be ignored.') unique, index = np.unique(self.X, return_index=True, axis=0) ids = self.ids[index] self.select(ids, axis=0, inplace=True) @property def removed_elements(self) -> np.ndarray: """ Get the molecules in the dataset. Returns ------- mols : np.ndarray Removed molecules in the dataset. """ return self._removed_elements @removed_elements.setter def removed_elements(self, value: Union[List[str], np.ndarray]) -> None: """ Set the molecules in the dataset. Parameters ---------- value: Union[List[str], np.ndarray] The removed elements in the dataset. """ self._removed_elements = value @inplace_decorator def remove_elements(self, ids: List[str]) -> None: """ Remove elements with specific IDs from the dataset. Parameters ---------- ids: List[str] IDs of the elements to remove. inplace: bool, optional (default False) If True, the dataset will be modified in place. """ if len(ids) != 0: all_indexes = self.ids positions = np.where(np.isin(self._original_ids, list(set(ids))))[0] self.removed_elements.extend(list(positions)) indexes_to_keep = list(set(all_indexes) - set(ids)) self.select(indexes_to_keep, inplace=True) @inplace_decorator def remove_elements_by_index(self, indexes: List[int]) -> None: """ Remove elements with specific indexes from the dataset. Parameters ---------- indexes: List[int] Indexes of the elements to remove. inplace: bool, optional (default False) If True, the dataset will be modified in place. """ if len(indexes) > 0: indexes = self._ids[indexes] self.remove_elements(indexes, inplace=True) @inplace_decorator def select_features_by_index(self, indexes: List[int]) -> 'SmilesDataset': """ Select features with specific indexes from the dataset Parameters ---------- indexes: List[int] The indexes of the features to select from the dataset. inplace: bool, optional (default False) If True, the dataset will be modified in place. """ if len(indexes) != 0: self.select(indexes, axis=1, inplace=True) self.clear_cached_properties() return self else: return self @inplace_decorator def select_features_by_name(self, names: List[str]) -> None: """ Select features with specific names from the dataset Parameters ---------- names: List[str] The names of the features to select from the dataset. inplace: bool, optional (default False) If True, the dataset will be modified in place. """ if len(names) != 0: # Get the indexes of the features to select indexes = [i for i, name in enumerate(self._feature_names) if name in names] self.select(indexes, axis=1, inplace=True) self.clear_cached_properties() @inplace_decorator def remove_nan(self, axis: int = 0) -> None: """ Remove samples with at least one NaN in the features (when axis = 0) Or remove samples with all features with NaNs and the features with at least one NaN (axis = 1) Parameters ---------- axis: int The axis to remove the NaNs from. inplace: bool, optional (default False) If True, the dataset will be modified in place. """ if self._X is None or len(self._X.shape) == 0: return if axis == 0: if len(self._X.shape) == 1: indexes = np.where(pd.isna(self._X))[0] else: indexes = np.where(pd.isna(self._X).any(axis=1))[0] # rows with at least one NaN self.remove_elements_by_index(indexes, inplace=True) elif axis == 1: if len(self._X.shape) == 1: indexes = np.where(np.isnan(self._X))[0] self.remove_elements_by_index(indexes, inplace=True) else: # rows with all NaNs indexes = np.where(np.isnan(self._X).all(axis=1))[0] self.remove_elements_by_index(indexes, inplace=True) # columns with at least one NaN columns = list(set(np.where(np.isnan(self._X).any(axis=0))[0])) self._X = np.delete(self._X, columns, axis=1) if len(self._X.shape) <= 2: # feature names in datasets with more than two dimensions not supported feature_names_to_delete = [self._feature_names[i] for i in columns] self._feature_names = [name for name in self._feature_names if name not in feature_names_to_delete] self.clear_cached_properties() else: raise ValueError('The axis must be 0 or 1.')
[docs] def select_to_split(self, indexes: Union[np.ndarray, List[int]]) -> 'SmilesDataset': """ Select elements with specific indexes to split the dataset Parameters ---------- indexes: Union[np.ndarray, List[int]] The indexes of the elements to split the dataset. Returns ------- SmilesDataset The dataset with the selected elements. """ smiles = self._smiles[indexes] mols = self._mols[indexes] X = self._X[indexes] if self._X is not None else None y = self._y[indexes] if self._y is not None else None ids = self._ids[indexes] feature_names = self._feature_names label_names = self._label_names mode = self._mode return SmilesDataset(smiles, mols, ids, X, feature_names, y, label_names, mode)
@inplace_decorator def select(self, ids: Union[List[str], List[int]], axis: int = 0) -> None: """ Creates a new sub dataset of self from a selection of indexes. Parameters ---------- ids: Union[List[str], List[int]] List of ids/indexes to select.IDs of the compounds in case axis = 0, indexes of the columns in case axis = 1. axis: int Axis to select along. 0 selects along the first axis, 1 selects along the second axis. inplace: bool, optional (default False) If True, the dataset will be modified in place. """ if axis == 0: ids_to_delete = sorted(list(set(self._ids) - set(ids))) raw_indexes = [i for i, mol_index in enumerate(self._ids) if mol_index in ids_to_delete] self._smiles = np.delete(self._smiles, raw_indexes, axis) self._mols = np.delete(self._mols, raw_indexes, axis) self._y = np.delete(self._y, raw_indexes, axis) if self._y is not None else self._y self._X = np.delete(self._X, raw_indexes, axis) if self._X is not None else self._X self._ids = np.delete(self._ids, raw_indexes, axis) elif axis == 1: if self._X is None or len(self._X.shape) == 0: raise ValueError('Dataset has no features.') if len(self._X.shape) == 1: pass else: indexes_to_delete = list(set(np.arange(self._X.shape[1])) - set(ids)) self._X = np.delete(self.X, indexes_to_delete, axis=1) if len(self._X.shape) <= 2: # feature names in datasets with more than two dimensions not supported feature_names_to_delete = [self._feature_names[i] for i in indexes_to_delete] self._feature_names = [name for name in self._feature_names if name not in feature_names_to_delete] else: raise ValueError('The axis must be 0 or 1.')
[docs] def merge(self, datasets: List[Dataset]) -> 'SmilesDataset': """ Merges provided datasets with the self dataset. Parameters ---------- datasets: List[Dataset] List of datasets to merge. Returns ------- NumpyDataset A merged NumpyDataset. """ datasets = list(datasets) X = self._X y = self._y ids = self._ids mols = self._mols smiles = self._smiles feature_names = self._feature_names label_names = self._label_names mode = self._mode for ds in datasets: ids = merge_arrays(ids, len(mols), ds.ids, len(ds.mols)) if len(set(ids)) != len(ids): raise ValueError(f'IDs must be unique! IDs are {ids}') y = merge_arrays(y, len(mols), ds.y, len(ds.mols)) if X is None or ds.X is None: self.logger.error('Features are not the same length/type... Recalculate features for all inputs!') X = None elif len(X.shape) == 1 and len(ds.X.shape) == 1: X = merge_arrays(X, len(mols), ds.X, len(ds.mols)) else: X = merge_arrays_of_arrays(X, ds.X) mols = np.append(mols, ds.mols, axis=0) smiles = np.append(smiles, ds.smiles, axis=0) return SmilesDataset(smiles, mols, ids, X, feature_names, y, label_names, mode)
[docs] def to_dataframe(self): """ Convert data into dataframe """ df = pd.DataFrame() df['ids'] = pd.Series(self._ids) df['smiles'] = pd.Series(self._smiles) if self._y is not None: label_names = self._label_names df_y = pd.DataFrame(self._y, columns=label_names) df = pd.concat([df, df_y], axis=1) if self._X is not None: columns_names = self._feature_names df_x = pd.DataFrame(self._X, columns=columns_names) df = pd.concat([df, df_x], axis=1) return df
[docs] def to_csv(self, path: str, **kwargs) -> None: """ Save the dataset to a csv file. Parameters ---------- path: str Path to save the csv file. """ df = self.to_dataframe() df.to_csv(path, **kwargs)
[docs] def to_sdf(self, path: str) -> None: """ Save the dataset to a sdf file. Parameters ---------- path: str Path to save the sdf file. """ mol_set = self.mols writer = SDWriter(path) for i, mol in enumerate(mol_set): if self.y is not None and self.y.size > 0: if len(self.y.shape) > 1 and self.y.shape[1] > 1: label = self.y[i, :] for j, class_name in enumerate(self.label_names): mol.SetProp(class_name, "%f" % label[j]) elif len(self.y.shape) > 1 and self.y.shape[1] == 1: class_name = self.label_names[0] label = self.y[i, 0] mol.SetProp(class_name, "%f" % label) else: class_name = self.label_names[0] label = self.y[i] mol.SetProp(class_name, "%f" % label) if self.ids is not None and self.ids.size > 0: mol_id = self.ids[i] mol.SetProp("_ID", f"{mol_id}") writer.write(mol) writer.close()
@inplace_decorator def load_features(self, path: str, **kwargs) -> None: """ Load features from a csv file. Parameters ---------- path: str Path to the csv file. inplace: bool, optional (default False) If True, the dataset will be modified in place. kwargs: Keyword arguments to pass to pandas.read_csv. """ df = pd.read_csv(path, **kwargs) self._X = df.to_numpy()
[docs] def save_features(self, path: str = 'features.csv') -> None: """ Save the features to a csv file. Parameters ---------- path: str Path to save the csv file. """ if self.X is not None: columns_names = self._feature_names df = pd.DataFrame(self._X, columns=columns_names) df.to_csv(path, index=False) else: raise ValueError('Features array is empty!')