Source code for deepmol.datasets.datasets

import uuid
import warnings
from abc import ABC, abstractmethod
from copy import copy, deepcopy
from typing import Union, List, Tuple

import numpy as np
import pandas as pd
from rdkit.Chem import Mol, SDWriter

from deepmol.loggers.logger import Logger
from deepmol.datasets._utils import merge_arrays, merge_arrays_of_arrays
from deepmol.utils.cached_properties import deepmol_cached_property
from deepmol.utils.decorators import inplace_decorator
from deepmol.utils.utils import smiles_to_mol, mol_to_smiles


class Dataset(ABC):
    """
    Abstract base class for datasets.

    Subclasses need to implement their own methods based on this class.
    """

    def __init__(self):
        self.logger = Logger()

    def clear_cached_properties(self):
        """
        Clears the cached properties of the class.

        For every attribute declared on the class as a ``deepmol_cached_property``, the entry with the same name is
        dropped from the instance ``__dict__`` (nothing happens when it is absent).
        """
        for name in dir(type(self)):
            if isinstance(getattr(type(self), name), deepmol_cached_property):
                vars(self).pop(name, None)

    @abstractmethod
    def __len__(self) -> int:
        """
        Get the length of the dataset.

        Returns
        -------
        int
            The number of molecules in the dataset.
        """

    @property
    @abstractmethod
    def smiles(self) -> np.ndarray:
        """
        Get the smiles in the dataset.

        Returns
        -------
        smiles : np.ndarray
            Molecule smiles in the dataset.
        """

    @smiles.setter
    @abstractmethod
    def smiles(self, value: Union[List[str], np.ndarray]) -> None:
        """
        Set the smiles in the dataset.

        Parameters
        ----------
        value: Union[List[str], np.ndarray]
            The smiles to set in the dataset.
        """

    @property
    @abstractmethod
    def mols(self) -> np.ndarray:
        """
        Get the molecules in the dataset.

        Returns
        -------
        mols : np.ndarray
            Molecules in the dataset.
        """

    @mols.setter
    @abstractmethod
    def mols(self, value: Union[List[str], np.ndarray]) -> None:
        """
        Set the molecules in the dataset.

        Parameters
        ----------
        value: Union[List[str], np.ndarray]
            The molecules to set in the dataset.
        """

    @property
    @abstractmethod
    def removed_elements(self) -> np.ndarray:
        """
        Get the elements that were removed from the dataset.

        Returns
        -------
        removed_elements : np.ndarray
            Removed elements of the dataset.
        """

    @removed_elements.setter
    @abstractmethod
    def removed_elements(self, value: Union[List[str], np.ndarray]) -> None:
        """
        Set the elements that were removed from the dataset.

        Parameters
        ----------
        value: Union[List[str], np.ndarray]
            The removed elements in the dataset.
        """

    @property
    @abstractmethod
    def X(self) -> np.ndarray:
        """
        Get the features in the dataset.

        Returns
        -------
        X: np.ndarray
            The features in the dataset.
        """

    @property
    @abstractmethod
    def y(self) -> np.ndarray:
        """
        Get the labels in the dataset.

        Returns
        -------
        y: np.ndarray
            The labels in the dataset.
        """

    @y.setter
    @abstractmethod
    def y(self, value: Union[List, np.ndarray]) -> None:
        """
        Set the labels in the dataset.

        Parameters
        ----------
        value: Union[List, np.ndarray]
            The labels to set in the dataset.
        """

    @property
    @abstractmethod
    def ids(self) -> np.ndarray:
        """
        Get the ids in the dataset.

        Returns
        -------
        ids: np.ndarray
            The ids in the dataset.
        """

    @ids.setter
    @abstractmethod
    def ids(self, value: Union[List, np.ndarray]) -> None:
        """
        Set the ids in the dataset.

        Parameters
        ----------
        value: Union[List, np.ndarray]
            The ids to set in the dataset.
        """

    @property
    @abstractmethod
    def feature_names(self) -> np.ndarray:
        """
        Get the feature labels of the molecules in the dataset.

        Returns
        -------
        feature_names: np.ndarray
            Feature names of the molecules.
        """

    @feature_names.setter
    @abstractmethod
    def feature_names(self, value: Union[List, np.ndarray]) -> None:
        """
        Set the feature labels of the molecules in the dataset.

        Parameters
        ----------
        value: Union[List, np.ndarray]
            Feature names of the molecules.
        """

    @property
    @abstractmethod
    def label_names(self) -> np.ndarray:
        """
        Get the labels names of the dataset.
        If you have a single task this will be a list of length 1 with the name of the label.
        If you have a multi-task dataset this will be a list of length n_tasks with the names of the labels.

        Returns
        -------
        label_names: np.ndarray
            Label names of the molecules.
        """

    @label_names.setter
    @abstractmethod
    def label_names(self, value: Union[List, np.ndarray]) -> None:
        """
        Set the labels names of the dataset.
        If you have a single task this will be a list of length 1 with the name of the label.
        If you have a multi-task dataset this will be a list of length n_tasks with the names of the labels.

        Parameters
        ----------
        value: Union[List, np.ndarray]
            Label names of the molecules.
        """

    @property
    @abstractmethod
    def n_tasks(self) -> int:
        """
        Get the number of tasks in the dataset.

        Returns
        -------
        n_tasks: int
            The number of tasks in the dataset.
        """

    @n_tasks.setter
    @abstractmethod
    def n_tasks(self, value: int) -> None:
        """
        Set the number of tasks in the dataset.

        Parameters
        ----------
        value: int
            The number of tasks in the dataset.
        """

    @property
    @abstractmethod
    def mode(self) -> Union[str, List[str]]:
        """
        Get the mode of the dataset.

        Returns
        -------
        mode: Union[str, List[str]]
            The mode of the dataset.
        """

    # NOTE(review): unlike every other setter of this class, this one is not decorated with @abstractmethod.
    # It is left as is so that existing subclasses keep working - confirm whether that is intended.
    @mode.setter
    def mode(self, value: Union[str, List[str]]) -> None:
        """
        Set the mode of the dataset.

        Parameters
        ----------
        value: Union[str, List[str]]
            The mode of the dataset.
        """

    @abstractmethod
    def get_shape(self) -> tuple:
        """
        Get the shape of molecules, features and labels in the dataset.

        Returns
        -------
        shape: tuple
            The shape of molecules, features and labels.
        """

    @abstractmethod
    def remove_nan(self, axis: int = 0) -> None:
        """
        Remove the nan values from the dataset.

        Parameters
        ----------
        axis: int
            The axis to remove the nan values.
        """

    @abstractmethod
    def remove_elements(self, indexes: List) -> None:
        """
        Remove the elements from the dataset.

        Parameters
        ----------
        indexes: List
            The indexes of the elements to remove.
        """

    @abstractmethod
    def select_features_by_index(self, indexes: List[int]) -> 'Dataset':
        """
        Select the features from the dataset.

        Parameters
        ----------
        indexes: List[int]
            The indexes of the features to select.
        """

    @abstractmethod
    def select_features_by_name(self, names: List[str]) -> None:
        """
        Select features with specific names from the dataset.

        Parameters
        ----------
        names: List[str]
            The names of the features to select from the dataset.
        """

    @abstractmethod
    def select(self, indexes: List[int], axis: int = 0) -> None:
        """
        Select the elements from the dataset.

        Parameters
        ----------
        indexes: List[int]
            The indexes of the elements to select.
        axis: int
            The axis to select the elements.
        """

    @abstractmethod
    def select_to_split(self, indexes: Union[np.ndarray, List[int]]) -> 'Dataset':
        """
        Select the elements from the dataset to split.

        Parameters
        ----------
        indexes: Union[np.ndarray, List[int]]
            The indexes of the elements to select.
        """


[docs]class SmilesDataset(Dataset):
    """
    A Dataset defined by in-memory numpy arrays.
    This subclass of 'Dataset' stores arrays for smiles strings, Mol objects, features X, labels y, and molecule ids in
    memory as numpy arrays.
    """

    def __init__(self,
                 smiles: Union[np.ndarray, List[str]],
                 mols: Union[np.ndarray, List[Mol]] = None,
                 ids: Union[List, np.ndarray] = None,
                 X: Union[List, np.ndarray] = None,
                 feature_names: Union[List, np.ndarray] = None,
                 y: Union[List, np.ndarray] = None,
                 label_names: Union[List, np.ndarray] = None,
                 mode: Union[str, List[str]] = 'auto') -> None:
        """
        Build a dataset out of SMILES strings.

        Molecules whose Mol object is None (given as such or produced by ``smiles_to_mol``) are removed from the
        dataset before the remaining parameters are validated.

        Parameters
        ----------
        smiles: Union[np.ndarray, List[str]]
            SMILES strings of the molecules.
        mols: Union[np.ndarray, List[Mol]]
            RDKit Mol objects of the molecules. When None, they are created from the SMILES strings.
        ids: Union[List, np.ndarray]
            IDs of the molecules (stored as strings). When None, a random hexadecimal UUID is generated per molecule.
        X: Union[List, np.ndarray]
            Features of the molecules.
        feature_names: Union[List, np.ndarray]
            Names of the features.
        y: Union[List, np.ndarray]
            Labels of the molecules.
        label_names: Union[List, np.ndarray]
            Names of the labels. If you have a single task this will be a list of length 1 with the name of the label.
            If you have a multi-task dataset this will be a list of length n_tasks with the names of the labels.
        mode: Union[str, List[str]]
            The mode of the dataset.
            If 'auto', the mode is inferred from the labels. If 'classification', the dataset is treated as a
            classification dataset. If 'regression', the dataset is treated as a regression dataset. If list, the
            dataset is treated as a multi-task dataset.
        """
        super().__init__()
        self._smiles = np.array(smiles)

        # identifiers are always kept as strings
        if ids is None:
            generated_ids = [str(uuid.uuid4().hex) for _ in range(len(smiles))]
            self._ids = np.array(generated_ids)
        else:
            self._ids = np.array([str(identifier) for identifier in ids])
        # snapshot of the ids as given at construction time, taken before any element is removed
        self._original_ids = copy(self._ids)

        self._X = None if X is None else np.array(X)
        self._y = None if y is None else np.array(y)

        if mols is None:
            self._mols = np.array([smiles_to_mol(smi) for smi in self._smiles])
        else:
            self._mols = np.array(mols)

        # drop every molecule without a valid Mol object
        self._removed_elements = []
        ids_without_mol = [self._ids[position] for position, mol in enumerate(self._mols) if mol is None]
        self.remove_elements(ids_without_mol, inplace=True)

        self._feature_names = None if feature_names is None else np.array(feature_names)
        self._label_names = None if label_names is None else np.array(label_names)
        # may fill in default feature/label names, hence it runs before n_tasks is computed
        self._validate_params()
        self._n_tasks = 0 if self._label_names is None else len(self._label_names)

        if mode != 'auto':
            self._mode = mode
        else:
            self._mode = self._infer_mode()
        self.logger = Logger()

[docs]    @classmethod
    def from_mols(cls,
                  mols: Union[np.ndarray, List[Mol]],
                  ids: Union[List, np.ndarray] = None,
                  X: Union[List, np.ndarray] = None,
                  feature_names: Union[List, np.ndarray] = None,
                  y: Union[List, np.ndarray] = None,
                  label_names: Union[List, np.ndarray] = None,
                  mode: str = 'auto') -> 'SmilesDataset':
        """
        Initialize a dataset from RDKit Mol objects.

        Parameters
        ----------
        mols: Union[np.ndarray, List[Mol]]
            RDKit Mol objects of the molecules.
        ids: Union[List, np.ndarray]
            IDs of the molecules.
        X: Union[List, np.ndarray]
            Features of the molecules.
        feature_names: Union[List, np.ndarray]
            Names of the features.
        y: Union[List, np.ndarray]
            Labels of the molecules.
        label_names: Union[List, np.ndarray]
            Names of the labels. If you have a single task this will be a list of length 1 with the name of the label.
            If you have a multi-task dataset this will be a list of length n_tasks with the names of the labels.
        mode: str
            The mode of the dataset.
            If 'auto', the mode is inferred from the labels. If 'classification', the dataset is treated as a
            classification dataset. If 'regression', the dataset is treated as a regression dataset. If 'multitask',
            the dataset is treated as a multitask dataset.

        Returns
        -------
        SmilesDataset
            The dataset instance.
        """
        smiles = np.array([mol_to_smiles(m) for m in mols])
        return cls(smiles, mols, ids, X, feature_names, y, label_names, mode)

    def __len__(self) -> int:
        """
        Get the number of molecules in the dataset.
        Returns
        -------
        int
            Number of molecules in the dataset.
        """
        return len(self._smiles)

    def _validate_params(self) -> None:
        """
        Validates the parameters of the dataset.
        """
        if len(self._smiles) != len(self._ids):
            raise ValueError('Length of smiles and ids must be the same.')
        if self._X is not None and len(self._smiles) != len(self._X):
            raise ValueError('Length of smiles and X must be the same.')
        if self._y is not None and len(self._smiles) != len(self._y):
            raise ValueError('Length of smiles and y must be the same.')
        if self._feature_names is not None and self._X is not None:
            if len(self._X.shape) == 1:
                if len(self._feature_names) != 1:
                    raise ValueError('Length of feature_names and X must be the same.')
            elif len(self._X.shape) == 2:
                if len(self._feature_names) != self._X.shape[1]:
                    raise ValueError('Length of feature_names and X must be the same.')
        if self._feature_names is None and self._X is not None:
            if len(self._X.shape) == 1:
                self._feature_names = np.array(['feature_0'])
            elif len(self._X.shape) == 2:
                self._feature_names = np.array([f'feature_{i}' for i in range(self._X.shape[1])])
        if self._label_names is not None and self._y is not None:
            if len(self._y.shape) == 1:
                if len(self._label_names) != 1:
                    raise ValueError('Length of label_names and y must be the same.')
            elif len(self._y.shape) == 2:
                if len(self._label_names) != self._y.shape[1]:
                    raise ValueError('Length of label_names and y must be the same.')
        if self._label_names is None and self._y is not None:
            if len(self._y.shape) == 1:
                self._label_names = np.array(['y'])
            elif len(self._y.shape) == 2:
                self._label_names = np.array([f'y_{i}' for i in range(self._y.shape[1])])

    def _reset(self, smiles: Union[np.ndarray, List[str]]) -> None:
        """
        Resets the dataset.

        Changes the smiles and rebuilds the mols and ids from them; X, y, the feature/label names, the number of
        tasks and the mode are discarded. Molecules for which ``smiles_to_mol`` returns None are removed.

        Parameters
        ----------
        smiles: Union[np.ndarray, List[str]]
            SMILES strings of the new molecules.
        """
        super().__init__()
        self._smiles = np.array(smiles)
        self._ids = np.array([str(uuid.uuid4().hex) for _ in range(len(smiles))])
        self._original_ids = copy(self._ids)
        self._X = None
        self._y = None
        self._n_tasks = None
        self._removed_elements = []
        # The features were just discarded: drop cached property values as well (as the other feature-modifying
        # methods of this class do) so that `X` does not keep serving the previous array.
        self.clear_cached_properties()
        self._mols = np.array([smiles_to_mol(s) for s in self._smiles])
        self.remove_elements([self._ids[i] for i, m in enumerate(self._mols) if m is None], inplace=True)
        self._feature_names = None
        self._label_names = None
        self.mode = None

    def _infer_mode(self) -> Union[str, None, List[str]]:
        """
        Infers the mode of the dataset.

        Returns
        -------
        str
            The inferred mode.
        """
        if self._y is None:
            return None
        if len(self._y.shape) > 1:
            self.logger.info("Assuming multitask since y has more than one dimension. If otherwise, explicitly set the "
                             "mode to 'classification' or 'regression'!")
            labels_per_task = []
            for label in range(self._y.shape[1]):
                label_i = self._y[:, label]
                classes = np.all(np.isclose(label_i, np.round(label_i), equal_nan=True))
                if classes:
                    labels_per_task.append('classification')
                else:
                    labels_per_task.append('regression')

            return labels_per_task

        classes = np.all(np.isclose(self.y, np.round(self.y), equal_nan=True))
        if not classes:
            self.logger.info("Assuming regression since there are more than 10 unique y values. If otherwise, "
                             "explicitly set the mode to 'classification'!")
            return 'regression'
        else:
            self.logger.info("Assuming classification since there are less than 10 unique y values. If otherwise, "
                             "explicitly set the mode to 'regression'!")
            return 'classification'

    @property
    def smiles(self) -> np.ndarray:
        """
        Get the SMILES strings of the molecules in the dataset.
        Returns
        -------
        np.ndarray
            SMILES strings of the molecules in the dataset.
        """
        return self._smiles

    @smiles.setter
    def smiles(self, smiles: Union[np.ndarray, List[str]]) -> None:
        """
        Set the SMILES strings of the molecules in the dataset.
        Parameters
        ----------
        smiles: Union[np.ndarray, List[str]]
            SMILES strings of the molecules.
        """
        warnings.warn('The RDKit Mol objects of the dataset will be updated, IDs updated and X and y deleted.')
        self._reset(smiles)

    @property
    def mols(self) -> np.ndarray:
        """
        Get the RDKit Mol objects of the molecules in the dataset.
        Returns
        -------
        np.ndarray
            RDKit molecules of the molecules in the dataset.
        """
        return self._mols

    @property
    def feature_names(self) -> np.ndarray:
        """
        Get the feature labels of the molecules in the dataset.
        Returns
        -------
        np.ndarray
            Feature names of the molecules in the dataset.
        """
        return self._feature_names

    @feature_names.setter
    def feature_names(self, feature_names: Union[List, np.ndarray]) -> None:
        """
        Set the feature labels of the molecules in the dataset.
        Parameters
        ----------
        feature_names: Union[List, np.ndarray]
            Feature names of the molecules.
        """
        if self._X is None:
            raise ValueError('The features must be set before setting the feature names.')
        if len(self._X.shape) == 1:
            if len(feature_names) != 1:
                raise ValueError('The number of feature names must be equal to the number of features.')
        elif len(self._X.shape) == 2:
            if len(feature_names) != len(self._X[0]):
                raise ValueError('The number of feature names must be equal to the number of features.')
        elif len(self._X.shape) == 3:
            if len(feature_names) != len(self._X[0][0]):
                raise ValueError('The number of feature names must be equal to the number of features.')
        elif len(self._X.shape) == 4:
            if len(feature_names) != len(self._X[0][0]):  # SmileImageFeat
                raise ValueError('The number of feature names must be equal to the number of features.')
        else:
            raise ValueError('The number of dimensions of X must be 1, 2 or 3.')
        if len(feature_names) != len(set(feature_names)):
            raise ValueError('The feature names must be unique.')
        self._feature_names = np.array([str(fn) for fn in feature_names])

    @property
    def label_names(self) -> np.ndarray:
        """
        Get the label names of the molecules in the dataset.
        If you have a single task this will be a list of length 1 with the name of the label.
        If you have a multi-task dataset this will be a list of length n_tasks with the names of the labels.

        Returns
        -------
        np.ndarray
            Label names in the dataset.
        """
        return self._label_names

    @label_names.setter
    def label_names(self, label_names: Union[List, np.ndarray]) -> None:
        """
        Set the label names of the dataset.
        If you have a single task this will be a list of length 1 with the name of the label.
        If you have a multi-task dataset this will be a list of length n_tasks with the names of the labels.

        Parameters
        ----------
        label_names: Union[List, np.ndarray]
            Label names of the dataset.
        """
        if self._y is None:
            raise ValueError('The labels must be set before setting the label names.')
        if len(self._y.shape) == 1:
            if len(label_names) != 1:
                raise ValueError('The number of label names must be equal to the number of labels.')
        else:
            if len(label_names) != len(self._y[0]):
                raise ValueError('The number of label names must be equal to the number of labels.')
        if len(label_names) != len(set(label_names)):
            raise ValueError('The label names must be unique.')
        self._label_names = np.array([str(ln) for ln in label_names])

    @deepmol_cached_property
    def X(self) -> np.ndarray:
        """
        Features currently stored in the dataset.

        Exposed through ``deepmol_cached_property``; ``clear_cached_properties`` discards the cached value.

        Returns
        -------
        np.ndarray
            Features of the molecules in the dataset.
        """
        features = self._X
        return features

    @property
    def y(self) -> np.ndarray:
        """
        Get the labels of the molecules in the dataset.
        Returns
        -------
        np.ndarray
            Labels of the molecules in the dataset.
        """
        return self._y

    @property
    def ids(self) -> np.ndarray:
        """
        Get the IDs of the molecules in the dataset.
        Returns
        -------
        np.ndarray
            IDs of the molecules in the dataset.
        """
        return self._ids

    @ids.setter
    def ids(self, ids: Union[List, np.ndarray]) -> None:
        """
        Set the IDs of the molecules in the dataset.
        Parameters
        ----------
        ids: Union[List, np.ndarray]
            IDs of the molecules.
        """
        if len(ids) != len(self._smiles):
            raise ValueError('The number of IDs must be equal to the number of molecules.')
        if len(ids) != len(np.unique(ids)):
            raise ValueError('The IDs must be unique.')
        self._ids = np.array([str(idx) for idx in ids])

    @property
    def n_tasks(self) -> int:
        """
        Get the number of tasks in the dataset.

        Returns
        -------
        n_tasks: int
            The number of tasks in the dataset.
        """
        return self._n_tasks

    @property
    def mode(self) -> Union[str, List[str]]:
        """
        Get the mode of the dataset.
        Returns
        -------
        mode: Union[str, List[str]]
            The mode of the dataset.
        """
        return self._mode

    @mode.setter
    def mode(self, mode: Union[str, List[str]]) -> None:
        """
        Set the mode of the dataset.

        Parameters
        ----------
        mode: Union[str, List[str]]
            The mode of the dataset.
        """
        if not isinstance(mode, list):
            if mode not in ['classification', 'regression', 'multilabel', None]:
                raise ValueError('The mode must be either "classification" or "regression".')
        else:
            for m in mode:
                if m not in ['classification', 'regression', 'multilabel', None]:
                    raise ValueError('The mode must be either "classification" or "regression".')
        self._mode = mode

[docs]    def get_shape(self) -> Tuple[Tuple, Union[Tuple, None], Union[Tuple, None]]:
        """
        Get the shape of the dataset.
        Returns three tuples, giving the shape of the smiles, X and y arrays.

        Returns
        -------
        smiles_shape: Tuple
            The shape of the mols array.
        X_shape: Union[Tuple, None]
            The shape of the X array.
        y_shape: Union[Tuple, None]
            The shape of the y array.
        """
        smiles_shape = self._smiles.shape
        self.logger.info(f'Mols_shape: {smiles_shape}')
        x_shape = self._X.shape if self._X is not None else None
        self.logger.info(f'Features_shape: {x_shape}')
        y_shape = self._y.shape if self._y is not None else None
        self.logger.info(f'Labels_shape: {y_shape}')
        return smiles_shape, x_shape, y_shape

    @inplace_decorator
    def remove_duplicates(self) -> None:
        """
        Remove molecules with duplicated features from the dataset.

        For each distinct row of X, the molecule at the index reported by ``np.unique(..., return_index=True)``
        is kept. Nothing is done when the dataset has no features.

        Parameters
        ----------
        inplace: bool, optional (default False)
            If True, the dataset will be modified in place.
        """
        if self._X is None:
            return
        if np.isnan(np.stack(self._X)).any():
            warnings.warn('The dataset contains NaNs. Molecules with NaNs will be ignored.')
        # NOTE(review): the backing array is read directly instead of the cached `X` property, so that the
        # de-duplication runs on the same data as the NaN check even if the cached value was not refreshed.
        _, kept_positions = np.unique(self._X, return_index=True, axis=0)
        self.select(self.ids[kept_positions], axis=0, inplace=True)

    @property
    def removed_elements(self) -> np.ndarray:
        """
        Get the molecules in the dataset.

        Returns
        -------
        mols : np.ndarray
            Removed molecules in the dataset.
        """
        return self._removed_elements
    
    @removed_elements.setter
    def removed_elements(self, value: Union[List[str], np.ndarray]) -> None:
        """
        Set the molecules in the dataset.
        Parameters
        ----------
        value: Union[List[str], np.ndarray]
            The removed elements in the dataset.
        """
        self._removed_elements = value


    @inplace_decorator
    def remove_elements(self, ids: List[str]) -> None:
        """
        Drop the elements identified by the given IDs.

        The positions of those IDs within the original ids of the dataset are appended to ``removed_elements``
        and the remaining IDs are kept through ``select``. An empty list of IDs is a no-op.

        Parameters
        ----------
        ids: List[str]
            IDs of the elements to remove.
        inplace: bool, optional (default False)
            If True, the dataset will be modified in place.
        """
        if len(ids) == 0:
            return
        current_ids = self.ids
        ids_to_drop = set(ids)
        # removed elements are recorded by their position in the original id array
        original_positions = np.where(np.isin(self._original_ids, list(ids_to_drop)))[0]
        self.removed_elements.extend(list(original_positions))
        surviving_ids = list(set(current_ids) - ids_to_drop)
        self.select(surviving_ids, inplace=True)

    @inplace_decorator
    def remove_elements_by_index(self, indexes: List[int]) -> None:
        """
        Drop the elements found at the given positions of the dataset.

        The positions are translated into the IDs currently stored at them and the removal is delegated to
        ``remove_elements``. An empty list of indexes is a no-op.

        Parameters
        ----------
        indexes: List[int]
            Indexes of the elements to remove.
        inplace: bool, optional (default False)
            If True, the dataset will be modified in place.
        """
        if not len(indexes) > 0:
            return
        ids_at_positions = self._ids[indexes]
        self.remove_elements(ids_at_positions, inplace=True)

    @inplace_decorator
    def select_features_by_index(self, indexes: List[int]) -> 'SmilesDataset':
        """
        Keep only the features found at the given column indexes.

        With an empty list of indexes the dataset is left untouched; otherwise the selection is delegated to
        ``select`` along axis 1 and the cached properties are cleared.

        Parameters
        ----------
        indexes: List[int]
            The indexes of the features to select from the dataset.
        inplace: bool, optional (default False)
            If True, the dataset will be modified in place.

        Returns
        -------
        SmilesDataset
            The dataset itself.
        """
        if len(indexes) != 0:
            self.select(indexes, axis=1, inplace=True)
            self.clear_cached_properties()
        return self

    @inplace_decorator
    def select_features_by_name(self, names: List[str]) -> None:
        """
        Keep only the features whose name is in the given list.

        The names are translated into column indexes and the selection is delegated to ``select`` along axis 1,
        after which the cached properties are cleared. An empty list of names is a no-op.

        Parameters
        ----------
        names: List[str]
            The names of the features to select from the dataset.
        inplace: bool, optional (default False)
            If True, the dataset will be modified in place.
        """
        if len(names) == 0:
            return
        # positions of the requested names, in the order the features are stored
        selected_columns = []
        for column, feature_name in enumerate(self._feature_names):
            if feature_name in names:
                selected_columns.append(column)
        self.select(selected_columns, axis=1, inplace=True)
        self.clear_cached_properties()

    @inplace_decorator
    def remove_nan(self, axis: int = 0) -> None:
        """
        Remove samples with at least one NaN in the features (when axis = 0)
        Or remove samples with all features with NaNs and the features with at least one NaN (axis = 1)

        With a one-dimensional X both axes remove the samples whose value is NaN. Nothing is done when the
        dataset has no features. Missing values are detected with ``pd.isna`` for both axes.

        Parameters
        ----------
        axis: int
            The axis to remove the NaNs from.
        inplace: bool, optional (default False)
            If True, the dataset will be modified in place.

        Raises
        ------
        ValueError
            If the axis is neither 0 nor 1 (only checked when the dataset has features).
        """
        if self._X is None or len(self._X.shape) == 0:
            return
        if axis not in (0, 1):
            raise ValueError('The axis must be 0 or 1.')

        missing = pd.isna(self._X)
        if len(self._X.shape) == 1:
            # a single feature per sample: only samples can be removed, whatever the axis
            self.remove_elements_by_index(np.where(missing)[0], inplace=True)
        elif axis == 0:
            # rows with at least one NaN
            self.remove_elements_by_index(np.where(missing.any(axis=1))[0], inplace=True)
        else:
            # rows with all NaNs
            self.remove_elements_by_index(np.where(missing.all(axis=1))[0], inplace=True)
            # columns with at least one NaN, computed on what is left after the rows were removed
            columns = np.unique(np.where(pd.isna(self._X).any(axis=0))[0])
            self._X = np.delete(self._X, columns, axis=1)
            # feature names in datasets with more than two dimensions not supported
            if len(self._X.shape) <= 2 and self._feature_names is not None:
                # drop the names by column position and keep them as an array, like the rest of the class
                self._feature_names = np.delete(np.asarray(self._feature_names), columns)
        # the features may have changed: make sure no stale cached value is served
        self.clear_cached_properties()

[docs]    def select_to_split(self, indexes: Union[np.ndarray, List[int]]) -> 'SmilesDataset':
        """
        Select elements with specific indexes to split the dataset
        Parameters
        ----------
        indexes: Union[np.ndarray, List[int]]
            The indexes of the elements to split the dataset.
        Returns
        -------
        SmilesDataset
            The dataset with the selected elements.
        """
        smiles = self._smiles[indexes]
        mols = self._mols[indexes]
        X = self._X[indexes] if self._X is not None else None
        y = self._y[indexes] if self._y is not None else None
        ids = self._ids[indexes]
        feature_names = self._feature_names
        label_names = self._label_names
        mode = self._mode
        return SmilesDataset(smiles, mols, ids, X, feature_names, y, label_names, mode)

    @inplace_decorator
    def select(self, ids: Union[List[str], List[int]], axis: int = 0) -> None:
        """
        Keep only the selected molecules (axis=0) or the selected feature columns (axis=1).

        Parameters
        ----------
        ids: Union[List[str], List[int]]
            Ids/indexes to keep: IDs of the compounds when axis = 0,
            positional indexes of the feature columns when axis = 1.
        axis: int
            Axis to select along. 0 selects along the first axis, 1 selects along the second axis.
        inplace: bool, optional (default False)
            If True, the dataset will be modified in place.

        Raises
        ------
        ValueError
            If axis is 1 and the dataset has no features, or if axis is neither 0 nor 1.
        """
        if axis == 0:
            # Set membership is O(1) per molecule and needs no ordering of the ids,
            # unlike a scan over a sorted list of the ids to drop.
            ids_to_keep = set(ids)
            raw_indexes = [i for i, mol_id in enumerate(self._ids) if mol_id not in ids_to_keep]
            self._smiles = np.delete(self._smiles, raw_indexes, axis)
            self._mols = np.delete(self._mols, raw_indexes, axis)
            self._y = np.delete(self._y, raw_indexes, axis) if self._y is not None else self._y
            self._X = np.delete(self._X, raw_indexes, axis) if self._X is not None else self._X
            self._ids = np.delete(self._ids, raw_indexes, axis)

        elif axis == 1:
            if self._X is None or len(self._X.shape) == 0:
                raise ValueError('Dataset has no features.')
            # One-dimensional features have no columns to select from, so they are left untouched.
            if len(self._X.shape) > 1:
                columns_to_keep = set(ids)
                indexes_to_delete = [i for i in range(self._X.shape[1]) if i not in columns_to_keep]
                self._X = np.delete(self._X, indexes_to_delete, axis=1)
                # feature names in datasets with more than two dimensions not supported
                if len(self._X.shape) <= 2 and self._feature_names is not None:
                    # Drop names by position rather than by value so that duplicated
                    # feature names stay aligned with the remaining columns of X.
                    dropped_positions = set(indexes_to_delete)
                    self._feature_names = [name for position, name in enumerate(self._feature_names)
                                           if position not in dropped_positions]
        else:
            raise ValueError('The axis must be 0 or 1.')

        # The underlying arrays changed, so any cached derived values are discarded.
        self.clear_cached_properties()

[docs]    def merge(self, datasets: List[Dataset]) -> 'SmilesDataset':
        """
        Merges provided datasets with the self dataset.
        Parameters
        ----------
        datasets: List[Dataset]
            List of datasets to merge.
        Returns
        -------
        NumpyDataset
            A merged NumpyDataset.
        """
        datasets = list(datasets)

        X = self._X
        y = self._y
        ids = self._ids
        mols = self._mols
        smiles = self._smiles
        feature_names = self._feature_names
        label_names = self._label_names
        mode = self._mode

        for ds in datasets:
            ids = merge_arrays(ids, len(mols), ds.ids, len(ds.mols))
            if len(set(ids)) != len(ids):
                raise ValueError(f'IDs must be unique! IDs are {ids}')
            y = merge_arrays(y, len(mols), ds.y, len(ds.mols))
            if X is None or ds.X is None:
                self.logger.error('Features are not the same length/type... Recalculate features for all inputs!')
                X = None
            elif len(X.shape) == 1 and len(ds.X.shape) == 1:
                X = merge_arrays(X, len(mols), ds.X, len(ds.mols))
            else:
                X = merge_arrays_of_arrays(X, ds.X)
            mols = np.append(mols, ds.mols, axis=0)
            smiles = np.append(smiles, ds.smiles, axis=0)
        return SmilesDataset(smiles, mols, ids, X, feature_names, y, label_names, mode)
    
[docs]    def to_dataframe(self):
        """
        Convert data into dataframe
        """ 
        df = pd.DataFrame()
        df['ids'] = pd.Series(self._ids)
        df['smiles'] = pd.Series(self._smiles)
        if self._y is not None:
            label_names = self._label_names
            df_y = pd.DataFrame(self._y, columns=label_names)
            df = pd.concat([df, df_y], axis=1)
        if self._X is not None:
            columns_names = self._feature_names
            df_x = pd.DataFrame(self._X, columns=columns_names)
            df = pd.concat([df, df_x], axis=1)
        return df

[docs]    def to_csv(self, path: str, **kwargs) -> None:
        """
        Save the dataset to a csv file.
        Parameters
        ----------
        path: str
            Path to save the csv file.
        """
        df = self.to_dataframe()

        df.to_csv(path, **kwargs)

[docs]    def to_sdf(self, path: str) -> None:
        """
        Save the dataset to a sdf file.
        Parameters
        ----------
        path: str
            Path to save the sdf file.
        """
        mol_set = self.mols
        writer = SDWriter(path)

        for i, mol in enumerate(mol_set):
            if self.y is not None and self.y.size > 0:
                if len(self.y.shape) > 1 and self.y.shape[1] > 1:
                    label = self.y[i, :]
                    for j, class_name in enumerate(self.label_names):
                        mol.SetProp(class_name, "%f" % label[j])
                elif len(self.y.shape) > 1 and self.y.shape[1] == 1:
                    class_name = self.label_names[0]
                    label = self.y[i, 0]
                    mol.SetProp(class_name, "%f" % label)

                else:
                    class_name = self.label_names[0]
                    label = self.y[i]
                    mol.SetProp(class_name, "%f" % label)

            if self.ids is not None and self.ids.size > 0:
                mol_id = self.ids[i]
                mol.SetProp("_ID", f"{mol_id}")
            writer.write(mol)

        writer.close()

    @inplace_decorator
    def load_features(self, path: str, **kwargs) -> None:
        """
        Replace the features of the dataset with the contents of a csv file.
        Only the feature array is replaced; feature names are not updated by this method.
        Parameters
        ----------
        path: str
            Path to the csv file.
        inplace: bool, optional (default False)
            If True, the dataset will be modified in place.
        kwargs:
            Keyword arguments to pass to pandas.read_csv.
        """
        self._X = pd.read_csv(path, **kwargs).to_numpy()

[docs]    def save_features(self, path: str = 'features.csv') -> None:
        """
        Save the features to a csv file.
        Parameters
        ----------
        path: str
            Path to save the csv file.
        """
        if self.X is not None:
            columns_names = self._feature_names
            df = pd.DataFrame(self._X, columns=columns_names)
            df.to_csv(path, index=False)
        else:
            raise ValueError('Features array is empty!')