# Source code for deepmol.compound_featurization.base_featurizer

from abc import ABC, abstractmethod
from typing import Tuple

import numpy as np
from rdkit.Chem import Mol, MolToSmiles

from deepmol.datasets import Dataset
from deepmol.loggers.logger import Logger
from deepmol.parallelism.multiprocessing import JoblibMultiprocessing
from deepmol.scalers import BaseScaler
from deepmol.utils.errors import PreConditionViolationException
from deepmol.utils.utils import canonicalize_mol_object


class MolecularFeaturizer(ABC):
    """
    Abstract class for calculating a set of features for a molecule.

    A `MolecularFeaturizer` uses SMILES strings or RDKit molecule objects to represent molecules.
    Subclasses need to implement the `_featurize` method for calculating features for a single
    molecule.
    """

    def __init__(self, n_jobs: int = -1) -> None:
        """
        Initializes the featurizer.

        Parameters
        ----------
        n_jobs: int
            The number of jobs to run in parallel in the featurization.
        """
        self.n_jobs = n_jobs
        self.feature_names = None
        self.logger = Logger()

    def _featurize_mol(self, mol: Mol) -> Tuple[np.ndarray, bool]:
        """
        Calculate features for a single molecule.

        The molecule is canonicalized and handed to `_featurize`. Any failure other than a
        precondition violation is logged and reported through the returned flag instead of
        being raised, so one bad molecule does not abort the whole run.

        Parameters
        ----------
        mol: Mol
            The molecule to featurize.

        Returns
        -------
        features: np.ndarray
            The features for the molecule (an empty array if featurization failed).
        remove_mol: bool
            Whether the molecule should be removed from the dataset.

        Raises
        ------
        SystemExit
            With exit code 1 if a `PreConditionViolationException` is raised while
            canonicalizing or featurizing the molecule.
        """
        try:
            mol = canonicalize_mol_object(mol)
            feat = self._featurize(mol)
            return feat, False
        except PreConditionViolationException as error:
            # Same outcome as the former `exit(1)` (SystemExit with code 1), but without
            # relying on the interactive helper injected by the `site` module.
            raise SystemExit(1) from error
        except Exception as e:
            # Building the SMILES is only for the log message; it must not turn a
            # recoverable per-molecule failure into a crash of the whole featurization.
            try:
                smiles = MolToSmiles(mol) if mol is not None else None
            except Exception:
                smiles = None
            # The logger is re-created before use, as in the original code; presumably the
            # instance attribute is not usable inside worker processes -- TODO confirm.
            self.logger = Logger()
            self.logger.error(f"Failed to featurize {smiles}. Appending empty array")
            self.logger.error("Exception message: {}".format(e))
            return np.array([]), True

    def featurize(self,
                  dataset: Dataset,
                  scaler: BaseScaler = None,
                  path_to_save_scaler: str = None,
                  remove_nans_axis: int = 0
                  ) -> Dataset:
        """
        Calculate features for molecules.

        Molecules that fail to featurize are removed from the dataset. If no molecule can be
        featurized (or the dataset has no molecules), an error is logged and the dataset is
        returned with an empty feature array, skipping NaN removal and scaling.

        Parameters
        ----------
        dataset: Dataset
            The dataset containing the molecules to featurize in dataset.mols.
        scaler: BaseScaler
            The scaler to use for scaling the generated features.
        path_to_save_scaler: str
            The path to save the scaler to. When given together with `scaler`, the scaler is
            fitted on the features and saved; when omitted, `scaler` is only applied.
        remove_nans_axis: int
            The axis to remove NaNs from. If None, no NaNs are removed.

        Returns
        -------
        dataset: Dataset
            The input Dataset containing a featurized representation of the molecules in Dataset.X.
        """
        molecules = dataset.mols
        multiprocessing_cls = JoblibMultiprocessing(process=self._featurize_mol, n_jobs=self.n_jobs)
        results = list(multiprocessing_cls.run(molecules))

        # Explicit bool dtype keeps the mask valid for indexing even when there are no results.
        remove_mols_list = np.array([remove_mol for _, remove_mol in results], dtype=bool)
        dataset.remove_elements(dataset.ids[remove_mols_list])

        # Drop failed molecules BEFORE building the array: their empty placeholder arrays make
        # the sequence ragged, which recent NumPy versions refuse to convert.
        kept_features = [feat for feat, remove_mol in results if not remove_mol]
        if not kept_features:
            self.logger.error("No molecule could be featurized. The dataset has no features.")
            dataset._X = np.array([])
            dataset.feature_names = self.feature_names
            return dataset

        features = np.array(kept_features)
        first = features[0]
        # Per-molecule 2D arrays and non-array feature objects are kept as they are;
        # any other per-molecule array is stacked row-wise into a single array.
        if isinstance(first, np.ndarray) and first.ndim != 2:
            features = np.vstack(features)

        dataset._X = features
        dataset.feature_names = self.feature_names
        dataset.remove_nan(remove_nans_axis)

        if scaler and path_to_save_scaler:
            # fit the scaler on the generated features, transform them and persist the scaler
            scaler.fit_transform(dataset)
            scaler.save(path_to_save_scaler)
        elif scaler:
            # no path given: the scaler is applied without being fitted here
            scaler.transform(dataset)

        return dataset

    @abstractmethod
    def _featurize(self, mol: Mol):
        """
        Calculate features for a single, already canonicalized molecule.

        Must be implemented by subclasses.

        Parameters
        ----------
        mol: Mol
            The molecule to featurize.
        """
        raise NotImplementedError