Source code for deepmol.standardizer.molecular_standardizer

from abc import ABC, abstractmethod
from typing import Tuple

import numpy as np
from rdkit.Chem import Mol

from deepmol.base import Transformer
from deepmol.datasets import Dataset
from deepmol.loggers.logger import Logger
from deepmol.parallelism.multiprocessing import JoblibMultiprocessing
from deepmol.utils.decorators import modify_object_inplace_decorator
from deepmol.utils.utils import canonicalize_mol_object, mol_to_smiles

from rdkit import RDLogger 


class MolecularStandardizer(ABC, Transformer):
    """
    Abstract base class for handling the standardization of molecules.

    Subclasses implement `_standardize` for a single RDKit Mol object; `standardize` applies it to every
    molecule of a Dataset through `JoblibMultiprocessing`.
    """

    def __init__(self, n_jobs: int = -1) -> None:
        """
        Standardizer for molecules.

        Parameters
        ----------
        n_jobs: int
            Number of jobs to run in parallel.
        """
        super().__init__()
        self.n_jobs = n_jobs
        # Silences RDKit's 'rdApp.info' log channel; note this is a process-wide RDKit setting.
        RDLogger.DisableLog('rdApp.info')
        self.logger = Logger()
        self.logger.info(f"Standardizer {self.__class__.__name__} initialized with {n_jobs} jobs.")

    def _standardize_mol(self, mol: Mol) -> Tuple[Mol, str]:
        """
        Standardizes a single molecule.

        Standardization is best-effort: if the molecule is None, or if canonicalization, standardization or
        SMILES generation raises, the input molecule is returned unchanged together with the SMILES computed
        from it.

        Parameters
        ----------
        mol: Mol
            Molecule to standardize.

        Returns
        -------
        mol: Mol
            Standardized Mol object (or the original one if standardization failed).
        smiles: str
            SMILES string generated from the returned Mol object.
        """
        if mol is None:
            # Explicit check instead of `assert`, which is stripped under `python -O`.
            # A missing molecule takes the same fallback path as a failed standardization.
            return mol, mol_to_smiles(mol, canonical=True)
        try:
            canonical_mol = canonicalize_mol_object(mol)
            standardized_mol = self._standardize(canonical_mol)
            return standardized_mol, mol_to_smiles(standardized_mol, canonical=True)
        except Exception:
            # Deliberate best-effort: one problematic molecule must not abort the whole dataset,
            # so the untouched input is kept instead.
            return mol, mol_to_smiles(mol, canonical=True)

    @modify_object_inplace_decorator
    def standardize(self, dataset: Dataset) -> Dataset:
        """
        Standardizes a dataset of molecules.

        Every molecule in `dataset.mols` is passed through `_standardize_mol`, and the dataset's `_mols`
        and `_smiles` attributes are overwritten with the results.

        Parameters
        ----------
        dataset: Dataset
            Dataset to standardize.

        Returns
        -------
        dataset: Dataset
            Standardized dataset.
        """
        molecules = dataset.mols
        multiprocessing_cls = JoblibMultiprocessing(n_jobs=self.n_jobs, process=self._standardize_mol)
        # Each result is a (mol, smiles) pair as returned by `_standardize_mol`.
        result = list(multiprocessing_cls.run(molecules))
        dataset._smiles = np.asarray([smiles for _, smiles in result])
        dataset._mols = np.asarray([standardized for standardized, _ in result])
        return dataset

    @abstractmethod
    def _standardize(self, mol: Mol) -> Mol:
        """
        Standardizes a molecule. Must be implemented by concrete standardizers.

        Parameters
        ----------
        mol: Mol
            RDKit Mol object

        Returns
        -------
        mol: Mol
            Standardized mol.
        """

    def _transform(self, dataset: Dataset) -> Dataset:
        """
        Standardizes a dataset of molecules.
        This method is called by the `transform` method.

        Parameters
        ----------
        dataset: Dataset
            Dataset to standardize.

        Returns
        -------
        dataset: Dataset
            Standardized dataset.
        """
        return self.standardize(dataset)

    def _fit(self, dataset: Dataset) -> 'MolecularStandardizer':
        """
        Fits the standardizer to a dataset of molecules.
        Nothing is learned from the dataset; the standardizer is returned as is.

        Parameters
        ----------
        dataset: Dataset
            Dataset of molecules.

        Returns
        -------
        self: MolecularStandardizer
            The fitted standardizer.
        """
        return self