Source code for deepmol.compound_featurization.deepchem_featurizers

from typing import List, Dict, Any

import numpy as np
from deepchem.feat import ConvMolFeaturizer, WeaveFeaturizer, MolGraphConvFeaturizer, CoulombMatrix, CoulombMatrixEig, \
    SmilesToImage, SmilesToSeq, MolGanFeaturizer, GraphMatrix
from deepchem.feat.graph_data import GraphData
from deepchem.feat.mol_graphs import ConvMol, WeaveMol
from deepchem.utils import ConformerGenerator
from rdkit.Chem import Mol

from deepmol.compound_featurization import MolecularFeaturizer
from deepmol.compound_featurization._utils import get_conformers, get_dictionary_from_smiles
from deepmol.datasets import Dataset
from deepmol.loggers.logger import Logger
from deepmol.utils.utils import mol_to_smiles


class ConvMolFeat(MolecularFeaturizer):
    """
    Duvenaud graph convolution, adapted from deepchem
    (https://deepchem.readthedocs.io/en/latest/api_reference/featurizers.html#convmolfeaturizer).
    Vector of descriptors for each atom in a molecule.
    The featurizers computes that vector of local descriptors.

    References:
    Duvenaud, David K., et al. "Convolutional networks on graphs for learning molecular fingerprints."
    Advances in neural information processing systems. 2015.
    """

    def __init__(self,
                 master_atom: bool = False,
                 use_chirality: bool = False,
                 atom_properties: List[str] = None,
                 per_atom_fragmentation: bool = False,
                 **kwargs) -> None:
        """
        Stores the options that are forwarded to DeepChem's ConvMolFeaturizer.

        Parameters
        ----------
        master_atom: bool
            If True, create a fake atom with bonds to every other atom.
        use_chirality: bool
            If True, include chirality information.
        atom_properties: List[str]
            List of atom properties to use as additional atom-level features in the larger molecular feature.
            None is replaced by an empty list.
        per_atom_fragmentation: bool
            If True, then multiple "atom-depleted" versions of each molecule will be created.
        kwargs:
            Additional arguments for the base class.
        """
        super().__init__(**kwargs)
        # None sentinel instead of a mutable default argument.
        if atom_properties is None:
            atom_properties = []
        self.master_atom = master_atom
        self.use_chirality = use_chirality
        self.atom_properties = atom_properties
        self.per_atom_fragmentation = per_atom_fragmentation
        self.feature_names = ['conv_mol_feat']

    def _featurize(self, mol: Mol) -> ConvMol:
        """
        Featurizes a single molecule.

        Parameters
        ----------
        mol: Mol
            Molecule to featurize.

        Returns
        -------
        feature: ConvMol
            The ConvMol features of the molecule.

        Raises
        ------
        AssertionError
            If the featurized molecule has no atom features.
        """
        # featurization process using DeepChem ConvMolFeaturizer
        feature = ConvMolFeaturizer(
            master_atom=self.master_atom,
            use_chirality=self.use_chirality,
            atom_properties=self.atom_properties,
            per_atom_fragmentation=self.per_atom_fragmentation).featurize([mol])

        conv_mol = feature[0]
        # Explicit check instead of `assert`: assert statements are removed when
        # Python runs with -O, which would let an invalid feature through.
        # AssertionError is kept so the raised exception type does not change.
        if conv_mol.atom_features is None:
            raise AssertionError("ConvMolFeaturizer produced no atom features for the molecule.")
        return conv_mol
class WeaveFeat(MolecularFeaturizer):
    """
    Weave convolution featurization, adapted from deepchem
    (https://deepchem.readthedocs.io/en/latest/api_reference/featurizers.html#weavefeaturizer).
    Require a quadratic matrix of interaction descriptors for each pair of atoms.

    References:
    Kearnes, Steven, et al. "Molecular graph convolutions: moving beyond fingerprints."
    Journal of computer-aided molecular design 30.8 (2016): 595-608.
    """

    def __init__(self,
                 graph_distance: bool = True,
                 explicit_h: bool = False,
                 use_chirality: bool = False,
                 max_pair_distance: int = None,
                 **kwargs) -> None:
        """
        Stores the options that are forwarded to DeepChem's WeaveFeaturizer.

        Parameters
        ----------
        graph_distance: bool
            If True, use graph distance for distance features. Otherwise, use Euclidean distance.
            Molecules invoked must have valid conformer information if this option is set.
        explicit_h: bool
            If true, model hydrogens in the molecule.
        use_chirality: bool
            If True, use chiral information in the featurization.
        max_pair_distance: int
            Maximum graph distance at which pair features are computed.
        kwargs:
            Additional arguments for the base class.
        """
        super().__init__(**kwargs)
        self.graph_distance = graph_distance
        self.explicit_h = explicit_h
        self.use_chirality = use_chirality
        self.max_pair_distance = max_pair_distance
        self.feature_names = ['weave_feat']

    def _featurize(self, mol: Mol) -> WeaveMol:
        """
        Featurizes a single molecule.

        Parameters
        ----------
        mol: Mol
            Molecule to featurize.

        Returns
        -------
        feature: WeaveMol
            The WeaveMol features of the molecule.

        Raises
        ------
        AssertionError
            If the featurized molecule reports no atom features.
        """
        # featurization process using DeepChem WeaveFeaturizer
        # (note the DeepChem keyword is spelled `explicit_H`)
        feature = WeaveFeaturizer(
            graph_distance=self.graph_distance,
            explicit_H=self.explicit_h,
            use_chirality=self.use_chirality,
            max_pair_distance=self.max_pair_distance).featurize([mol])

        weave_mol = feature[0]
        # Explicit check instead of `assert`: assert statements are removed when
        # Python runs with -O, which would let an invalid feature through.
        # AssertionError is kept so the raised exception type does not change.
        if weave_mol.get_atom_features() is None:
            raise AssertionError("WeaveFeaturizer produced no atom features for the molecule.")
        return weave_mol
class MolGanFeat(MolecularFeaturizer):
    """
    Featurizer for MolGAN de-novo molecular generation model, adapted from deepchem
    (https://deepchem.readthedocs.io/en/latest/api_reference/featurizers.html?highlight=CGCNN#molganfeaturizer).
    It is a wrapper for two matrices containing atom and bond type information.

    References:
    Nicola De Cao et al. "MolGAN: An implicit generative model for small molecular graphs" (2018),
    https://arxiv.org/abs/1805.11973
    """

    def __init__(self,
                 max_atom_count: int = 9,
                 kekulize: bool = True,
                 bond_labels: List[Any] = None,
                 atom_labels: List[int] = None,
                 **kwargs) -> None:
        """
        Stores the options that are forwarded to DeepChem's MolGanFeaturizer.

        Parameters
        ----------
        max_atom_count: int
            Maximum number of atoms used for the adjacency matrix creation.
        kekulize: bool
            If True, kekulize the molecule.
        bond_labels: List[Any]
            List of bond types used for the adjacency matrix creation.
        atom_labels: List[int]
            List of atomic numbers used for the adjacency matrix creation.
        kwargs:
            Additional arguments for the base class.
        """
        super().__init__(**kwargs)
        self.max_atom_count = max_atom_count
        self.kekulize = kekulize
        self.bond_labels = bond_labels
        self.atom_labels = atom_labels
        self.feature_names = ['mol_gan_feat']

    def _featurize(self, mol: Mol) -> GraphMatrix:
        """
        Featurizes a single molecule.

        Parameters
        ----------
        mol: Mol
            Molecule to featurize.

        Returns
        -------
        feature: GraphMatrix
            The GraphMatrix features of the molecule.

        Raises
        ------
        AssertionError
            If the featurized molecule has no adjacency matrix.
        """
        # featurization process using DeepChem MolGanFeaturizer; the molecule is
        # passed as a one-element list, like the other featurizers in this module
        feature = MolGanFeaturizer(
            max_atom_count=self.max_atom_count,
            kekulize=self.kekulize,
            bond_labels=self.bond_labels,
            atom_labels=self.atom_labels).featurize([mol])

        graph_matrix = feature[0]
        # Explicit check instead of `assert`: assert statements are removed when
        # Python runs with -O, which would let an invalid feature through.
        # AssertionError is kept so the raised exception type does not change.
        if graph_matrix.adjacency_matrix is None:
            raise AssertionError("MolGanFeaturizer produced no adjacency matrix for the molecule.")
        return graph_matrix
class MolGraphConvFeat(MolecularFeaturizer):
    """
    General-purpose graph convolution featurizer for molecules.
    Adapted from deepchem:
    (https://deepchem.readthedocs.io/en/latest/api_reference/featurizers.html#molgraphconvfeaturizer)

    References:
    Kearnes, Steven, et al. "Molecular graph convolutions: moving beyond fingerprints."
    Journal of computer-aided molecular design 30.8 (2016):595-608.
    """

    def __init__(self,
                 use_edges: bool = False,
                 use_chirality: bool = False,
                 use_partial_charge: bool = False,
                 **kwargs) -> None:
        """
        Keeps the flags that are later handed to DeepChem's MolGraphConvFeaturizer.

        Parameters
        ----------
        use_edges: bool
            Whether edge features are used.
        use_chirality: bool
            Whether chirality information is used.
        use_partial_charge: bool
            Whether partial charge information is used.
        kwargs:
            Extra arguments passed on to the base class.
        """
        super().__init__(**kwargs)
        self.use_edges = use_edges
        self.use_chirality = use_chirality
        self.use_partial_charge = use_partial_charge
        self.feature_names = ['mol_graph_conv_feat']

    def _featurize(self, mol: Mol) -> GraphData:
        """
        Builds the graph representation of one molecule.

        Parameters
        ----------
        mol: Mol
            Molecule to featurize.

        Returns
        -------
        feature: GraphData
            The GraphData features of the molecule.

        Raises
        ------
        Exception
            If the resulting graph has no node features.
        """
        dc_featurizer = MolGraphConvFeaturizer(use_edges=self.use_edges,
                                               use_chirality=self.use_chirality,
                                               use_partial_charge=self.use_partial_charge)
        # DeepChem works on batches, so wrap the molecule and unwrap the result.
        graph = dc_featurizer.featurize([mol])[0]
        if graph.node_features is not None:
            return graph
        raise Exception
class CoulombFeat(MolecularFeaturizer):
    """
    Computes Coulomb matrices for molecules.
    Adapted from deepchem
    (https://deepchem.readthedocs.io/en/latest/api_reference/featurizers.html#coulombmatrix).

    References:
    Montavon, Grégoire, et al. "Learning invariant representations of molecules for atomization energy prediction."
    Advances in neural information processing systems. 2012.
    """

    def __init__(self,
                 max_atoms: int,
                 remove_hydrogens: bool = False,
                 randomize: bool = False,
                 upper_tri: bool = False,
                 n_samples: int = 1,
                 max_conformers: int = 1,
                 seed: int = None,
                 **kwargs) -> None:
        """
        Keeps the options later handed to DeepChem's ConformerGenerator and CoulombMatrix.

        Parameters
        ----------
        max_atoms: int
            The maximum number of atoms expected for molecules this featurizer will process.
        remove_hydrogens: bool
            If True, remove hydrogens before processing them.
        randomize: bool
            If True, randomize Coulomb matrices. Defaults to False.
        upper_tri: bool
            Generate only the upper triangle part of Coulomb matrices.
        n_samples: int
            If 'randomize' is set to True, the number of random samples to draw.
        max_conformers: int
            Maximum number of conformers.
        seed: int
            Random seed to use. Any value other than None is converted with int().
        kwargs:
            Extra arguments passed on to the base class.
        """
        super().__init__(**kwargs)
        self.max_atoms = max_atoms
        self.remove_hydrogens = remove_hydrogens
        self.randomize = randomize
        self.upper_tri = upper_tri
        self.n_samples = n_samples
        self.max_conformers = max_conformers
        self.seed = None if seed is None else int(seed)
        self.feature_names = ['coulomb_feat']

    def _featurize(self, mol: Mol) -> np.ndarray:
        """
        Computes the Coulomb matrix features of one molecule.

        Parameters
        ----------
        mol: Mol
            Molecule to featurize.

        Returns
        -------
        feature: np.ndarray
            Array of features.

        Raises
        ------
        Exception
            If the computed feature array is empty.
        """
        # Conformers are generated first; the matrix is computed from them.
        conformer_generator = ConformerGenerator(max_conformers=self.max_conformers)
        conformers = get_conformers([mol], conformer_generator)

        coulomb = CoulombMatrix(max_atoms=self.max_atoms,
                                remove_hydrogens=self.remove_hydrogens,
                                randomize=self.randomize,
                                upper_tri=self.upper_tri,
                                n_samples=self.n_samples,
                                seed=self.seed)
        matrix = coulomb(conformers)[0]
        if matrix.size != 0:
            return matrix
        raise Exception
class CoulombEigFeat(MolecularFeaturizer):
    """
    Computes the eigenvalues of Coulomb matrices for molecules.
    Adapted from deepchem
    (https://deepchem.readthedocs.io/en/latest/api_reference/featurizers.html#coulombmatrixeig).

    References:
    Montavon, Grégoire, et al. "Learning invariant representations of molecules for atomization energy prediction."
    Advances in neural information processing systems. 2012.
    """

    def __init__(self,
                 max_atoms: int,
                 remove_hydrogens: bool = False,
                 randomize: bool = False,
                 n_samples: int = 1,
                 max_conformers: int = 1,
                 seed: int = None,
                 **kwargs) -> None:
        """
        Keeps the options later handed to DeepChem's ConformerGenerator and CoulombMatrixEig.

        Parameters
        ----------
        max_atoms: int
            The maximum number of atoms expected for molecules this featurizer will process.
        remove_hydrogens: bool
            If True, remove hydrogens before processing them.
        randomize: bool
            If True, randomize Coulomb matrices.
        n_samples: int
            If 'randomize' is set to True, the number of random samples to draw.
        max_conformers: int
            Maximum number of conformers.
        seed: int
            Random seed to use. Any value other than None is converted with int().
        kwargs:
            Extra arguments passed on to the base class.
        """
        super().__init__(**kwargs)
        self.max_atoms = max_atoms
        self.remove_hydrogens = remove_hydrogens
        self.randomize = randomize
        self.n_samples = n_samples
        self.seed = None if seed is None else int(seed)
        self.max_conformers = max_conformers
        self.feature_names = ['coulomb_eig_feat']

    def _featurize(self, mol: Mol) -> np.ndarray:
        """
        Computes the Coulomb matrix eigenvalue features of one molecule.

        Parameters
        ----------
        mol: Mol
            Molecule to featurize.

        Returns
        -------
        feature: np.ndarray
            Array of features.

        Raises
        ------
        Exception
            If the computed feature array is empty.
        """
        # TODO: optionally let the software infer `max_atoms` from the input
        # (a `find_maximum_number_atoms` helper was envisioned for this).
        conformer_generator = ConformerGenerator(max_conformers=self.max_conformers)
        conformers = get_conformers([mol], conformer_generator)

        coulomb_eig = CoulombMatrixEig(max_atoms=self.max_atoms,
                                       remove_hydrogens=self.remove_hydrogens,
                                       randomize=self.randomize,
                                       n_samples=self.n_samples,
                                       seed=self.seed)
        eigenvalues = coulomb_eig(conformers)[0]
        if eigenvalues.size != 0:
            return eigenvalues
        raise Exception
class SmileImageFeat(MolecularFeaturizer):
    """
    Turns a SMILES string into an image.
    Adapted from deepchem
    (https://deepchem.readthedocs.io/en/latest/api_reference/featurizers.html#smilestoimage).

    References:
    Goh, Garrett B., et al. "Using rule-based labels for weak supervised learning: a ChemNet for transferable
    chemical property prediction." Proceedings of the 24th ACM SIGKDD International Conference on Knowledge
    Discovery & Data Mining. 2018.
    """

    def __init__(self,
                 img_size: int = 80,
                 res: float = 0.5,
                 max_len: int = 250,
                 img_spec: str = "std",
                 **kwargs) -> None:
        """
        Validates the image mode and keeps the options later handed to DeepChem's SmilesToImage.

        Parameters
        ----------
        img_size: int
            Size of the image tensor.
        res: float
            Displays the resolution of each pixel in Angstrom.
        max_len: int
            Maximum allowed length of SMILES string.
        img_spec: str
            Indicates the channel organization of the image tensor; must be "std" or "engd".
        kwargs:
            Extra arguments passed on to the base class.

        Raises
        ------
        ValueError
            If `img_spec` is neither "std" nor "engd".
        """
        super().__init__(**kwargs)

        supported_specs = ("std", "engd")
        if img_spec not in supported_specs:
            message = "Image mode must be one of the std or engd. {} is not supported".format(img_spec)
            raise ValueError(message)

        self.img_size = img_size
        self.max_len = max_len
        self.res = res
        self.img_spec = img_spec
        # Derived from the image size and resolution; not read again inside this class.
        self.embed = int(img_size * res / 2)
        self.feature_names = ['smile_image_feat']

    def _featurize(self, mol: Mol) -> np.ndarray:
        """
        Builds the image features of one molecule.

        Parameters
        ----------
        mol: Mol
            Molecule to featurize.

        Returns
        -------
        features: np.ndarray
            The array returned by SmilesToImage for the one-molecule batch.

        Raises
        ------
        Exception
            If the first entry of the returned batch is empty.
        """
        image_featurizer = SmilesToImage(img_size=self.img_size,
                                         max_len=self.max_len,
                                         res=self.res,
                                         img_spec=self.img_spec)
        feats = image_featurizer.featurize([mol])
        # An empty first entry means the molecule was not featurized.
        if len(feats[0]) == 0:
            raise Exception
        # NOTE(review): the whole batch array is returned here, whereas the other
        # featurizers in this module return element 0 - confirm this is intended.
        return feats
class SmilesSeqFeat:
    """
    Takes SMILES strings and turns into a sequence.
    Adapted from deepchem
    (https://deepchem.readthedocs.io/en/latest/api_reference/featurizers.html#smilestoseq).

    References:
    Goh, Garrett B., et al. "Using rule-based labels for weak supervised learning: a ChemNet for transferable
    chemical property prediction." Proceedings of the 24th ACM SIGKDD International Conference on Knowledge
    Discovery & Data Mining. 2018.
    """

    def __init__(self, char_to_idx: Dict[str, int] = None, max_len: int = 250, pad_len: int = 10) -> None:
        """
        Stores the options that are forwarded to DeepChem's SmilesToSeq.

        Parameters
        ----------
        char_to_idx: Dict
            Dictionary containing character to index mappings for unique characters.
            If None, it is built from the dataset on the first call to `featurize`.
        max_len: int
            Maximum allowed length of the SMILES string.
        pad_len: int
            Amount of padding to add on either side of the SMILES seq.
        """
        self.char_to_idx = char_to_idx
        self.max_len = max_len
        self.pad_len = pad_len
        self.feature_names = ['smiles_seq_feat']
        self.logger = Logger()

    def featurize(self, dataset: Dataset) -> Dataset:
        """
        Featurizes every molecule of a dataset and stores the result in `dataset.X`.

        If no character dictionary was supplied, one is built from the SMILES of the
        dataset and kept on the featurizer. The dictionary used is also stored on
        `dataset.dictionary`. Entries whose features come back empty are removed
        from the dataset through `dataset.remove_elements`.

        Parameters
        ----------
        dataset: Dataset
            Dataset to featurize. `dataset.mols` must hold RDKit molecules or SMILES strings;
            the kind is decided from its first element.

        Returns
        -------
        dataset: Dataset
            Featurized dataset (the same object that was passed in).

        Raises
        ------
        TypeError
            If the first element of `dataset.mols` is neither an RDKit Mol nor a string.
        """
        mols = dataset.mols
        holds_rdkit_mols = isinstance(mols[0], Mol)
        if not holds_rdkit_mols and not isinstance(mols[0], str):
            # Fail early with a clear message instead of passing None further down.
            raise TypeError(
                "dataset.mols must contain RDKit Mol objects or SMILES strings, got {}".format(
                    type(mols[0]).__name__))

        # Getting the dictionary if it is None
        if self.char_to_idx is None:
            if holds_rdkit_mols:
                smiles = [mol_to_smiles(mol) for mol in mols if mol is not None]
            else:
                smiles = mols
            # Drop missing entries so the dictionary is built from actual strings only.
            smiles = [smile for smile in smiles if smile is not None]
            self.char_to_idx = get_dictionary_from_smiles(smiles, self.max_len)

        # Record on the dataset the dictionary that is used for the featurization.
        dataset.dictionary = self.char_to_idx

        # featurization process using DeepChem SmilesToSeq. Both RDKit molecules and
        # SMILES strings are handed over unchanged: strings are already SMILES, so they
        # must not be sent through mol_to_smiles (which takes a molecule).
        dataset.X = SmilesToSeq(
            char_to_idx=self.char_to_idx,
            max_len=self.max_len,
            pad_len=self.pad_len).featurize(mols)

        # identify which rows did not get featurized
        indexes = [i for i, feat in enumerate(dataset.X) if len(feat) == 0]

        # treat indexes with no featurization
        dataset.remove_elements(indexes)

        dataset.X = np.asarray([np.asarray(feat, dtype=object) for feat in dataset.X])

        return dataset