# Source code for deepmol.compound_featurization.deepchem_featurizers

from typing import List, Dict, Any

import numpy as np
from deepchem.feat import ConvMolFeaturizer, WeaveFeaturizer, MolGraphConvFeaturizer, CoulombMatrix, CoulombMatrixEig, \
    SmilesToImage, SmilesToSeq, MolGanFeaturizer, GraphMatrix
from deepchem.feat.graph_data import GraphData
from deepchem.feat.mol_graphs import ConvMol, WeaveMol
from deepchem.utils import ConformerGenerator
from rdkit.Chem import Mol

from deepmol.compound_featurization import MolecularFeaturizer
from deepmol.compound_featurization._utils import get_conformers, get_dictionary_from_smiles
from deepmol.datasets import Dataset
from deepmol.loggers.logger import Logger
from deepmol.utils.utils import mol_to_smiles


class ConvMolFeat(MolecularFeaturizer):
    """
    Duvenaud graph convolution, adapted from deepchem
    (https://deepchem.readthedocs.io/en/latest/api_reference/featurizers.html#convmolfeaturizer).
    The featurizer computes a vector of local descriptors for each atom in a molecule.

    References:
    Duvenaud, David K., et al. "Convolutional networks on graphs for learning molecular fingerprints."
    Advances in neural information processing systems. 2015.
    """

    def __init__(self,
                 master_atom: bool = False,
                 use_chirality: bool = False,
                 atom_properties: List[str] = None,
                 per_atom_fragmentation: bool = False,
                 **kwargs) -> None:
        """
        Parameters
        ----------
        master_atom: bool
            If True, create a fake atom with bonds to every other atom.
        use_chirality: bool
            If True, include chirality information.
        atom_properties: List[str]
            List of atom properties to use as additional atom-level features in the larger molecular feature.
            Defaults to an empty list when None is given.
        per_atom_fragmentation: bool
            If True, then multiple "atom-depleted" versions of each molecule will be created.
        kwargs:
            Additional arguments for the base class.
        """
        super().__init__(**kwargs)
        self.master_atom = master_atom
        self.use_chirality = use_chirality
        # None is the signature default so that each instance gets its own list.
        self.atom_properties = [] if atom_properties is None else atom_properties
        self.per_atom_fragmentation = per_atom_fragmentation
        self.feature_names = ['conv_mol_feat']

    def _featurize(self, mol: Mol) -> ConvMol:
        """
        Featurizes a single molecule.

        Parameters
        ----------
        mol: Mol
            Molecule to featurize.

        Returns
        -------
        feature: ConvMol
            The ConvMol features of the molecule.

        Raises
        ------
        ValueError
            If the featurization did not produce atom features.
        """
        # featurization process using DeepChem ConvMolFeaturizer
        featurizer = ConvMolFeaturizer(
            master_atom=self.master_atom,
            use_chirality=self.use_chirality,
            atom_properties=self.atom_properties,
            per_atom_fragmentation=self.per_atom_fragmentation)
        feature = featurizer.featurize([mol])[0]

        # Explicit check rather than `assert`: assertions are removed when Python runs with -O,
        # which would let an unusable result through silently.
        # NOTE(review): a result without an `atom_features` attribute (presumably a placeholder
        # for a molecule deepchem could not process - confirm) is rejected the same way.
        if getattr(feature, 'atom_features', None) is None:
            raise ValueError("ConvMolFeat could not featurize the molecule: no atom features were produced.")
        return feature


class WeaveFeat(MolecularFeaturizer):
    """
    Weave convolution featurization, adapted from deepchem
    (https://deepchem.readthedocs.io/en/latest/api_reference/featurizers.html#weavefeaturizer).
    Requires a quadratic matrix of interaction descriptors for each pair of atoms.

    References:
    Kearnes, Steven, et al. "Molecular graph convolutions: moving beyond fingerprints."
    Journal of computer-aided molecular design 30.8 (2016): 595-608.
    """

    def __init__(self,
                 graph_distance: bool = True,
                 explicit_h: bool = False,
                 use_chirality: bool = False,
                 max_pair_distance: int = None,
                 **kwargs) -> None:
        """
        Parameters
        ----------
        graph_distance: bool
            If True, use graph distance for distance features. Otherwise, use Euclidean distance. The molecules
            being featurized must have valid conformer information if this option is set.
        explicit_h: bool
            If True, model hydrogens in the molecule.
        use_chirality: bool
            If True, use chiral information in the featurization.
        max_pair_distance: int
            Maximum graph distance at which pair features are computed.
        kwargs:
            Additional arguments for the base class.
        """
        super().__init__(**kwargs)
        self.graph_distance = graph_distance
        self.explicit_h = explicit_h
        self.use_chirality = use_chirality
        self.max_pair_distance = max_pair_distance
        self.feature_names = ['weave_feat']

    def _featurize(self, mol: Mol) -> WeaveMol:
        """
        Featurizes a single molecule.

        Parameters
        ----------
        mol: Mol
            Molecule to featurize.

        Returns
        -------
        feature: WeaveMol
            The WeaveMol features of the molecule.

        Raises
        ------
        ValueError
            If the featurization did not produce atom features.
        """
        # featurization process using DeepChem WeaveFeaturizer
        # (deepchem spells the hydrogen option `explicit_H`)
        featurizer = WeaveFeaturizer(
            graph_distance=self.graph_distance,
            explicit_H=self.explicit_h,
            use_chirality=self.use_chirality,
            max_pair_distance=self.max_pair_distance)
        feature = featurizer.featurize([mol])[0]

        # Explicit check rather than `assert`: assertions are removed when Python runs with -O.
        # NOTE(review): a result without `get_atom_features` (presumably a placeholder for a
        # molecule deepchem could not process - confirm) is rejected the same way.
        atom_features_getter = getattr(feature, 'get_atom_features', None)
        if atom_features_getter is None or atom_features_getter() is None:
            raise ValueError("WeaveFeat could not featurize the molecule: no atom features were produced.")

        return feature


class MolGanFeat(MolecularFeaturizer):
    """
    Featurizer for MolGAN de-novo molecular generation model, adapted from deepchem
    (https://deepchem.readthedocs.io/en/latest/api_reference/featurizers.html?highlight=CGCNN#molganfeaturizer).
    It is a wrapper for two matrices containing atom and bond type information.

    References:
    Nicola De Cao et al. “MolGAN: An implicit generative model for small molecular graphs” (2018),
    https://arxiv.org/abs/1805.11973
    """

    def __init__(self,
                 max_atom_count: int = 9,
                 kekulize: bool = True,
                 bond_labels: List[Any] = None,
                 atom_labels: List[int] = None,
                 **kwargs) -> None:
        """
        Parameters
        ----------
        max_atom_count: int
            Maximum number of atoms used for the adjacency matrix creation.
        kekulize: bool
            If True, kekulize the molecule.
        bond_labels: List[Any]
            List of bond types used for the adjacency matrix creation.
        atom_labels: List[int]
            List of atomic numbers used for the adjacency matrix creation.
        kwargs:
            Additional arguments for the base class.
        """
        super().__init__(**kwargs)
        self.max_atom_count = max_atom_count
        self.kekulize = kekulize
        self.bond_labels = bond_labels
        self.atom_labels = atom_labels
        self.feature_names = ['mol_gan_feat']

    def _featurize(self, mol: Mol) -> GraphMatrix:
        """
        Featurizes a single molecule.

        Parameters
        ----------
        mol: Mol
            Molecule to featurize.

        Returns
        -------
        feature: GraphMatrix
            The GraphMatrix features of the molecule.

        Raises
        ------
        ValueError
            If the featurization did not produce an adjacency matrix.
        """
        # featurization process using DeepChem MolGanFeaturizer; the molecule is wrapped in a
        # list, as the other featurizers of this module do, so the result is indexed per molecule
        featurizer = MolGanFeaturizer(max_atom_count=self.max_atom_count,
                                      kekulize=self.kekulize,
                                      bond_labels=self.bond_labels,
                                      atom_labels=self.atom_labels)
        feature = featurizer.featurize([mol])[0]

        # Explicit check rather than `assert`: assertions are removed when Python runs with -O.
        # getattr also covers a result that is None or otherwise lacks `adjacency_matrix`
        # (presumably what deepchem yields for a molecule it cannot encode - confirm).
        if getattr(feature, 'adjacency_matrix', None) is None:
            raise ValueError("MolGanFeat could not featurize the molecule: no adjacency matrix was produced.")

        return feature


class MolGraphConvFeat(MolecularFeaturizer):
    """
    Featurizer of general graph convolution networks for molecules.
    Adapted from deepchem:
    (https://deepchem.readthedocs.io/en/latest/api_reference/featurizers.html#molgraphconvfeaturizer)

    References:
    Kearnes, Steven, et al. "Molecular graph convolutions: moving beyond fingerprints."
    Journal of computer-aided molecular design 30.8 (2016):595-608.
    """

    def __init__(self,
                 use_edges: bool = False,
                 use_chirality: bool = False,
                 use_partial_charge: bool = False,
                 **kwargs) -> None:
        """
        Parameters
        ----------
        use_edges: bool
            If True, use edge features.
        use_chirality: bool
            If True, use chirality information.
        use_partial_charge: bool
            If True, use partial charge information.
        kwargs:
            Additional arguments for the base class.
        """
        super().__init__(**kwargs)
        self.use_edges = use_edges
        self.use_chirality = use_chirality
        self.use_partial_charge = use_partial_charge
        self.feature_names = ['mol_graph_conv_feat']

    def _featurize(self, mol: Mol) -> GraphData:
        """
        Featurizes a single molecule.

        Parameters
        ----------
        mol: Mol
            Molecule to featurize.

        Returns
        -------
        feature: GraphData
            The GraphData features of the molecule.

        Raises
        ------
        ValueError
            If the featurization did not produce node features.
        """
        # featurization process using DeepChem MolGraphConvFeaturizer
        featurizer = MolGraphConvFeaturizer(
            use_edges=self.use_edges,
            use_chirality=self.use_chirality,
            use_partial_charge=self.use_partial_charge)
        feature = featurizer.featurize([mol])[0]

        # ValueError is still caught by `except Exception` handlers, but unlike a bare
        # `Exception` it tells the user what went wrong.
        # NOTE(review): a result without `node_features` (presumably a placeholder for a
        # molecule deepchem could not process - confirm) is rejected the same way.
        if getattr(feature, 'node_features', None) is None:
            raise ValueError("MolGraphConvFeat could not featurize the molecule: no node features were produced.")

        return feature


class CoulombFeat(MolecularFeaturizer):
    """
    Calculate coulomb matrices for molecules.
    Adapted from deepchem (https://deepchem.readthedocs.io/en/latest/api_reference/featurizers.html#coulombmatrix).

    References:
    Montavon, Grégoire, et al. "Learning invariant representations of molecules for atomization energy prediction."
    Advances in neural information processing systems. 2012.
    """

    def __init__(self,
                 max_atoms: int,
                 remove_hydrogens: bool = False,
                 randomize: bool = False,
                 upper_tri: bool = False,
                 n_samples: int = 1,
                 max_conformers: int = 1,
                 seed: int = None,
                 **kwargs) -> None:
        """
        Parameters
        ----------
        max_atoms: int
            The maximum number of atoms expected for molecules this featurizer will process.
        remove_hydrogens: bool
            If True, remove hydrogens before processing them.
        randomize: bool
            If True, randomize Coulomb matrices. Defaults to False.
        upper_tri: bool
            Generate only upper triangle part of Coulomb matrices.
        n_samples: int
            If 'randomize' is set to True, the number of random samples to draw.
        max_conformers: int
            Maximum number of conformers.
        seed: int
            Random seed to use. Converted with int() when given; None is kept as None.
        kwargs:
            Additional arguments for the base class.
        """
        super().__init__(**kwargs)
        self.max_atoms = max_atoms
        self.remove_hydrogens = remove_hydrogens
        self.randomize = randomize
        self.upper_tri = upper_tri
        self.n_samples = n_samples
        self.max_conformers = max_conformers
        self.seed = None if seed is None else int(seed)
        self.feature_names = ['coulomb_feat']

    def _featurize(self, mol: Mol) -> np.ndarray:
        """
        Featurizes a single molecule.

        Parameters
        ----------
        mol: Mol
            Molecule to featurize.

        Returns
        -------
        feature: np.ndarray
            Array of features.

        Raises
        ------
        ValueError
            If the featurization produced an empty array.
        """
        # conformers are generated first and it is those that are handed to the featurizer
        generator = ConformerGenerator(max_conformers=self.max_conformers)
        new_conformers = get_conformers([mol], generator)

        # featurization process using DeepChem CoulombMatrix
        featurizer = CoulombMatrix(
            max_atoms=self.max_atoms,
            remove_hydrogens=self.remove_hydrogens,
            randomize=self.randomize,
            upper_tri=self.upper_tri,
            n_samples=self.n_samples,
            seed=self.seed)

        feature = featurizer(new_conformers)[0]

        # ValueError is still caught by `except Exception` handlers, but unlike a bare
        # `Exception` it tells the user what went wrong.
        if feature.size == 0:
            raise ValueError("CoulombFeat could not featurize the molecule: the Coulomb matrix is empty.")

        return feature


class CoulombEigFeat(MolecularFeaturizer):
    """
    Calculate the eigenvalues of Coulomb matrices for molecules.
    Adapted from deepchem (https://deepchem.readthedocs.io/en/latest/api_reference/featurizers.html#coulombmatrixeig).

    References:
    Montavon, Grégoire, et al. "Learning invariant representations of molecules for atomization energy prediction."
    Advances in neural information processing systems. 2012.
    """

    def __init__(self,
                 max_atoms: int,
                 remove_hydrogens: bool = False,
                 randomize: bool = False,
                 n_samples: int = 1,
                 max_conformers: int = 1,
                 seed: int = None,
                 **kwargs) -> None:
        """
        Parameters
        ----------
        max_atoms: int
            The maximum number of atoms expected for molecules this featurizer will process.
        remove_hydrogens: bool
            If True, remove hydrogens before processing them.
        randomize: bool
            If True, randomize Coulomb matrices.
        n_samples: int
            If 'randomize' is set to True, the number of random samples to draw.
        max_conformers: int
            Maximum number of conformers.
        seed: int
            Random seed to use. Converted with int() when given; None is kept as None.
        kwargs:
            Additional arguments for the base class.
        """
        super().__init__(**kwargs)
        self.max_atoms = max_atoms
        self.remove_hydrogens = remove_hydrogens
        self.randomize = randomize
        self.n_samples = n_samples
        self.seed = None if seed is None else int(seed)
        self.max_conformers = max_conformers
        self.feature_names = ['coulomb_eig_feat']

    def _featurize(self, mol: Mol) -> np.ndarray:
        """
        Featurizes a single molecule.

        Parameters
        ----------
        mol: Mol
            Molecule to featurize.

        Returns
        -------
        feature: np.ndarray
            Array of features.

        Raises
        ------
        ValueError
            If the featurization produced an empty array.
        """
        # TODO: optionally infer `max_atoms` from the data instead of requiring it as a parameter.

        # conformers are generated first and it is those that are handed to the featurizer
        generator = ConformerGenerator(max_conformers=self.max_conformers)
        new_conformers = get_conformers([mol], generator)

        # featurization process using DeepChem CoulombMatrixEig
        featurizer = CoulombMatrixEig(
            max_atoms=self.max_atoms,
            remove_hydrogens=self.remove_hydrogens,
            randomize=self.randomize,
            n_samples=self.n_samples,
            seed=self.seed)

        feature = featurizer(new_conformers)[0]

        # ValueError is still caught by `except Exception` handlers, but unlike a bare
        # `Exception` it tells the user what went wrong.
        if feature.size == 0:
            raise ValueError("CoulombEigFeat could not featurize the molecule: the eigenvalue array is empty.")

        return feature


class SmileImageFeat(MolecularFeaturizer):
    """
    Converts SMILES string to image.
    Adapted from deepchem (https://deepchem.readthedocs.io/en/latest/api_reference/featurizers.html#smilestoimage).

    References:
    Goh, Garrett B., et al. "Using rule-based labels for weak supervised learning: a ChemNet for transferable chemical
    property prediction."
    Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. 2018.
    """

    def __init__(self,
                 img_size: int = 80,
                 res: float = 0.5,
                 max_len: int = 250,
                 img_spec: str = "std",
                 **kwargs) -> None:
        """
        Parameters
        ----------
        img_size: int
            Size of the image tensor.
        res: float
            Displays the resolution of each pixel in Angstrom.
        max_len: int
            Maximum allowed length of SMILES string.
        img_spec: str
            Indicates the channel organization of the image tensor. Must be "std" or "engd".
        kwargs:
            Additional arguments for the base class.

        Raises
        ------
        ValueError
            If `img_spec` is neither "std" nor "engd".
        """
        super().__init__(**kwargs)
        if img_spec not in ("std", "engd"):
            raise ValueError(
                "Image mode must be one of the std or engd. {} is not supported".format(img_spec))
        self.img_size = img_size
        self.max_len = max_len
        self.res = res
        self.img_spec = img_spec
        # not read anywhere in this class; kept because other code may rely on the attribute
        self.embed = int(img_size * res / 2)
        self.feature_names = ['smile_image_feat']

    def _featurize(self, mol: Mol) -> np.ndarray:
        """
        Featurizes a single molecule.

        Parameters
        ----------
        mol: Mol
            Molecule to featurize.

        Returns
        -------
        features: np.ndarray
            Array of features, exactly as returned by deepchem for the one-molecule batch.

        Raises
        ------
        ValueError
            If the featurization produced an empty result for the molecule.
        """
        # featurization process using DeepChem SmilesToImage
        featurizer = SmilesToImage(
            img_size=self.img_size,
            max_len=self.max_len,
            res=self.res,
            img_spec=self.img_spec)
        feats = featurizer.featurize([mol])

        # ValueError is still caught by `except Exception` handlers, but unlike a bare
        # `Exception` it tells the user what went wrong.
        if len(feats[0]) == 0:
            raise ValueError("SmileImageFeat could not featurize the molecule: the image is empty.")

        # NOTE(review): the whole batch result is returned, not `feats[0]` as the other
        # featurizers in this module do. Kept as is so the output shape callers see does not
        # change - confirm whether the extra leading dimension is intended.
        return feats


class SmilesSeqFeat:
    """
    Takes SMILES strings and turns them into a sequence.
    Adapted from deepchem (https://deepchem.readthedocs.io/en/latest/api_reference/featurizers.html#smilestoseq).

    References:
    Goh, Garrett B., et al. "Using rule-based labels for weak supervised learning: a ChemNet for transferable chemical
    property prediction."
    Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. 2018.
    """

    def __init__(self,
                 char_to_idx: Dict[str, int] = None,
                 max_len: int = 250,
                 pad_len: int = 10) -> None:
        """
        Parameters
        ----------
        char_to_idx: Dict
            Dictionary containing character to index mappings for unique characters.
            If None, it is built from the first dataset passed to `featurize`.
        max_len: int
            Maximum allowed length of the SMILES string.
        pad_len: int
            Amount of padding to add on either side of the SMILES seq.
        """
        self.char_to_idx = char_to_idx
        self.max_len = max_len
        self.pad_len = pad_len
        self.feature_names = ['smiles_seq_feat']
        self.logger = Logger()

    def featurize(self, dataset: Dataset) -> Dataset:
        """
        Featurizes all the molecules of a dataset.

        The dataset is modified in place: `dataset.dictionary` receives the character mapping,
        `dataset.X` receives the features, and the entries that could not be featurized are
        removed through `dataset.remove_elements`.

        Parameters
        ----------
        dataset: Dataset
            Dataset to featurize. `dataset.mols` must hold either RDKit molecules or SMILES strings.

        Returns
        -------
        dataset: Dataset
            Featurized dataset.

        Raises
        ------
        TypeError
            If the molecules of the dataset are neither RDKit Mol objects nor SMILES strings.
        """
        # the type of the first entry decides how the whole collection is treated
        first_mol = dataset.mols[0]
        holds_smiles = isinstance(first_mol, str)
        if not holds_smiles and not isinstance(first_mol, Mol):
            raise TypeError(
                "SmilesSeqFeat expects RDKit Mol objects or SMILES strings, got {}".format(type(first_mol).__name__))

        # Getting the dictionary if it is None; once built it is kept on the featurizer
        if self.char_to_idx is None:
            if holds_smiles:
                smiles = dataset.mols
            else:
                smiles = [mol_to_smiles(mol) for mol in dataset.mols if mol is not None]

            self.char_to_idx = get_dictionary_from_smiles(smiles, self.max_len)

        dataset.dictionary = self.char_to_idx

        # featurization process using DeepChem SmilesToSeq
        # The molecules are handed over unchanged: deepchem's molecular featurizers accept SMILES
        # strings as well as RDKit Mol objects. SMILES strings used to be passed through
        # `mol_to_smiles`, which expects a Mol rather than a string.
        dataset.X = SmilesToSeq(
            char_to_idx=self.char_to_idx,
            max_len=self.max_len,
            pad_len=self.pad_len).featurize(dataset.mols)

        # identify which rows did not get featurized (they come back empty)
        failed_indexes = [i for i, feat in enumerate(dataset.X) if len(feat) == 0]

        # treat indexes with no featurization
        # NOTE(review): positional indexes are passed here - confirm that
        # `Dataset.remove_elements` expects positions and not molecule ids.
        dataset.remove_elements(failed_indexes)
        dataset.X = np.asarray([np.asarray(feat, dtype=object) for feat in dataset.X])
        return dataset