Source code for deepmol.compound_featurization.rdkit_fingerprints

import numpy as np
from rdkit.Chem import Mol, rdMolDescriptors, MACCSkeys, rdmolops
from rdkit.Chem.rdMolDescriptors import GetAtomPairAtomCode

from deepmol.compound_featurization import MolecularFeaturizer


class MorganFingerprint(MolecularFeaturizer):
    """
    Morgan fingerprints (extended-connectivity circular fingerprints).

    A molecule is broken into local atom neighbourhoods that are hashed into
    a bit vector of the requested size, giving a bag-of-words style
    representation of the structure.
    """

    def __init__(self,
                 radius: int = 2,
                 size: int = 2048,
                 chiral: bool = False,
                 bonds: bool = True,
                 features: bool = False,
                 **kwargs) -> None:
        """
        Initialize a MorganFingerprint object.

        Parameters
        ----------
        radius: int
            The radius of the circular fingerprint.
        size: int
            The size (number of bits) of the fingerprint.
        chiral: bool
            Whether to include chirality in the fingerprint.
        bonds: bool
            Whether to consider bond order in fingerprint generation.
        features: bool
            Whether to use feature information instead of atom information.
        **kwargs
            Forwarded unchanged to the MolecularFeaturizer constructor.
        """
        super().__init__(**kwargs)
        self.radius = radius
        self.size = size
        self.chiral = chiral
        self.bonds = bonds
        self.features = features
        # One column name per fingerprint bit.
        self.feature_names = [f'morgan_{i}' for i in range(self.size)]

    def _featurize(self, mol: Mol) -> np.ndarray:
        """
        Calculate the Morgan fingerprint of a single molecule.

        Parameters
        ----------
        mol: Mol
            RDKit Mol object

        Returns
        -------
        fp: np.ndarray
            The circular fingerprint as a float32 numpy array.
        """
        options = {
            'nBits': self.size,
            'useChirality': self.chiral,
            'useBondTypes': self.bonds,
            'useFeatures': self.features,
        }
        bit_vect = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, self.radius, **options)
        return np.asarray(bit_vect, dtype=np.float32)
class MACCSkeysFingerprint(MolecularFeaturizer):
    """
    MACCS Keys.

    SMARTS-based implementation of the 166 public MACCS keys.
    """

    def __init__(self, **kwargs) -> None:
        """
        Initialize a MACCSkeysFingerprint object.

        Parameters
        ----------
        **kwargs
            Forwarded unchanged to the MolecularFeaturizer constructor.
        """
        super().__init__(**kwargs)
        # 167 names are registered: maccs_0 through maccs_166.
        self.feature_names = [f'maccs_{i}' for i in range(167)]

    def _featurize(self, mol: Mol) -> np.ndarray:
        """
        Calculate the MACCS keys of a single molecule.

        Parameters
        ----------
        mol: Mol
            RDKit Mol object

        Returns
        -------
        fp: np.ndarray
            The MACCS keys as a float32 numpy array.
        """
        keys = MACCSkeys.GenMACCSKeys(mol)
        return np.asarray(keys, dtype=np.float32)
class LayeredFingerprint(MolecularFeaturizer):
    """
    Layered fingerprint featurizer.

    Layer definitions:
        0x01: pure topology
        0x02: bond order
        0x04: atom types
        0x08: presence of rings
        0x10: ring sizes
        0x20: aromaticity
    """

    def __init__(self,
                 layerFlags: int = 4294967295,
                 minPath: int = 1,
                 maxPath: int = 7,
                 fpSize: int = 2048,
                 atomCounts: list = None,
                 branchedPaths: bool = True,
                 **kwargs) -> None:
        """
        Initialize a LayeredFingerprint object.

        Parameters
        ----------
        layerFlags: int
            A bit vector specifying which layers to include in the fingerprint.
        minPath: int
            The minimum number of bonds to include in the subgraphs.
        maxPath: int
            The maximum number of bonds to include in the subgraphs.
        fpSize: int
            The size of the fingerprint.
        atomCounts: list
            If provided, this should be a list at least as long as the number of atoms in the molecule.
            It will be used to provide the count of the number of paths that set bits each atom is involved in.
            ``None`` (the default) is replaced by an empty list.
        branchedPaths: bool
            Whether to include branched and unbranched paths in the fingerprint.
        **kwargs
            Forwarded unchanged to the MolecularFeaturizer constructor.
        """
        super().__init__(**kwargs)
        self.layerFlags = layerFlags
        self.minPath = minPath
        self.maxPath = maxPath
        self.fpSize = fpSize
        # A fresh list per instance; the same list object is then handed to
        # RDKit on every _featurize call.
        self.atomCounts = [] if atomCounts is None else atomCounts
        self.branchedPaths = branchedPaths
        self.feature_names = [f'layered_{i}' for i in range(self.fpSize)]

    def _featurize(self, mol: Mol) -> np.ndarray:
        """
        Calculate the layered fingerprint of a single molecule.

        Parameters
        ----------
        mol: Mol
            RDKit Mol object

        Returns
        -------
        fp: np.ndarray
            The layered fingerprint as a float32 numpy array.
        """
        options = {
            'layerFlags': self.layerFlags,
            'minPath': self.minPath,
            'maxPath': self.maxPath,
            'fpSize': self.fpSize,
            'atomCounts': self.atomCounts,
            'branchedPaths': self.branchedPaths,
        }
        bit_vect = rdmolops.LayeredFingerprint(mol, **options)
        return np.asarray(bit_vect, dtype=np.float32)
class RDKFingerprint(MolecularFeaturizer):
    """
    RDKit topological fingerprints.

    This algorithm functions by finding all subgraphs between minPath and maxPath in length.
    For each subgraph:
        A hash is calculated.
        The hash is used to seed a random-number generator.
        _nBitsPerHash_ random numbers are generated and used to set the corresponding bits in the fingerprint.
    """

    def __init__(self,
                 minPath: int = 1,
                 maxPath: int = 7,
                 fpSize: int = 2048,
                 nBitsPerHash: int = 2,
                 useHs: bool = True,
                 tgtDensity: float = 0.0,
                 minSize: int = 128,
                 branchedPaths: bool = True,
                 useBondOrder: bool = True,
                 **kwargs) -> None:
        """
        Initialize a RDKFingerprint object.

        Parameters
        ----------
        minPath: int
            The minimum number of bonds to include in the subgraphs.
        maxPath: int
            The maximum number of bonds to include in the subgraphs.
        fpSize: int
            The size of the fingerprint.
        nBitsPerHash: int
            The number of bits to set for each hash.
        useHs: bool
            Whether to include Hs in the subgraphs.
        tgtDensity: float
            Fold the fingerprint until this minimum density has been reached.
        minSize: int
            The minimum size the fingerprint will be folded to when trying to reach tgtDensity.
        branchedPaths: bool
            Whether to include branched and unbranched paths in the fingerprint.
        useBondOrder: bool
            If True, both bond orders will be used in the path hashes.
        **kwargs
            Forwarded unchanged to the MolecularFeaturizer constructor.
        """
        super().__init__(**kwargs)
        self.minPath = minPath
        self.maxPath = maxPath
        self.fpSize = fpSize
        self.nBitsPerHash = nBitsPerHash
        self.useHs = useHs
        self.tgtDensity = tgtDensity
        self.minSize = minSize
        self.branchedPaths = branchedPaths
        self.useBondOrder = useBondOrder
        self.feature_names = [f'rdk_{i}' for i in range(self.fpSize)]

    def _featurize(self, mol: Mol) -> np.ndarray:
        """
        Calculate the RDKit topological fingerprint of a single molecule.

        Parameters
        ----------
        mol: Mol
            RDKit Mol object

        Returns
        -------
        fp: np.ndarray
            The topological fingerprint as a float32 numpy array.
        """
        options = {
            'minPath': self.minPath,
            'maxPath': self.maxPath,
            'fpSize': self.fpSize,
            'nBitsPerHash': self.nBitsPerHash,
            'useHs': self.useHs,
            'tgtDensity': self.tgtDensity,
            'minSize': self.minSize,
            'branchedPaths': self.branchedPaths,
            'useBondOrder': self.useBondOrder,
        }
        bit_vect = rdmolops.RDKFingerprint(mol, **options)
        return np.asarray(bit_vect, dtype=np.float32)
class AtomPairFingerprint(MolecularFeaturizer):
    """
    Atom pair fingerprints.

    Computes RDKit's hashed atom-pair bit-vector fingerprint for a molecule
    and exposes it as a numpy array.
    """

    def __init__(self,
                 nBits: int = 2048,
                 minLength: int = 1,
                 maxLength: int = 30,
                 nBitsPerEntry: int = 4,
                 includeChirality: bool = False,
                 use2D: bool = True,
                 confId: int = -1,
                 **kwargs) -> None:
        """
        Initialize an AtomPairFingerprint object.

        Parameters
        ----------
        nBits: int
            The size of the fingerprint.
        minLength: int
            Minimum distance between atoms to be considered in a pair.
        maxLength: int
            Maximum distance between atoms to be considered in a pair.
        nBitsPerEntry: int
            The number of bits to use in simulating counts.
        includeChirality: bool
            If set, chirality will be used in the atom invariants.
        use2D: bool
            If set, the 2D (topological) distance matrix is used.
        confId: int
            The conformation to use if 3D distances are being used.
        **kwargs
            Forwarded unchanged to the MolecularFeaturizer constructor.
        """
        super().__init__(**kwargs)
        self.nBits = nBits
        self.minLength = minLength
        self.maxLength = maxLength
        self.nBitsPerEntry = nBitsPerEntry
        self.includeChirality = includeChirality
        self.use2D = use2D
        self.confId = confId
        self.feature_names = [f'atom_pair_{i}' for i in range(self.nBits)]

    def _featurize(self, mol: Mol) -> np.ndarray:
        """
        Calculate the atom pair fingerprint of a single molecule.

        Parameters
        ----------
        mol: Mol
            RDKit Mol object

        Returns
        -------
        fp: np.ndarray
            The atom pair fingerprint as a float32 numpy array.
        """
        options = {
            'nBits': self.nBits,
            'minLength': self.minLength,
            'maxLength': self.maxLength,
            'nBitsPerEntry': self.nBitsPerEntry,
            'includeChirality': self.includeChirality,
            'use2D': self.use2D,
            'confId': self.confId,
        }
        bit_vect = rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(mol, **options)
        return np.asarray(bit_vect, dtype=np.float32)
class AtomPairFingerprintCallbackHash(MolecularFeaturizer):
    """
    Atom pair fingerprints computed with a Python-level hash function.

    For every pair of atoms whose topological distance lies within
    [minLength, maxLength], the two atom codes and the distance are combined
    by ``hash_function`` and the resulting value, modulo ``nBits``, selects the
    bit that is set. The fingerprint is returned as a numpy array.
    """

    def __init__(self,
                 nBits: int = 2048,
                 minLength: int = 1,
                 maxLength: int = 30,
                 includeChirality: bool = False,
                 use2D: bool = True,
                 confId: int = -1,
                 **kwargs) -> None:
        """
        Initialize an AtomPairFingerprintCallbackHash object.

        Parameters
        ----------
        nBits: int
            The size of the fingerprint.
        minLength: int
            Minimum distance between atoms to be considered in a pair.
        maxLength: int
            Maximum distance between atoms to be considered in a pair.
        includeChirality: bool
            If set, chirality will be used in the atom invariants.
        use2D: bool
            If set, the 2D (topological) distance matrix is used.
            Stored on the instance but not read by ``_featurize``, which always uses
            ``rdmolops.GetDistanceMatrix``.
        confId: int
            The conformation to use if 3D distances are being used.
            Stored on the instance but not read by ``_featurize``.
        **kwargs
            Forwarded unchanged to the MolecularFeaturizer constructor.
        """
        super().__init__(**kwargs)
        self.nBits = nBits
        self.minLength = minLength
        self.maxLength = maxLength
        self.includeChirality = includeChirality
        self.use2D = use2D
        self.confId = confId
        self.feature_names = [f'atom_pair_hash_{i}' for i in range(self.nBits)]

    @staticmethod
    def hash_function(bit, value):
        """
        Hash function for atom pair fingerprint.

        Folds ``value`` into the running value ``bit``.

        Parameters
        ----------
        bit: int or float
            The running hash value to be updated.
        value: int or float
            The value to be hashed into ``bit``.

        Returns
        -------
        bit: float
            The updated running value. It is a float because of the true
            division by 4.
        """
        # NOTE(review): the true division (rather than an integer shift) and the
        # left-to-right summation order are kept exactly as they were; changing
        # either would move bits and break fingerprints already in use.
        return hash(value) + 0x9e3779b9 + (bit * (2 ** 6)) + (bit / (2 ** 2))

    def _featurize(self, mol: Mol) -> np.ndarray:
        """
        Calculate AtomPairFingerprintCallbackHash for a single molecule.

        Parameters
        ----------
        mol: Mol
            RDKit Mol object

        Returns
        -------
        fp: np.ndarray
            The hashed atom pair fingerprint as a float32 numpy array of
            length ``nBits``.
        """
        distances = rdmolops.GetDistanceMatrix(mol)
        n_atoms = mol.GetNumAtoms()
        # Atom codes depend only on the atom, so compute each one once instead
        # of recomputing both codes for every pair inside the O(n^2) loop.
        atom_codes = [
            GetAtomPairAtomCode(mol.GetAtomWithIdx(idx), includeChirality=self.includeChirality)
            for idx in range(n_atoms)
        ]

        fp = [0] * self.nBits
        for at1 in range(n_atoms):
            code1 = atom_codes[at1]
            for at2 in range(at1 + 1, n_atoms):
                distance = distances[at1][at2]
                if not self.minLength <= int(distance) <= self.maxLength:
                    continue
                code2 = atom_codes[at2]
                # Hash (smaller code, distance, larger code) so the result does
                # not depend on the order in which the two atoms are visited.
                bit = self.hash_function(0, min(code1, code2))
                bit = self.hash_function(bit, distance)
                bit = self.hash_function(bit, max(code1, code2))
                fp[int(bit % self.nBits)] = 1

        return np.asarray(fp, dtype=np.float32)