Source code for deepmol.compound_featurization.mol2vec

import os
from typing import Iterable

import numpy as np
from gensim.models import Word2Vec, word2vec
from mol2vec.features import mol2alt_sentence, MolSentence
from rdkit.Chem import Mol

from deepmol.compound_featurization import MolecularFeaturizer


def sentences2vec(sentences: Iterable, model: "Word2Vec", unseen: str = None) -> np.ndarray:
    """
    Generate vectors for each sentence (list) in a list of sentences. Vector is simply a sum of vectors for
    individual words.

    Parameters
    ----------
    sentences : Iterable
        Iterable of sentences; each sentence is itself an iterable of words.
    model : Word2Vec
        Gensim Word2Vec model. Its ``wv.key_to_index`` defines the known vocabulary and ``wv.get_vector``
        supplies the word vectors.
    unseen : None, str
        Keyword for unseen words. If None (or any other falsy value), those words are skipped.
        https://stats.stackexchange.com/questions/163005/how-to-set-the-dictionary-for-text-analysis-using-neural-networks/163032#163032

    Returns
    -------
    np.array
        Array of vectors for each sentence. A sentence that contributes no word vectors (it is empty, or all
        of its words are unknown and skipped) is represented by a zero vector of length ``model.vector_size``.
    """
    vocabulary = model.wv.key_to_index
    get_vector = model.wv.get_vector

    # The replacement vector does not depend on the sentence, so look it up once.
    # A keyword that the model does not know therefore fails here, before any sentence is processed.
    unseen_vec = get_vector(unseen) if unseen else None

    vec = []
    for sentence in sentences:
        if unseen:
            # Unknown words are replaced by the vector of the `unseen` keyword.
            word_vectors = [get_vector(word) if word in vocabulary else unseen_vec for word in sentence]
        else:
            # Unknown words are silently dropped.
            word_vectors = [get_vector(word) for word in sentence if word in vocabulary]

        if word_vectors:
            vec.append(sum(word_vectors))
        else:
            # sum([]) would be the int 0, which cannot be stacked with real vectors into a regular array.
            # float32 is assumed to match the dtype of gensim word vectors -- TODO confirm for custom models.
            vec.append(np.zeros(model.vector_size, dtype=np.float32))
    return np.array(vec)
class Mol2Vec(MolecularFeaturizer):
    """
    Mol2Vec fingerprint implementation from https://doi.org/10.1021/acs.jcim.7b00616

    Inspired by natural language processing techniques, Mol2vec is an unsupervised machine learning approach
    to learn vector representations of molecular substructures. Mol2vec learns vector representations of
    molecular substructures that point in similar directions for chemically related substructures. Compounds
    can finally be encoded as vectors by summing the vectors of the individual substructures and, for
    instance, be fed into supervised machine learning approaches to predict compound properties.
    """

    def __init__(self,
                 pretrain_model_path: str = None,
                 radius: int = 1,
                 unseen: str = 'UNK',
                 gather_method: str = 'sum'):
        """
        Parameters
        ----------
        pretrain_model_path: str
            Path to pretrained model. If this value is None, we use the model_300dim.pkl model.
            The model is trained on 20 million compounds downloaded from ZINC.
        radius: int
            The fingerprint radius. The default value was used to train the model_300dim.pkl model.
        unseen: str
            The string used to replace uncommon words/identifiers while training.
        gather_method: str
            How to aggregate the identifier vectors extracted from Mol2vec. 'sum' or 'mean' is supported.
        """
        super().__init__()
        self.radius = radius
        self.unseen = unseen
        self.gather_method = gather_method
        self.sentences2vec = sentences2vec

        if pretrain_model_path is None:
            # Fall back to the model file located relative to the parent directory of this module's directory.
            package_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            pretrain_model_path = os.path.join(package_root, "compound_featurization", "mol2vec_models",
                                               "model_300dim.pkl")

        self.model = word2vec.Word2Vec.load(pretrain_model_path)
        # One feature name per embedding dimension of the loaded model.
        self.feature_names = [f"mol2vec_{i}" for i in range(self.model.vector_size)]

    def _featurize(self, mol: Mol):
        """
        Calculate mol2vec fingerprints.

        Parameters
        ----------
        mol: Mol
            RDKit Mol object

        Returns
        -------
        features: np.ndarray
            1D array of mol2vec fingerprint. The default length is 300.

        Raises
        ------
        ValueError
            If `gather_method` is neither 'sum' nor 'mean'.
        """
        sentence = MolSentence(mol2alt_sentence(mol, self.radius))
        # NOTE(review): sentences2vec iterates its first argument as a collection of sentences and each
        # sentence as a collection of words, while a single MolSentence is passed here -- confirm that
        # iterating it item by item yields the intended words.
        vec_identifiers = self.sentences2vec(sentence, self.model, unseen=self.unseen)

        if self.gather_method == 'sum':
            return np.sum(vec_identifiers, axis=0)
        if self.gather_method == 'mean':
            return np.mean(vec_identifiers, axis=0)
        raise ValueError('Not supported gather_method type. Please set "sum" or "mean"')