# Source code for deepmol.utils.utils

import pandas as pd
import joblib
import os
from typing import Any, cast, IO, List, Union
import gzip
import pickle
import numpy as np

from rdkit.Chem import Mol, rdmolfiles, rdmolops

from rdkit import Chem


def get_class(name: str) -> object:
    """
    Get a class (or any other module-level attribute) from its dotted path.

    Parameters
    ----------
    name: str
        Fully qualified name such as ``"package.module.ClassName"``. Everything before the last dot is
        imported as a module; the last component is looked up on that module.

    Returns
    -------
    object
        The attribute named by the last component of `name`.

    Raises
    ------
    ValueError
        If `name` has no module part (it contains no dot, or nothing precedes the last dot).
    ImportError
        If the module part cannot be imported.
    AttributeError
        If the imported module does not expose the requested attribute.
    """
    module_path, _, attribute = name.rpartition(".")
    if not module_path:
        # Without this guard the call below fails with the unhelpful "Empty module name".
        raise ValueError(
            f"Expected a dotted path such as 'package.module.ClassName', got {name!r}"
        )
    # A non-empty `fromlist` makes __import__ return the innermost module instead of the
    # top-level package, and also lets the last component be a submodule.
    module = __import__(module_path, fromlist=[attribute])
    return getattr(module, attribute)
def smiles_to_mol(smiles: str, **kwargs) -> Union[Mol, None]:
    """
    Build an RDKit molecule object from a SMILES string.

    Parameters
    ----------
    smiles: str
        SMILES string to parse.
    kwargs:
        Extra keyword arguments forwarded unchanged to `rdkit.Chem.MolFromSmiles`.

    Returns
    -------
    Union[Mol, None]
        Whatever `Chem.MolFromSmiles` returns, or None when the call raises a TypeError.
    """
    try:
        molecule = Chem.MolFromSmiles(smiles, **kwargs)
    except TypeError:
        # Only TypeError is treated as "no molecule"; any other exception propagates.
        molecule = None
    return molecule
def mol_to_smiles(mol: Mol, **kwargs) -> Union[str, None]:
    """
    Convert an RDKit molecule object to a SMILES string.

    Parameters
    ----------
    mol: Mol
        RDKit molecule object to convert.
    kwargs:
        Extra keyword arguments forwarded unchanged to `rdkit.Chem.MolToSmiles`.

    Returns
    -------
    Union[str, None]
        Whatever `Chem.MolToSmiles` returns, or None when the call raises a TypeError.
    """
    try:
        smiles = Chem.MolToSmiles(mol, **kwargs)
    except TypeError:
        # Only TypeError is treated as "no SMILES"; any other exception propagates.
        smiles = None
    return smiles
def canonicalize_mol_object(mol_object: Mol) -> Mol:
    """
    Renumber the atoms of a molecule using RDKit's canonical atom ranking.

    This is best effort: if either RDKit call raises, the input molecule is returned unchanged.

    Parameters
    ----------
    mol_object: Mol
        Molecule object to canonicalize.

    Returns
    -------
    Mol
        The renumbered molecule, or `mol_object` itself when renumbering failed.
    """
    try:
        # NOTE(review): RenumberAtoms documents newOrder[i] as the *old* index of the atom placed
        # at position i, while CanonicalRankAtoms yields the rank of each atom. Confirm that
        # passing the ranks directly produces the intended ordering.
        atom_ranks = rdmolfiles.CanonicalRankAtoms(mol_object)
        return rdmolops.RenumberAtoms(mol_object, atom_ranks)
    except Exception:
        # Deliberate fallback: any failure leaves the molecule as it was given.
        return mol_object
def load_pickle_file(input_file: str) -> Any:
    """
    Load an object from a single, possibly gzipped, pickle file.

    Parameters
    ----------
    input_file: str
        Path of the pickle file. A path whose name ends in ``.gz`` (e.g. `XXXX.pkl.gz`) is
        decompressed with gzip before unpickling; any other path is read as a plain pickle.

    Returns
    -------
    Any
        The object loaded from the pickle file.
    """
    # NOTE: unpickling can execute arbitrary code; only load files from trusted sources.
    # Test the suffix rather than searching the whole path, so a ".gz" elsewhere in the path
    # (e.g. a directory called "run.gz_dir") does not send a plain pickle through gzip.
    if os.fspath(input_file).endswith(".gz"):
        with gzip.open(input_file, "rb") as unzipped_file:
            return pickle.load(cast(IO[bytes], unzipped_file))
    with open(input_file, "rb") as opened_file:
        return pickle.load(opened_file)
def load_from_disk(filename: str) -> Any:
    """
    Load an object from a file, choosing the loader from the file extension.

    A trailing ``.gz`` is ignored when choosing the loader, so `data.csv.gz` is handled like
    `data.csv`.

    Parameters
    ----------
    filename: str
        Path of the file to load. Recognized extensions are ``.pkl``, ``.joblib``, ``.csv`` and
        ``.npy``, each optionally followed by ``.gz``.

    Returns
    -------
    Any
        The loaded object: the unpickled object for ``.pkl``, the result of `joblib.load` for
        ``.joblib``, a DataFrame with NaN replaced by empty strings for ``.csv``, or the result of
        `np.load` for ``.npy``.

    Raises
    ------
    ValueError
        If the extension is not one of the recognized types.
    """
    stem, extension = os.path.splitext(filename)
    is_gzipped = extension == ".gz"
    if is_gzipped:
        # Dispatch on the extension that precedes ".gz".
        extension = os.path.splitext(stem)[1]

    if extension == ".pkl":
        return load_pickle_file(filename)
    if extension == ".joblib":
        return joblib.load(filename)
    if extension == ".csv":
        # First line of user-specified CSV *must* be header.
        df = pd.read_csv(filename, header=0)
        return df.replace(np.nan, str(""), regex=True)
    if extension == ".npy":
        # NOTE: allow_pickle=True lets np.load unpickle object arrays; only load trusted files.
        if is_gzipped:
            # np.load cannot decompress gzip from a path, so pass it a decompressed stream.
            with gzip.open(filename, "rb") as unzipped_file:
                return np.load(unzipped_file, allow_pickle=True)
        return np.load(filename, allow_pickle=True)
    raise ValueError("Unrecognized filetype for %s" % filename)
def normalize_labels_shape(y_pred: Union[List, np.ndarray], n_tasks: int) -> np.ndarray:
    """
    Collapse model outputs to one value per prediction.

    Per-class outputs such as those of `predict_proba` are reduced as described in
    `_normalize_singletask_labels_shape`; predictions that are already scalar pass through.

    Parameters
    ----------
    y_pred: array
        Predictions. Lists are converted to a NumPy array first.
    n_tasks: int
        Number of tasks. With ``n_tasks == 1`` the whole input is treated as one task;
        otherwise each entry along the first axis of `y_pred` is normalized separately.

    Returns
    -------
    labels
        Array of normalized predictions: one-dimensional for a single task, otherwise one row per
        entry along the first axis of `y_pred`.
    """
    if not isinstance(y_pred, np.ndarray):
        y_pred = np.array(y_pred)

    if n_tasks == 1:
        return _normalize_singletask_labels_shape(y_pred)

    if y_pred.ndim == 3:
        if y_pred.shape[2] > 1:
            # Keep the value at index 1 of the last axis.
            # NOTE(review): for more than two classes the single-task path uses argmax instead;
            # confirm index 1 is intended here.
            y_pred = y_pred[:, :, 1]
        else:
            y_pred = y_pred.reshape(y_pred.shape[0], y_pred.shape[1])

    return np.array([_normalize_singletask_labels_shape(task) for task in y_pred])


def _normalize_singletask_labels_shape(y_pred: Union[List, np.ndarray]) -> np.ndarray:
    """
    Collapse the predictions of a single task to one value per sample.

    The layout is inferred from the first element of `y_pred`:

    - scalars (float, int or bool), e.g. ``[0.1, 0.9, ...]``: returned unchanged as an array;
    - one value per sample, e.g. ``[[0.1], [0.2], ...]``: that value;
    - two values per sample, e.g. ``[[0.1, 0.9], ...]``: the second value;
    - more than two values per sample: the index of the largest value (argmax).

    Parameters
    ----------
    y_pred: array
        Predictions of one task.

    Returns
    -------
    labels
        One-dimensional array with one entry per sample; an empty array for empty input.

    Raises
    ------
    ValueError
        If the first sample is an empty sequence.
    """
    # Explicit length check: truthiness of a NumPy array is ambiguous, and indexing element 0
    # of an empty input would raise IndexError.
    if len(y_pred) == 0:
        return np.array(y_pred)

    first = y_pred[0]
    # np.bool_ is neither np.integer nor int, so it is listed explicitly; otherwise boolean
    # predictions would reach len() below and fail.
    if isinstance(first, (np.floating, float, np.integer, int, np.bool_)):
        return np.array(y_pred)

    n_outputs = len(first)
    if n_outputs == 1:
        return np.array([sample[0] for sample in y_pred])
    if n_outputs == 2:
        return np.array([sample[1] for sample in y_pred])
    if n_outputs > 2:
        return np.array([np.argmax(sample) for sample in y_pred])
    raise ValueError("Unknown format for y_pred!")