Source code for deepmol.loaders.loaders

"""
Classes for processing input data into a format suitable for machine learning.
"""

from typing import Optional, List

from deepmol.datasets import SmilesDataset
import numpy as np
import pandas as pd

from deepmol.loaders._utils import load_csv_file, load_sdf_file


class CSVLoader(object):
    """
    Loader that builds a SmilesDataset straight from a CSV file.

    The file is expected to be comma-separated and to start with a header row.
    """

    def __init__(self,
                 dataset_path: str,
                 smiles_field: str,
                 id_field: str = None,
                 labels_fields: List[str] = None,
                 features_fields: List[str] = None,
                 shard_size: int = None,
                 mode: str = 'auto') -> None:
        """
        Set up the loader and work out which columns have to be read.

        Parameters
        ----------
        dataset_path: str
            Path to the CSV file.
        smiles_field: str
            Name of the column holding the molecules.
        id_field: str
            Name of the column holding the identifiers (optional).
        labels_fields: List[str]
            Names of the columns holding the labels (optional).
        features_fields: List[str]
            Names of the columns holding the features (optional).
        shard_size: int
            Size of the shard to load; forwarded as the chunk size when reading.
        mode: str
            The mode of the dataset.
            If 'auto', the mode is inferred from the labels.
            If 'classification', the dataset is treated as a classification dataset.
            If 'regression', the dataset is treated as a regression dataset.
        """
        self.dataset_path = dataset_path
        self.mols_field = smiles_field
        self.id_field = id_field
        self.labels_fields = labels_fields
        self.features_fields = features_fields
        self.shard_size = shard_size
        self.mode = mode

        # Column order is: molecules, id, labels, features.
        extra_columns = []
        if id_field is not None:
            extra_columns.append(id_field)
        for column_group in (labels_fields, features_fields):
            if column_group is not None:
                extra_columns.extend(column_group)
        self.fields2keep = [smiles_field, *extra_columns]

    @staticmethod
    def _get_dataset(dataset_path: str,
                     fields: List[str] = None,
                     chunk_size: int = None,
                     **kwargs) -> pd.DataFrame:
        """
        Read the CSV file through ``load_csv_file``.

        Parameters
        ----------
        dataset_path: str
            Path to the CSV file.
        fields: List[str]
            Columns to keep.
        chunk_size: int
            Size of the shard to load.
        kwargs:
            Keyword arguments to pass to pandas.read_csv.

        Returns
        -------
        pd.DataFrame
            The loaded data.
        """
        frame = load_csv_file(dataset_path, fields, chunk_size, **kwargs)
        return frame

    def create_dataset(self, **kwargs) -> SmilesDataset:
        """
        Read the CSV file and wrap its contents in a SmilesDataset.

        Parameters
        ----------
        kwargs:
            Keyword arguments to pass to pandas.read_csv.

        Returns
        -------
        SmilesDataset
            Dataset holding the molecules plus whichever of features, labels
            and ids were configured on this loader (the others are None).
        """
        frame = self._get_dataset(self.dataset_path,
                                  fields=self.fields2keep,
                                  chunk_size=self.shard_size,
                                  **kwargs)

        def extract(columns):
            # No columns configured -> nothing to hand to the dataset.
            if columns is None:
                return None
            # A lone column is selected by name, several columns by list.
            selector = columns[0] if len(columns) == 1 else columns
            return frame[selector].to_numpy()

        smiles = frame[self.mols_field].to_numpy()
        features = extract(self.features_fields)
        labels = extract(self.labels_fields)
        identifiers = frame[self.id_field].to_numpy() if self.id_field is not None else None

        return SmilesDataset(smiles=smiles,
                             X=features,
                             y=labels,
                             ids=identifiers,
                             feature_names=self.features_fields,
                             label_names=self.labels_fields,
                             mode=self.mode)
class SDFLoader(object):
    """
    A Loader to directly read data from an SDF file.
    """

    def __init__(self,
                 dataset_path: str,
                 id_field: Optional[str] = None,
                 labels_fields: Optional[List[str]] = None,
                 features_fields: Optional[List[str]] = None,
                 shard_size: Optional[int] = None,
                 mode: str = 'auto') -> None:
        """
        Initialize the SDFLoader.

        Parameters
        ----------
        dataset_path: str
            path to the dataset file
        id_field: str
            molecule property containing the ids (optional)
        labels_fields: List[str]
            molecule properties containing the labels (optional)
        features_fields: List[str]
            molecule properties containing the features (optional)
        shard_size: int
            size of the shard to load
        mode: str
            The mode of the dataset.
            If 'auto', the mode is inferred from the labels.
            If 'classification', the dataset is treated as a classification dataset.
            If 'regression', the dataset is treated as a regression dataset.
        """
        self.dataset_path = dataset_path
        self.id_field = id_field
        self.labels_fields = labels_fields
        self.features_fields = features_fields
        self.shard_size = shard_size

        # Property order is: id, labels, features.
        fields2keep = []
        if id_field is not None:
            fields2keep.append(id_field)
        if labels_fields is not None:
            fields2keep.extend(labels_fields)
        if features_fields is not None:
            fields2keep.extend(features_fields)
        self.fields2keep = fields2keep
        self.mode = mode

    @staticmethod
    def _get_dataset(dataset_path: str, chunk_size: int = None) -> np.ndarray:
        """
        Loads the molecules from path through ``load_sdf_file``.

        Parameters
        ----------
        dataset_path: str
            Filename to process
        chunk_size: int
            size of the shard to load

        Returns
        -------
        np.ndarray
            Whatever ``load_sdf_file`` returns; ``create_dataset`` iterates over
            it and calls ``GetProp`` on every item, so it is expected to be a
            collection of molecule objects rather than a DataFrame.
        """
        return load_sdf_file(dataset_path, chunk_size)

    @staticmethod
    def _read_props(mol, fields, convert=None):
        """Read ``fields`` from ``mol``; a single field yields a bare value instead of a one-item list."""
        if convert is None:
            values = [mol.GetProp(field) for field in fields]
        else:
            values = [convert(mol.GetProp(field)) for field in fields]
        return values[0] if len(values) == 1 else values

    def _ids_to_array(self, ids) -> Optional[np.ndarray]:
        """Turn the collected ids into an array, or None when no usable ids exist."""
        # Without an id field there is nothing to report. Deciding this from the
        # configuration (rather than from the collected values) keeps a
        # single-molecule file from producing array([None]).
        if self.id_field is None:
            return None
        # Ids that are not unique are discarded, as before.
        if len(set(ids)) != len(ids):
            return None
        return np.array(ids)

    def create_dataset(self) -> SmilesDataset:
        """
        Creates a dataset from the SDF file.

        Features, labels and ids are read from the molecule properties named
        in ``features_fields``, ``labels_fields`` and ``id_field``. Labels are
        converted to float. Anything that was not configured is passed to the
        dataset as None, as are ids that turn out not to be unique.

        Returns
        -------
        SmilesDataset
            Dataset with the data.
        """
        molecules = self._get_dataset(self.dataset_path, self.shard_size)

        X = []
        y = []
        ids = []
        mols = []
        for mol in molecules:
            mols.append(mol)
            if self.features_fields is not None:
                # NOTE(review): features are stored exactly as GetProp returns
                # them (no numeric cast), unlike labels - confirm this is intended.
                X.append(self._read_props(mol, self.features_fields))
            if self.labels_fields is not None:
                y.append(self._read_props(mol, self.labels_fields, convert=float))
            if self.id_field is not None:
                ids.append(mol.GetProp(self.id_field))

        X = np.array(X) if X else None
        # Presence of labels follows the configuration, so an empty file without
        # label fields yields None instead of an empty array.
        y = np.array(y) if self.labels_fields is not None else None
        ids = self._ids_to_array(ids)
        mols = np.array(mols)

        return SmilesDataset.from_mols(mols=mols,
                                       X=X,
                                       y=y,
                                       ids=ids,
                                       feature_names=self.features_fields,
                                       label_names=self.labels_fields,
                                       mode=self.mode)