Source code for deepmol.models.sklearn_models

from typing import Sequence

import numpy as np
from sklearn.base import BaseEstimator

from deepmol.models._utils import save_to_disk
from deepmol.models.models import Model
from deepmol.datasets import Dataset
from deepmol.splitters.splitters import RandomSplitter, SingletaskStratifiedSplitter
from deepmol.metrics.metrics import Metric

from deepmol.utils.utils import load_from_disk

from sklearn.base import clone


class SklearnModel(Model):
    """
    Wrapper class that wraps scikit-learn models.

    The `SklearnModel` class provides a wrapper around scikit-learn models that allows
    scikit-learn models to be trained on `Dataset` objects and evaluated with the metrics
    in Metrics.
    """

    def __init__(self, model: BaseEstimator, mode: str = None, model_dir: str = None, **kwargs):
        """
        Initializes a `SklearnModel` object.

        Parameters
        ----------
        model: BaseEstimator
            The model instance which inherits a scikit-learn `BaseEstimator` Class.
        mode: str
            'classification' or 'regression'. When left as None, `fit` does not check
            the dataset mode against the model mode.
        model_dir: str
            If specified the model will be stored in this directory. Else, a temporary
            directory will be used.
        kwargs: dict
            Additional keyword arguments, forwarded to the parent `Model` constructor.
        """
        super().__init__(model, model_dir, **kwargs)
        self.mode = mode
        self.model_type = 'sklearn'

    def fit_on_batch(self, X: Sequence, y: Sequence):
        """
        Fits model on batch of data.

        Placeholder: this wrapper has no batch-wise training, so the method has no
        body and returns None.
        """

    def get_task_type(self) -> str:
        """
        Returns the task type of the model.

        Placeholder: the method has no body and currently returns None.
        """

    def get_num_tasks(self) -> int:
        """
        Returns the number of tasks.

        Placeholder: the method has no body and currently returns None.
        """

    def fit(self, dataset: Dataset) -> BaseEstimator:
        """
        Fits scikit-learn model to data.

        Parameters
        ----------
        dataset: Dataset
            The `Dataset` to train this model on.

        Returns
        -------
        BaseEstimator
            The value returned by the wrapped estimator's `fit` call (for scikit-learn
            estimators, the fitted estimator itself).

        Raises
        ------
        ValueError
            If the model has a mode set and it differs from `dataset.mode`.
        """
        if self.mode is not None and self.mode != dataset.mode:
            raise ValueError(f'The mode of the dataset must match the mode of the model. '
                             f'Got {dataset.mode} for dataset and {self.mode} for model.')
        features = dataset.X
        # Drop singleton dimensions so that a label array with a length-1 axis becomes 1-D.
        y = np.squeeze(dataset.y)
        return self.model.fit(features, y)

    def predict(self, dataset: Dataset) -> np.ndarray:
        """
        Makes predictions on dataset.

        Parameters
        ----------
        dataset: Dataset
            Dataset to make prediction on.

        Returns
        -------
        np.ndarray
            The value is a return value of `predict_proba` or `predict` method of the
            scikit-learn model. If the scikit-learn model has both methods, the value is
            always a return value of `predict_proba`.
        """
        try:
            return self.model.predict_proba(dataset.X)
        except AttributeError:
            # The estimator exposes no usable `predict_proba`; fall back to `predict`.
            return self.model.predict(dataset.X)

    def predict_on_batch(self, dataset: Dataset) -> np.ndarray:
        """
        Makes predictions on batch of data.

        Delegates to the parent class' `predict` implementation (not to
        `SklearnModel.predict`).

        Parameters
        ----------
        dataset: Dataset
            Dataset to make prediction on.

        Returns
        -------
        np.ndarray
            numpy array of predictions.
        """
        return super(SklearnModel, self).predict(dataset)

    def save(self):
        """
        Saves scikit-learn model to disk using joblib.

        The wrapped estimator is written to the file name derived from `self.model_dir`.
        """
        save_to_disk(self.model, self.get_model_filename(self.model_dir))

    def reload(self):
        """
        Loads scikit-learn model from joblib file on disk.

        The wrapped estimator is replaced by the object read from the file name derived
        from `self.model_dir`.
        """
        self.model = load_from_disk(self.get_model_filename(self.model_dir))

    def cross_validate(self, dataset: Dataset, metric: Metric, folds: int = 3):
        """
        Performs cross-validation on a dataset.

        A fresh clone of the wrapped model is trained on every fold. The fold whose test
        score is the highest is reported as the best model, i.e. the metric is treated as
        "higher is better".

        Parameters
        ----------
        dataset: Dataset
            Dataset to perform cross-validation on.
        metric: Metric
            Metric to evaluate model performance.
        folds: int
            Number of folds to use for cross-validation.

        Returns
        -------
        Tuple[SklearnModel, float, float, List[float], List[float], float, float]
            In order: the best model, the train score of the best model, the test score
            of the best model, the train scores of all folds, the test scores of all
            folds, the average train score over the folds and the average test score
            over the folds.
        """
        # TODO: add option to choose between splitters
        if dataset.mode == 'classification':
            datasets = SingletaskStratifiedSplitter().k_fold_split(dataset, folds)
        elif dataset.mode == 'regression':
            datasets = RandomSplitter().k_fold_split(dataset, folds)
        else:
            # Unknown mode: prefer a stratified split and deliberately fall back to a
            # random split if stratification is not possible for this dataset.
            try:
                datasets = SingletaskStratifiedSplitter().k_fold_split(dataset, folds)
            except Exception:
                datasets = RandomSplitter().k_fold_split(dataset, folds)

        train_scores = []
        test_scores = []
        best_model = None
        train_score_best_model = 0
        test_score_best_model = 0

        for train_ds, test_ds in datasets:
            # Train an unfitted copy so that folds do not share learned state.
            fold_model = clone(SklearnModel(model=self.model))
            fold_model.fit(train_ds)

            train_score = fold_model.evaluate(train_ds, metric)[0][metric.name]
            test_score = fold_model.evaluate(test_ds, metric)[0][metric.name]
            train_scores.append(train_score)
            test_scores.append(test_score)

            # Always accept the first fold: comparing only against the initial 0 would
            # leave `best_model` as None when every test score is <= 0.
            if best_model is None or test_score > test_score_best_model:
                best_model = fold_model
                train_score_best_model = train_score
                test_score_best_model = test_score

        return (best_model, train_score_best_model, test_score_best_model,
                train_scores, test_scores,
                sum(train_scores) / folds, sum(test_scores) / folds)