Source code for deepmol.parameter_optimization.deepchem_hyperparameter_optimization

"""Hyperparameter Optimization Classes for DeepchemModel models"""
from collections import defaultdict
from typing import Union, List

import numpy as np
from sklearn.model_selection import ParameterGrid, ParameterSampler

from deepmol.datasets import Dataset
from deepmol.metrics import Metric
from deepmol.splitters import SingletaskStratifiedSplitter, RandomSplitter


# TODO: it would probably be better if we tried to create a scikit-learn wrapper for DeepchemModel models,
#  similar to KerasRegressor and KerasClassifier (not sure it would work though)
class DeepchemBaseSearchCV(object):
    """
    Base class for hyperparameter search with cross-validation for DeepChemModels.

    Subclasses are expected to hand over an iterable of parameter dictionaries (``param_grid``); every
    dictionary is expanded as keyword arguments of ``model_build_fn`` to create a fresh model per fold.

    Attributes
    ----------
    best_score_: float
        Mean test score of the best parameter combination (``None`` before ``fit``).
    best_params_: dict
        The parameter combination that achieved ``best_score_``.
    best_estimator_:
        A model built with ``best_params_`` (fitted on the whole dataset only when ``refit`` is True).
    cv_results_: dict
        Per-combination results with the keys ``params``, ``mean_train_score``, ``mean_test_score``,
        ``std_train_score``, ``std_test_score`` and ``split<i>_train_score`` / ``split<i>_test_score``.
    """

    def __init__(self,
                 model_build_fn: callable,
                 param_grid: Union[dict, ParameterGrid, ParameterSampler],
                 scoring: Union[Metric, List[Metric]],
                 maximize: bool,
                 refit: bool,
                 cv: int,
                 mode: str,
                 random_state: int = None,
                 return_train_score: bool = False):
        """
        Initialize the hyperparameter search.

        Parameters
        ----------
        model_build_fn: callable
            A function that builds a DeepchemModel model.
        param_grid: Union[dict, ParameterGrid, ParameterSampler]
            The hyperparameter combinations to search. It is iterated directly in ``fit`` and each item is
            passed to ``model_build_fn`` as keyword arguments.
        scoring: Union[Metric, List[Metric]]
            The metric(s) to use for scoring. When a list is given, the first metric is the one used to
            score and select the best parameter combination.
        maximize: bool
            If True, maximize the metric. If False, minimize the metric.
        refit: bool
            If True, refit the best model on the whole dataset.
        cv: int
            The number of folds for cross-validation.
        mode: str
            The mode of the model. Can be 'classification' or 'regression'.
        random_state: int
            The random state to use for the cross-validation.
        return_train_score: bool
            Stored on the instance. Training scores are currently always computed and recorded in
            ``cv_results_`` regardless of this flag.
        """
        self.build_fn = model_build_fn
        self.param_grid = param_grid
        self.metric = scoring
        self.maximize = maximize
        self.mode = mode
        self.refit = refit
        self.cv = cv
        self.random_state = random_state
        self.return_train_score = return_train_score
        self.best_score_ = None
        self.best_params_ = {}
        self.best_estimator_ = None
        self.cv_results_ = None

    def fit(self, dataset: Dataset):
        """
        Run hyperparameter search with cross-validation.

        Parameters
        ----------
        dataset: Dataset
            The dataset to use for the hyperparameter search.

        Raises
        ------
        ValueError
            If the mode of the search and the mode of the dataset differ, or if ``scoring`` is an empty list.
        """
        if self.mode != dataset.mode:
            raise ValueError(f'The mode of the model and the dataset must be the same. Got {self.mode} and '
                             f'{dataset.mode} respectively.')

        # ``scoring`` may be a single Metric or a list of them; the first one drives model selection.
        metrics = list(self.metric) if isinstance(self.metric, (list, tuple)) else [self.metric]
        if not metrics:
            raise ValueError('At least one scoring metric must be provided.')
        selection_metric = metrics[0]

        # Forget the outcome of any previous call so that stale scores cannot win the comparison below.
        self.best_score_ = None
        self.best_params_ = {}
        self.best_estimator_ = None
        self.cv_results_ = None

        results_dict = defaultdict(list)

        # split dataset into folds
        if dataset.mode == 'classification':
            splitter = SingletaskStratifiedSplitter()
        else:
            splitter = RandomSplitter()
        # The same folds are reused for every parameter combination, so materialise them once; this also
        # protects against k_fold_split handing back a one-shot iterator.
        folds = list(splitter.k_fold_split(dataset, k=self.cv, seed=self.random_state))

        for param_combination in self.param_grid:
            results_dict['params'].append(param_combination)

            # Cross-validation:
            train_scores = []
            test_scores = []
            for train_dataset, test_dataset in folds:
                model = self.build_fn(**param_combination)  # creates a new DeepchemModel
                model.fit(train_dataset)
                train_score, _ = model.evaluate(train_dataset, [selection_metric])
                train_scores.append(train_score[selection_metric.name])
                test_score, _ = model.evaluate(test_dataset, [selection_metric])
                test_scores.append(test_score[selection_metric.name])

            mean_test_score = np.mean(test_scores)
            results_dict['mean_train_score'].append(np.mean(train_scores))
            results_dict['mean_test_score'].append(mean_test_score)
            results_dict['std_train_score'].append(np.std(train_scores))
            results_dict['std_test_score'].append(np.std(test_scores))
            for i, (train_score, test_score) in enumerate(zip(train_scores, test_scores)):
                results_dict[f'split{i}_train_score'].append(train_score)
                results_dict[f'split{i}_test_score'].append(test_score)

            # Strict comparison: on ties the earliest combination is kept.
            if self.best_score_ is None:
                is_better = True
            elif self.maximize:
                is_better = mean_test_score > self.best_score_
            else:
                is_better = mean_test_score < self.best_score_
            if is_better:
                self.best_score_ = mean_test_score
                self.best_params_ = param_combination

        self.cv_results_ = results_dict
        # The best estimator is always rebuilt from the winning parameters; it is only trained when refit is set.
        self.best_estimator_ = self.build_fn(**self.best_params_)
        if self.refit:
            self.best_estimator_.fit(dataset)
class DeepchemGridSearchCV(DeepchemBaseSearchCV):
    """
    Hyperparameter search with cross-validation for DeepChemModels using a grid search.
    """

    def __init__(self,
                 model_build_fn: callable,
                 param_grid: Union[dict, ParameterGrid],
                 scoring: Union[Metric, List[Metric]],
                 maximize: bool,
                 refit: bool,
                 cv: int,
                 mode: str,
                 random_state: int = None,
                 return_train_score: bool = False):
        """
        Initialize the hyperparameter search.

        Parameters
        ----------
        model_build_fn: callable
            A function that builds a DeepchemModel model.
        param_grid: Union[dict, ParameterGrid]
            The hyperparameter grid to search. A dict is expanded with ``ParameterGrid``; an existing
            ``ParameterGrid`` instance is used as is.
        scoring: Union[Metric, List[Metric]]
            The metric to use for scoring.
        maximize: bool
            If True, maximize the metric. If False, minimize the metric.
        refit: bool
            If True, refit the best model on the whole dataset.
        cv: int
            The number of folds for cross-validation.
        mode: str
            The mode of the model. Can be 'classification' or 'regression'.
        random_state: int
            The random state to use for the cross-validation.
        return_train_score: bool
            If True, return the training scores.
        """
        # Re-wrapping a ParameterGrid would make scikit-learn treat each concrete combination as a grid
        # specification, so only wrap raw grid definitions.
        if not isinstance(param_grid, ParameterGrid):
            param_grid = ParameterGrid(param_grid)
        self.param_grid = param_grid
        super().__init__(model_build_fn=model_build_fn,
                         param_grid=self.param_grid,
                         scoring=scoring,
                         maximize=maximize,
                         refit=refit,
                         cv=cv,
                         mode=mode,
                         random_state=random_state,
                         return_train_score=return_train_score)
class DeepchemRandomSearchCV(DeepchemBaseSearchCV):
    """
    Hyperparameter search with cross-validation for DeepChemModels using a random search.
    """

    def __init__(self,
                 model_build_fn: callable,
                 param_distributions: Union[dict, ParameterSampler],
                 scoring: Union[Metric, List[Metric]],
                 maximize: bool,
                 refit: bool,
                 cv: int,
                 mode: str,
                 random_state: int = None,
                 return_train_score: bool = False,
                 n_iter: int = 20):
        """
        Initialize the hyperparameter search.

        Parameters
        ----------
        model_build_fn: callable
            A function that builds a DeepchemModel model.
        param_distributions: Union[dict, ParameterSampler]
            The hyperparameter distributions to sample from. A dict is wrapped in a ``ParameterSampler``
            using ``n_iter`` and ``random_state``; an existing ``ParameterSampler`` instance is used as is
            (in which case ``n_iter`` is not applied to it).
        scoring: Union[Metric, List[Metric]]
            The metrics to use for scoring.
        maximize: bool
            If True, maximize the metric. If False, minimize the metric.
        refit: bool
            If True, refit the best model on the whole dataset.
        cv: int
            The number of folds for cross-validation.
        mode: str
            The mode of the model. Can be 'classification' or 'regression'.
        random_state: int
            The random state to use for the cross-validation (and for sampling when a dict is given).
        return_train_score: bool
            If True, return the training scores.
        n_iter: int
            The number of parameter settings to sample.
        """
        # Re-wrapping a ParameterSampler would make scikit-learn treat each sampled combination as a
        # distribution specification, so only wrap raw distribution definitions.
        if not isinstance(param_distributions, ParameterSampler):
            param_distributions = ParameterSampler(param_distributions, n_iter, random_state=random_state)
        self.param_grid = param_distributions
        super().__init__(model_build_fn=model_build_fn,
                         param_grid=self.param_grid,
                         scoring=scoring,
                         maximize=maximize,
                         refit=refit,
                         cv=cv,
                         mode=mode,
                         random_state=random_state,
                         return_train_score=return_train_score)