# Source code for deepmol.parameter_optimization.deepchem_hyperparameter_optimization

"""Hyperparameter Optimization Classes for DeepchemModel models"""
from collections import defaultdict
from typing import Union, List

import numpy as np
from sklearn.model_selection import ParameterGrid, ParameterSampler

from deepmol.datasets import Dataset
from deepmol.metrics import Metric
from deepmol.splitters import SingletaskStratifiedSplitter, RandomSplitter


# TODO: it would probably be better if we tried to create a scikit-learn wrapper for DeepchemModel models,
#  similar to KerasRegressor and KerasClassifier (not sure it would work though)
class DeepchemBaseSearchCV(object):
    """
    Base class for hyperparameter search with cross-validation for DeepChemModels.

    For every parameter combination yielded by ``param_grid`` a fresh model is built with
    ``model_build_fn``, trained on each training fold and evaluated on both the training and the
    test fold. The combination with the best mean test score is kept.

    Attributes
    ----------
    best_score_: float
        Mean test score of the best parameter combination (None before `fit`).
    best_params_: dict
        The best parameter combination ({} before `fit`).
    best_estimator_:
        Model built with `best_params_`; trained on the whole dataset only when `refit` is True.
    cv_results_: dict
        Per-combination results ('params', 'mean_*_score', 'std_*_score', 'split<i>_*_score').
    """

    def __init__(self,
                 model_build_fn: callable,
                 param_grid: Union[dict, ParameterGrid, ParameterSampler],
                 scoring: Union[Metric, List[Metric]],
                 maximize: bool,
                 refit: bool,
                 cv: int,
                 mode: str,
                 random_state: int = None,
                 return_train_score: bool = False):
        """
        Initialize the hyperparameter search.

        Parameters
        ----------
        model_build_fn: callable
            A function that builds a DeepchemModel model. It is called with one parameter
            combination unpacked as keyword arguments.
        param_grid: Union[dict, ParameterGrid, ParameterSampler]
            The hyperparameters to search. A plain dict is expanded with ParameterGrid; any other
            value is iterated as-is and must yield one dict per parameter combination.
        scoring: Union[Metric, List[Metric]]
            The metric to use for scoring. When a list is given, the first metric is the one used
            to score and rank the parameter combinations.
        maximize: bool
            If True, maximize the metric. If False, minimize the metric.
        refit: bool
            If True, refit the best model on the whole dataset.
        cv: int
            The number of folds for cross-validation.
        mode: str
            The mode of the model. Can be 'classification' or 'regression'.
        random_state: int
            The random state to use for the cross-validation.
        return_train_score: bool
            Stored on the instance. Note that `fit` currently records the training scores in
            `cv_results_` regardless of this flag.
        """
        self.build_fn = model_build_fn
        # Iterating a plain dict yields its keys, not parameter combinations, so expand it first.
        if isinstance(param_grid, dict):
            param_grid = ParameterGrid(param_grid)
        self.param_grid = param_grid
        self.metric = scoring
        self.maximize = maximize
        self.mode = mode
        self.refit = refit
        self.cv = cv
        self.random_state = random_state
        self.return_train_score = return_train_score

        self.best_score_ = None
        self.best_params_ = {}
        self.best_estimator_ = None
        self.cv_results_ = None

    def _selection_metric(self):
        """
        Return the metric used to score and rank parameter combinations.

        Returns
        -------
        Metric
            `scoring` itself, or its first element when a list/tuple of metrics was given.

        Raises
        ------
        ValueError
            If `scoring` is an empty list/tuple.
        """
        if isinstance(self.metric, (list, tuple)):
            if not self.metric:
                raise ValueError('At least one metric must be provided in `scoring`.')
            return self.metric[0]
        return self.metric

    def fit(self, dataset: Dataset):
        """
        Run hyperparameter search with cross-validation.

        Populates `cv_results_`, `best_score_`, `best_params_` and `best_estimator_`. Results of a
        previous call are discarded, so the search can be safely re-run on another dataset.

        Parameters
        ----------
        dataset: Dataset
            The dataset to use for the hyperparameter search.

        Raises
        ------
        ValueError
            If the mode of the search and the mode of the dataset differ, or if `scoring` is an
            empty list.
        """
        if self.mode != dataset.mode:
            raise ValueError(f'The mode of the model and the dataset must be the same. Got {self.mode} and '
                             f'{dataset.mode} respectively.')
        metric = self._selection_metric()
        results_dict = defaultdict(list)

        # split dataset into folds
        if dataset.mode == 'classification':
            splitter = SingletaskStratifiedSplitter()
        else:
            splitter = RandomSplitter()

        # The same folds are reused for every parameter combination; materialise them once in case
        # the splitter hands back a one-shot iterator.
        folds = list(splitter.k_fold_split(dataset, k=self.cv, seed=self.random_state))

        # Track the best candidate locally so that results of an earlier fit() cannot leak into
        # the comparison below.
        best_score = None
        best_params = {}

        for param_combination in self.param_grid:
            results_dict['params'].append(param_combination)

            # Cross-validation:
            train_scores = []
            test_scores = []
            for train_dataset, test_dataset in folds:
                model = self.build_fn(**param_combination)  # creates a new DeepchemModel
                model.fit(train_dataset)
                train_score, _ = model.evaluate(train_dataset, [metric])
                train_scores.append(train_score[metric.name])
                test_score, _ = model.evaluate(test_dataset, [metric])
                test_scores.append(test_score[metric.name])

            mean_test_score = np.mean(test_scores)
            results_dict['mean_train_score'].append(np.mean(train_scores))
            results_dict['mean_test_score'].append(mean_test_score)
            results_dict['std_train_score'].append(np.std(train_scores))
            results_dict['std_test_score'].append(np.std(test_scores))
            for i, (train_score, test_score) in enumerate(zip(train_scores, test_scores)):
                results_dict[f'split{i}_train_score'].append(train_score)
                results_dict[f'split{i}_test_score'].append(test_score)

            if best_score is None:
                improved = True
            elif self.maximize:
                improved = mean_test_score > best_score
            else:
                improved = mean_test_score < best_score
            if improved:
                best_score = mean_test_score
                best_params = param_combination

        self.best_score_ = best_score
        self.best_params_ = best_params
        self.cv_results_ = results_dict
        self.best_estimator_ = self.build_fn(**self.best_params_)

        if self.refit:
            self.best_estimator_.fit(dataset)


class DeepchemGridSearchCV(DeepchemBaseSearchCV):
    """
    Hyperparameter search with cross-validation for DeepChemModels using a grid search.
    """

    def __init__(self,
                 model_build_fn: callable,
                 param_grid: Union[dict, ParameterGrid],
                 scoring: Union[Metric, List[Metric]],
                 maximize: bool,
                 refit: bool,
                 cv: int,
                 mode: str,
                 random_state: int = None,
                 return_train_score: bool = False):
        """
        Initialize the hyperparameter search.

        Parameters
        ----------
        model_build_fn: callable
            A function that builds a DeepchemModel model.
        param_grid: Union[dict, ParameterGrid]
            The hyperparameter grid to search. A dict is expanded with ParameterGrid; an existing
            ParameterGrid is used unchanged.
        scoring: Union[Metric, List[Metric]]
            The metric to use for scoring.
        maximize: bool
            If True, maximize the metric. If False, minimize the metric.
        refit: bool
            If True, refit the best model on the whole dataset.
        cv: int
            The number of folds for cross-validation.
        mode: str
            The mode of the model. Can be 'classification' or 'regression'.
        random_state: int
            The random state to use for the cross-validation.
        return_train_score: bool
            If True, return the training scores.
        """
        # ParameterGrid expects a mapping (or a sequence of mappings) from parameter names to
        # sequences of candidate values. Re-wrapping an existing ParameterGrid would feed it the
        # already-expanded combinations instead, so only wrap raw grid specifications.
        if not isinstance(param_grid, ParameterGrid):
            param_grid = ParameterGrid(param_grid)
        self.param_grid = param_grid
        super().__init__(model_build_fn=model_build_fn, param_grid=self.param_grid, scoring=scoring, maximize=maximize,
                         refit=refit, cv=cv, mode=mode, random_state=random_state, return_train_score=return_train_score)


class DeepchemRandomSearchCV(DeepchemBaseSearchCV):
    """
    Hyperparameter search with cross-validation for DeepChemModels using a random search.
    """

    def __init__(self,
                 model_build_fn: callable,
                 param_distributions: Union[dict, ParameterSampler],
                 scoring: Union[Metric, List[Metric]],
                 maximize: bool,
                 refit: bool,
                 cv: int,
                 mode: str,
                 random_state: int = None,
                 return_train_score: bool = False,
                 n_iter: int = 20):
        """
        Initialize the hyperparameter search.

        Parameters
        ----------
        model_build_fn: callable
            A function that builds a DeepchemModel model.
        param_distributions: Union[dict, ParameterSampler]
            The hyperparameter distributions to sample from. A dict is wrapped in a
            ParameterSampler using `n_iter` and `random_state`; an existing ParameterSampler is
            used unchanged (its own number of iterations and random state apply to the sampling).
        scoring: Union[Metric, List[Metric]]
            The metrics to use for scoring.
        maximize: bool
            If True, maximize the metric. If False, minimize the metric.
        refit: bool
            If True, refit the best model on the whole dataset.
        cv: int
            The number of folds for cross-validation.
        mode: str
            The mode of the model. Can be 'classification' or 'regression'.
        random_state: int
            The random state to use for the cross-validation (and for the sampler built here).
        return_train_score: bool
            If True, return the training scores.
        n_iter: int
            The number of parameter combinations to sample when a sampler is built here.
        """
        # ParameterSampler expects distributions/lists of candidate values per parameter.
        # Re-wrapping an existing sampler would feed it already-sampled combinations instead,
        # so only wrap raw distribution specifications.
        if not isinstance(param_distributions, ParameterSampler):
            param_distributions = ParameterSampler(param_distributions, n_iter=n_iter, random_state=random_state)
        self.param_grid = param_distributions
        super().__init__(model_build_fn=model_build_fn, param_grid=self.param_grid, scoring=scoring, maximize=maximize,
                         refit=refit, cv=cv, mode=mode, random_state=random_state, return_train_score=return_train_score)