from deepmol.models.models import Model
from deepmol.models.sklearn_models import SklearnModel
from deepmol.metrics.metrics import Metric
from deepmol.splitters.splitters import RandomSplitter, SingletaskStratifiedSplitter
from typing import Sequence
import numpy as np
from deepmol.datasets import Dataset
from keras.wrappers.scikit_learn import KerasClassifier, KerasRegressor
from sklearn.base import clone
# Only for sequential single input models
class KerasModel(Model):
    """
    Wrapper class that wraps keras models.

    The `KerasModel` class provides a wrapper around keras models that allows these models to be trained on
    `Dataset` objects. Only sequential, single-input models are supported.
    """

    def __init__(self,
                 model_builder: callable,
                 mode: str = 'classification',
                 model_dir: str = None,
                 loss: str = 'binary_crossentropy',
                 optimizer: str = 'adam',
                 learning_rate: float = 0.001,
                 epochs: int = 150,
                 batch_size: int = 10,
                 verbose: int = 0,
                 **kwargs) -> None:
        """
        Initializes a `KerasModel` object.

        Parameters
        ----------
        model_builder: callable
            A function that builds a keras model.
        mode: str
            The mode of the model. Can be either 'classification' or 'regression'. For any other value the
            `model_builder` itself is stored as the model, without a scikit-learn wrapper.
        model_dir: str
            The directory to save the model to.
        loss: str
            The loss function to use. Stored on the instance; it is not forwarded to the wrapper here.
        optimizer: str
            The optimizer to use. Stored on the instance; it is not forwarded to the wrapper here.
        learning_rate: float
            The learning rate to use. Stored on the instance; it is not forwarded to the wrapper here.
        epochs: int
            The number of epochs to train for.
        batch_size: int
            The batch size to use.
        verbose: int
            The verbosity of the model.
        kwargs:
            Additional keyword arguments, forwarded both to the parent constructor and to the keras
            scikit-learn wrapper.
        """
        super().__init__(model_builder, model_dir, **kwargs)
        self.mode = mode
        self.loss = loss
        self.optimizer = optimizer
        self.learning_rate = learning_rate
        self.model_type = 'keras'
        self.batch_size = batch_size
        self.epochs = epochs
        self.model_builder = model_builder
        self.verbose = verbose

        if mode == 'classification':
            self.model = KerasClassifier(build_fn=model_builder, epochs=epochs, batch_size=batch_size,
                                         verbose=verbose, **kwargs)
        elif mode == 'regression':
            # Use `epochs`, as in the classifier branch: the legacy `nb_epoch` keyword is not an argument of
            # the keras `fit` method, so the requested number of epochs would never reach training.
            self.model = KerasRegressor(build_fn=model_builder, epochs=epochs, batch_size=batch_size,
                                        verbose=verbose, **kwargs)
        else:
            self.model = model_builder

    def fit(self, dataset: Dataset, **kwargs) -> None:
        """
        Fits keras model to data.

        Parameters
        ----------
        dataset: Dataset
            The `Dataset` to train this model on.
        kwargs:
            Additional arguments to pass to `fit` method of the keras model.

        Raises
        ------
        ValueError
            If the mode of the dataset differs from the mode of the model.
        """
        if self.mode != dataset.mode:
            raise ValueError('Dataset mode does not match model mode.')

        features = dataset.X.astype('float32')
        if len(dataset.label_names) == 1:
            y = np.squeeze(dataset.y)
        else:
            # Multiple labels: one target column per label, keyed by the label name.
            y = {f"{label_name}": dataset.y[:, i] for i, label_name in enumerate(dataset.label_names)}
        self.model.fit(features, y, **kwargs)

    def predict(self, dataset: Dataset) -> np.ndarray:
        """
        Makes predictions on dataset.

        Parameters
        ----------
        dataset: Dataset
            Dataset to make prediction on.

        Returns
        -------
        np.ndarray
            The return value of the `predict_proba` method of the wrapped model when it is available; otherwise
            the return value of its `predict` method.
        """
        features = dataset.X.astype('float32')
        try:
            return self.model.predict_proba(features)
        except AttributeError:
            # No usable `predict_proba`: log which model was hit and fall back to `predict`.
            self.logger.info(str(self.model))
            self.logger.info(str(type(self.model)))
            return self.model.predict(features)

    def predict_on_batch(self, X: Dataset) -> np.ndarray:
        """
        Makes predictions on batch of data.

        Delegates to the `predict` method of the parent class.

        Parameters
        ----------
        X: Dataset
            Dataset to make prediction on.

        Returns
        -------
        np.ndarray
            numpy array of predictions.
        """
        return super().predict(X)

    def fit_on_batch(self, X: Sequence, y: Sequence):
        """
        Fits model on batch of data.

        Not implemented: this method currently does nothing and returns None.
        """

    def reload(self) -> None:
        """
        Reloads the model from disk.

        Not implemented: this method currently does nothing and returns None.
        """

    def save(self) -> None:
        """
        Saves the model to disk.

        Not implemented: this method currently does nothing and returns None.
        """

    def get_task_type(self) -> str:
        """
        Returns the task type of the model.

        Not implemented: this method currently does nothing and returns None.
        """

    def get_num_tasks(self) -> int:
        """
        Returns the number of tasks of the model.

        Not implemented: this method currently does nothing and returns None.
        """

    def cross_validate(self,
                       dataset: Dataset,
                       metric: Metric,
                       folds: int = 3):
        """
        Cross validates the model on a dataset.

        A stratified splitter is used for classification datasets and a random splitter for regression
        datasets. A fresh clone of the model is fitted and evaluated on every fold.

        Parameters
        ----------
        dataset: Dataset
            The `Dataset` to cross validate on.
        metric: Metric
            The metric to use for cross validation.
        folds: int
            The number of folds to use for cross validation.

        Returns
        -------
        Tuple[SklearnModel, float, float, List[float], List[float], float, float]
            The first element is the best model (the one with the highest test score), the second is the train
            score of the best model, the third is the test score of the best model, the fourth is the train
            scores of all folds, the fifth is the test scores of all folds, the sixth is the average train score
            of all folds and the seventh is the average test score of all folds.

        Raises
        ------
        ValueError
            If the dataset mode is neither 'classification' nor 'regression'.
        """
        # TODO: add option to choose between splitters
        if dataset.mode == 'classification':
            splitter = SingletaskStratifiedSplitter()
        elif dataset.mode == 'regression':
            splitter = RandomSplitter()
        else:
            # Explicit error instead of an `assert`, which is stripped when running with -O.
            raise ValueError("Dataset mode must be either 'classification' or 'regression'.")

        train_scores = []
        test_scores = []
        best_model = None
        train_score_best_model = 0
        test_score_best_model = 0

        for train_ds, test_ds in splitter.k_fold_split(dataset, folds):
            dummy_model = clone(SklearnModel(model=self.model))
            dummy_model.fit(train_ds)

            train_score = dummy_model.evaluate(train_ds, metric)[metric.name]
            train_scores.append(train_score)
            test_score = dummy_model.evaluate(test_ds, metric)[metric.name]
            test_scores.append(test_score)

            # The first fold always becomes the initial best, so a model is still returned when every
            # score is zero or negative.
            # NOTE(review): "best" means highest test score; confirm this is intended for metrics where
            # lower is better.
            if best_model is None or test_score > test_score_best_model:
                test_score_best_model = test_score
                train_score_best_model = train_score
                best_model = dummy_model

        avg_train_score = sum(train_scores) / folds
        avg_test_score = sum(test_scores) / folds
        return (best_model, train_score_best_model, test_score_best_model, train_scores, test_scores,
                avg_train_score, avg_test_score)