# Source code for deepmol.models.ensembles

from abc import ABC, abstractmethod
from typing import List

import numpy as np

from deepmol.datasets import Dataset
from deepmol.evaluator.evaluator import Evaluator
from deepmol.metrics.metrics import Metric
from deepmol.models.models import Model


class Ensemble(ABC):
    """
    Abstract class for ensembles of models.

    Subclasses must implement `predict`; fitting and evaluation are shared by all ensembles.
    """

    def __init__(self, models: List[Model]):
        """
        Initializes an ensemble of models.

        Parameters
        ----------
        models: List[Model]
            List of models to be used in the ensemble.
        """
        self.models = models

    def fit(self, dataset: Dataset) -> None:
        """
        Fits every model of the ensemble, in list order, to the specified dataset.

        Parameters
        ----------
        dataset: Dataset
            Dataset object passed unchanged to the `fit` method of each model.
        """
        for model in self.models:
            model.fit(dataset)

    @abstractmethod
    def predict(self, dataset: Dataset):
        """
        Predicts the labels for the specified dataset.

        Parameters
        ----------
        dataset: Dataset
            Dataset object.
        """

    def evaluate(self,
                 dataset: Dataset,
                 metrics: List[Metric],
                 per_task_metrics: bool = False,
                 n_classes: int = 2):
        """
        Evaluates the performance of this ensemble on the specified dataset.

        The ensemble itself is handed to an `Evaluator`, which computes the requested metrics.

        Parameters
        ----------
        dataset: Dataset
            Dataset object.
        metrics: List[Metric]
            The set of metrics provided.
        per_task_metrics: bool
            If true, return computed metric for each task on multitask dataset.
        n_classes: int
            If specified, will use `n_classes` as the number of unique classes.

        Returns
        -------
        multitask_scores: dict
            Dictionary mapping names of metrics to metric scores.
        all_task_scores: dict, optional
            If `per_task_metrics == True` is passed as a keyword argument, then returns a second dictionary of scores
            for each task separately.
        """
        evaluator = Evaluator(self, dataset)
        return evaluator.compute_model_performance(metrics,
                                                   per_task_metrics=per_task_metrics,
                                                   n_classes=n_classes)


class VotingClassifier(Ensemble):
    """
    VotingClassifier Ensemble.
    It uses a voting strategy to predict the labels of a dataset.

    With 'soft' voting the per-class scores predicted by the models are averaged; with 'hard' voting every
    model casts a single vote for its highest-scoring class and the most voted class wins.
    """

    def __init__(self, models: List[Model], voting: str = "soft"):
        """
        Initializes a VotingClassifier ensemble.

        Parameters
        ----------
        models: List[Model]
            List of models to be used in the ensemble.
        voting: str
            Voting strategy to use. Can be either 'soft' or 'hard'.
        """
        super().__init__(models)
        self.voting = voting

    def predict(self, dataset: Dataset, proba: bool = False):
        """
        Predicts the labels for the specified dataset.

        Parameters
        ----------
        dataset: Dataset
            Dataset object.
        proba: bool
            If true, returns the probabilities instead of class labels.

        Returns
        -------
        final_result: np.ndarray
            If `proba` is false, a 1D array with the index of the winning class for each sample.
            If `proba` is true, a 2D array (n_samples, n_classes) holding, for 'soft' voting, the scores averaged
            over the models and, for 'hard' voting, the fraction of models that voted for each class.

        Raises
        ------
        ValueError
            If the ensemble has no models, if `voting` is neither 'soft' nor 'hard', or if the models do not all
            return 2D predictions of the same shape.
        """
        if not self.models:
            raise ValueError("VotingClassifier needs at least one model to predict")
        # Checked before running any model so that a misconfigured ensemble fails fast.
        if self.voting not in ("soft", "hard"):
            raise ValueError("Voting has to be either 'soft' or 'hard'")

        stacked = self._stack_predictions(dataset)
        n_models, _, n_classes = stacked.shape

        if self.voting == "soft":
            class_scores = stacked.mean(axis=0)
        else:
            # Each model votes for its highest-scoring class, so a vote is always cast even when no
            # class score exceeds 0.5 (the usual situation with more than two classes).
            votes = stacked.argmax(axis=2)
            ballots = votes[..., np.newaxis] == np.arange(n_classes)
            class_scores = ballots.sum(axis=0) / n_models

        if proba:
            return class_scores

        # argmax keeps the lowest class index on ties; the float dtype matches the array type this method
        # has always returned.
        return class_scores.argmax(axis=1).astype(float)

    def _stack_predictions(self, dataset: Dataset) -> np.ndarray:
        """
        Collects the predictions of every model into one array of shape (n_models, n_samples, n_classes).

        The number of classes is taken from the predictions themselves, so `dataset` does not need labels.
        """
        per_model = []
        for model in self.models:
            predicted = np.asarray(model.predict(dataset), dtype=float)
            if predicted.ndim != 2:
                raise ValueError("Every model must predict a 2D array (n_samples, n_classes); "
                                 "got an array with shape {}".format(predicted.shape))
            per_model.append(predicted)

        expected_shape = per_model[0].shape
        if any(predicted.shape != expected_shape for predicted in per_model):
            raise ValueError("All models must return predictions with the same shape")

        return np.stack(per_model)