# Source code for deepmol.feature_selection.base_feature_selector

from abc import ABC
from typing import Union, Iterable

import numpy as np
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold, SelectKBest, SelectPercentile, RFECV, SelectFromModel, \
    f_classif

from deepmol.base import Transformer
from deepmol.datasets import Dataset
from deepmol.utils.decorators import modify_object_inplace_decorator


class BaseFeatureSelector(ABC, Transformer):
    """
    Abstract base for feature selectors that work on Dataset objects.

    A concrete subclass hands a selector object to `__init__`. `_fit` fits that selector on the features and
    labels of a dataset and records the indexes of the retained features in `features_to_keep`; `_transform`
    then keeps only those features in a dataset.
    """

    def __init__(self, feature_selector):
        """
        Store the wrapped selector and reset the fitted state.

        Parameters
        ----------
        feature_selector
            Object used by `_fit`. It must expose `fit(X, y)` whose return value provides
            `get_support(indices=True)`.

        Raises
        ------
        Exception
            If `BaseFeatureSelector` itself, rather than a subclass, is instantiated.
        """
        if type(self) is BaseFeatureSelector:
            raise Exception('Abstract class BaseFeatureSelector should not be instantiated')
        super().__init__()
        self.feature_selector = feature_selector
        # Populated by `_fit`; while it is None, `_transform` returns datasets untouched.
        self.features_to_keep = None

    @modify_object_inplace_decorator
    def select_features(self, dataset: Dataset) -> Dataset:
        """
        Fit the selector on a dataset and apply the selection to that same dataset.

        Parameters
        ----------
        dataset: Dataset
            Dataset to perform feature selection on.
        inplace: bool
            Not part of this signature; presumably supplied by `modify_object_inplace_decorator` to choose
            between modifying the received dataset and working on a copy (confirm against the decorator).

        Returns
        -------
        dataset: Dataset
            Result of `self.fit_transform(dataset)`. The indexes of the kept features are stored on the
            selector as `self.features_to_keep`.
        """
        return self.fit_transform(dataset)

    def _transform(self, dataset: Dataset) -> Dataset:
        """
        Keep only the previously selected features of a dataset.

        Parameters
        ----------
        dataset: Dataset
            Dataset to perform feature selection on.

        Returns
        -------
        dataset: Dataset
            The value returned by `dataset.select_features_by_index` for the indexes in
            `self.features_to_keep`, or the received dataset unchanged when no indexes have been recorded yet.
        """
        # Nothing recorded yet: hand the dataset back as received.
        if self.features_to_keep is None:
            return dataset
        kept_indexes = list(self.features_to_keep)
        return dataset.select_features_by_index(kept_indexes)

    def _fit(self, dataset: Dataset) -> 'BaseFeatureSelector':
        """
        Fit the wrapped selector and record which feature indexes it retains.

        Parameters
        ----------
        dataset: Dataset
            Dataset of molecules; its `X` entries are stacked along axis 0 and its `y` is converted to an array.

        Returns
        -------
        self: BaseFeatureSelector
            The fitted feature selector.
        """
        feature_matrix = np.stack(dataset.X, axis=0)
        labels = np.array(dataset.y)
        fitted_selector = self.feature_selector.fit(feature_matrix, labels)
        self.features_to_keep = fitted_selector.get_support(indices=True)
        return self


class LowVarianceFS(BaseFeatureSelector):
    """
    Low-variance feature selection.

    Wraps scikit-learn's `VarianceThreshold` so that features with low variance are removed.
    """

    def __init__(self, threshold: float = 0.3):
        """
        Build the selector around a `VarianceThreshold` instance.

        Parameters
        ----------
        threshold: float
            Features with a training-set variance lower than this threshold will be removed.
        """
        # The raw threshold is also kept on the instance under the name `param`.
        self.param = threshold
        variance_filter = VarianceThreshold(threshold=threshold)
        super().__init__(variance_filter)


class KbestFS(BaseFeatureSelector):
    """
    K-best feature selection.

    Wraps scikit-learn's `SelectKBest`, which keeps the features with the k highest scores.
    """

    def __init__(self, k: int = 10, score_func: callable = f_classif):
        """
        Build the selector around a `SelectKBest` instance.

        Parameters
        ----------
        k: int
            Number of top features to select.
        score_func: callable
            Function taking two arrays X and y, and returning either a pair of arrays (scores, pvalues) or a
            single array with scores. Defaults to `f_classif`.
        """
        k_best = SelectKBest(score_func=score_func, k=k)
        super().__init__(k_best)


class PercentilFS(BaseFeatureSelector):
    """
    Percentile feature selection.

    Wraps scikit-learn's `SelectPercentile`, which keeps the features whose scores fall in the given top
    percentile.
    """

    def __init__(self, percentil: int = 10, score_func: callable = f_classif):
        """
        Build the selector around a `SelectPercentile` instance.

        Parameters
        ----------
        percentil: int
            Percent of features to keep; forwarded as the `percentile` argument of `SelectPercentile`.
        score_func: callable
            Function taking two arrays X and y, and returning either a pair of arrays (scores, pvalues) or a
            single array with scores. Defaults to `f_classif`.
        """
        percentile_selector = SelectPercentile(score_func=score_func, percentile=percentil)
        super().__init__(percentile_selector)


# TODO: takes too long to run; check whether that is expected or caused by a problem in the code
class RFECVFS(BaseFeatureSelector):
    """
    Class for RFECV feature selection.

    Feature ranking with recursive feature elimination and cross-validated selection of the best number of features.
    """

    def __init__(self,
                 estimator: callable = None,
                 step: Union[int, float] = 1,
                 min_features_to_select: int = 1,
                 cv: Union[int, callable, Iterable] = None,
                 scoring: Union[str, callable] = None,
                 verbose: int = 0,
                 n_jobs: int = -1):
        """
        Initialize the RFECVFS Feature Selector.

        Parameters
        ----------
        estimator: callable
            A supervised learning estimator with a fit method that provides information about feature importance either
            through a coef_ attribute or through a feature_importances_ attribute. When None, a
            `RandomForestClassifier(n_jobs=n_jobs)` is used.
        step: Union[int, float]
            If greater than or equal to 1, then step corresponds to the (integer) number of features to remove
            at each iteration. If within (0.0, 1.0), then step corresponds to the percentage (rounded down) of
            features to remove at each iteration. Note that the last iteration may remove fewer than step features
            in order to reach min_features_to_select.
        min_features_to_select: int
            The minimum number of features to be selected. This number of features will always be scored, even if
            the difference between the original feature count and min_features_to_select is not divisible by step.
        cv: Union[int, callable, Iterable]
            Determines the cross-validation splitting strategy. Possible inputs for cv are:
                - None, to use the default 5-fold cross-validation,
                - integer, to specify the number of folds.
                - CV splitter,
                - An iterable yielding (train, test) splits as arrays of indices.
        scoring: Union[str, callable]
            A string (see model evaluation documentation) or a scorer callable object / function with signature
            scorer(estimator, X, y).
        verbose: int
            Controls verbosity of output.
        n_jobs: int
            Number of cores to run in parallel while fitting across folds. None means 1 unless in a
            joblib.parallel_backend context. -1 means using all processors. The value is forwarded to `RFECV`
            and, when `estimator` is None, also to the default random forest.
        """
        if estimator is None:
            estimator = RandomForestClassifier(n_jobs=n_jobs)

        # `n_jobs` is forwarded to RFECV so the documented fold-level parallelism is honoured; previously it
        # only reached the default estimator and was ignored whenever a custom estimator was supplied.
        rfe = RFECV(estimator=estimator,
                    step=step,
                    cv=cv,
                    min_features_to_select=min_features_to_select,
                    scoring=scoring,
                    verbose=verbose,
                    n_jobs=n_jobs)
        super().__init__(rfe)


class SelectFromModelFS(BaseFeatureSelector):
    """
    Select-from-model feature selection.

    Wraps scikit-learn's `SelectFromModel`, a meta-transformer that selects features based on importance
    weights.
    """

    def __init__(self,
                 estimator: callable = None,
                 threshold: Union[str, float] = None,
                 prefit: bool = False,
                 norm_order: int = 1,
                 max_features: int = None):
        """
        Build the selector around a `SelectFromModel` instance.

        Parameters
        ----------
        estimator: callable
            The base estimator from which the transformer is built. This can be either a fitted estimator (if
            prefit is set to True) or a non-fitted one. The estimator must have either a feature_importances_ or
            a coef_ attribute after fitting. When None, a `RandomForestClassifier(n_jobs=-1)` is used.
        threshold: Union[str, float]
            The threshold value to use for feature selection. Features whose importance is greater or equal
            are kept while the others are discarded. If "median" (resp. "mean"), then the threshold value is the
            median (resp. the mean) of the feature importances. A scaling factor (e.g., "1.25*mean") may also be
            used. If None and if the estimator has a parameter penalty set to l1, either explicitly or implicitly
            (e.g., Lasso), the threshold used is 1e-5. Otherwise, "mean" is used by default.
        prefit: bool
            Whether a prefit model is expected to be passed into the constructor directly or not. If True,
            transform must be called directly and SelectFromModel cannot be used with cross_val_score,
            GridSearchCV and similar utilities that clone the estimator. Otherwise, train the model using fit and
            then transform to do feature selection.
        norm_order: int
            Order of the norm used to filter the vectors of coefficients below threshold in the case where the
            coef_ attribute of the estimator is of dimension 2.
        max_features: int
            The maximum number of features to select. To only select based on max_features, set
            threshold=-np.inf.
        """
        # Fall back to a random forest classifier when the caller gives no estimator.
        base_model = RandomForestClassifier(n_jobs=-1) if estimator is None else estimator
        model_selector = SelectFromModel(
            estimator=base_model,
            threshold=threshold,
            prefit=prefit,
            norm_order=norm_order,
            max_features=max_features,
        )
        super().__init__(model_selector)


class BorutaAlgorithm(BaseFeatureSelector):
    """
    Class for Boruta feature selection.

    Boruta is an all-relevant feature selection method. It is based on the idea that all features are relevant until
    proven irrelevant. The algorithm is an iterative procedure that consists of two phases: the first phase randomly
    permutes the feature values and evaluates the performance of the classifier. The second phase eliminates the
    features that are less important than their shadow features. The shadow features are copies of the original
    features that are randomly permuted. The algorithm stops when all features are either declared important or
    declared irrelevant.
    """

    def __init__(self,
                 estimator: callable = None,
                 task: str = "classification",
                 support_weak: bool = False,
                 n_estimators: Union[int, str] = 1000,
                 perc: int = 100,
                 alpha: float = 0.05,
                 two_step: bool = True,
                 max_iter: int = 100,
                 random_state: int = None,
                 verbose: int = 0):
        """
        Initialize this BorutaAlgorithm Feature Selector.

        Parameters
        ----------
        estimator: callable
            A supervised learning estimator, with a 'fit' method that returns the feature_importances_ attribute.
            Important features must correspond to high absolute values in the feature_importances_.
            When None, a random forest with `n_jobs=-1` and `max_depth=5` is built according to `task`.
        task: str
            The task to perform. Either "classification" or "regression". Only consulted when `estimator` is None.
        support_weak: bool
            Whether to support weak features or not. If True, weak features are also selected.
        n_estimators: Union[int, str]
            If int sets the number of estimators in the chosen ensemble method. If 'auto' this is determined
            automatically based on the size of the dataset. The other parameters of the used estimators need to be set
            with initialisation.
        perc: int
            Instead of the max we use the percentile defined by the user, to pick our threshold for comparison between
            shadow and real features. The max tend to be too stringent. This provides a finer control over this. The
            lower perc is the more false positives will be picked as relevant but also the less relevant features will
            be left out. The usual trade-off. The default is essentially the vanilla Boruta corresponding to the max.
        alpha: float
             Level at which the corrected p-values will get rejected in both correction steps.
        two_step: bool
            If you want to use the original implementation of Boruta with Bonferroni correction only set this to False.
        max_iter: int
            Maximum number of iterations to perform.
        random_state: int
            Random state to use.
        verbose: int
            Controls verbosity of output.
            - 0: no output
            - 1: displays iteration number
            - 2: which features have been selected already

        Raises
        ------
        ValueError
            If `estimator` is None and `task` is neither "classification" nor "regression".
        """
        self.support_weak = support_weak
        if estimator is None:
            if task == "classification":
                estimator = RandomForestClassifier(
                    n_jobs=-1,
                    max_depth=5
                )
            elif task == "regression":
                estimator = RandomForestRegressor(
                    n_jobs=-1,
                    max_depth=5
                )
            else:
                # Fail early instead of handing `None` to BorutaPy and breaking later with an unrelated error.
                raise ValueError(
                    f"task must be 'classification' or 'regression' when no estimator is given, got {task!r}"
                )

        boruta = BorutaPy(
            estimator,
            n_estimators,
            perc,
            alpha,
            two_step,
            max_iter,
            random_state,
            verbose
        )
        super().__init__(boruta)

    def _selected_indexes(self) -> list:
        """
        Return, in ascending order, the indexes of the features the fitted selector keeps.

        Reads `support_` from the wrapped selector and, when `self.support_weak` is True, also `support_weak_`;
        an index is kept when it is flagged in either mask.

        Returns
        -------
        indexes: list
            Sorted list of integer feature indexes.
        """
        mask = np.asarray(self.feature_selector.support_, dtype=bool)
        if self.support_weak:
            weak_mask = np.asarray(self.feature_selector.support_weak_, dtype=bool)
            mask = np.logical_or(mask, weak_mask)
        # flatnonzero yields ascending positions, so the kept features preserve their original column order.
        return np.flatnonzero(mask).tolist()

    def _fit(self, dataset: Dataset) -> 'BorutaAlgorithm':
        """
        Fit the Boruta Algorithm.

        Parameters
        ----------
        dataset: Dataset
            Dataset to fit

        Returns
        -------
        self: BorutaAlgorithm
            The fitted BorutaAlgorithm
        """
        x = np.stack(dataset.X, axis=0)
        y = dataset.y
        self.feature_selector.fit(x, y)
        # Mirror the base class, which records the retained indexes during fitting.
        self.features_to_keep = self._selected_indexes()
        return self

    def _transform(self, dataset: Dataset) -> Dataset:
        """
        Transform the dataset using the selected features.

        Parameters
        ----------
        dataset: Dataset
            Dataset to transform

        Returns
        -------
        transformed_dataset: Dataset
            The dataset returned by `dataset.select_features_by_index`, or the received dataset when that call
            returns None.
        """
        x = np.stack(dataset.X, axis=0)
        # The transformed matrix itself is not needed; the call is kept so that any error raised by the
        # wrapped selector's own `transform` (presumably including a not-fitted check) still surfaces here.
        self.feature_selector.transform(x, weak=self.support_weak)
        indexes = self._selected_indexes()
        selected = dataset.select_features_by_index(indexes)
        # Works whether `select_features_by_index` mutates in place (returning None) or returns a new
        # dataset, as the base class `_transform` assumes.
        return dataset if selected is None else selected