Source code for deepmol.feature_selection.base_feature_selector

from abc import ABC, abstractmethod
from typing import Union, Iterable, List

import numpy as np
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold, chi2, SelectKBest, SelectPercentile, RFECV, SelectFromModel

from deepmol.datasets import Dataset


[docs]class BaseFeatureSelector(ABC): """ Abstract class for feature selection. A `BaseFeatureSelector` uses features present in a Dataset object to select the most important ones. FeatureSelectors which are subclasses of this class should always operate over Dataset Objects. Subclasses need to implement the _select_features method for performing feature selection. """ def __init__(self): """ Initialize the feature selector. """ if self.__class__ == BaseFeatureSelector: raise Exception('Abstract class BaseFeatureSelector should not be instantiated')
[docs] def select_features(self, dataset: Dataset): """ Perform feature selection for the molecules present in the dataset. Parameters ---------- dataset: Dataset Dataset to perform feature selection on Returns ------- dataset: Dataset Dataset containing the selected features and indexes of the features kept as 'self.features2keep'. """ features_to_keep = self._select_features(dataset) dataset.select_features_by_index(list(features_to_keep)) return dataset
@abstractmethod def _select_features(self, dataset: Dataset) -> np.ndarray: """ Perform feature selection for the molecules present in the dataset. Parameters ---------- dataset: Dataset Dataset to perform feature selection on Returns ------- features_to_keep: np.ndarray Array containing the indexes of the features to keep. """
[docs]class LowVarianceFS(BaseFeatureSelector): """ Class for Low Variance feature selection. Feature selector that removes all features with low-variance. """ def __init__(self, threshold: float = 0.3): """ Initialize this Feature Selector Parameters ---------- threshold: float Features with a training-set variance lower than this threshold will be removed. """ super().__init__() self.param = threshold def _select_features(self, dataset: Dataset) -> np.ndarray: """ Returns features and indexes of features to keep. Parameters ---------- dataset: Dataset Dataset to perform feature selection on Returns ------- features_to_keep: np.ndarray Array containing the indexes of the features to keep. """ fs = np.stack(dataset.X, axis=0) vt = VarianceThreshold(threshold=self.param) vt.fit_transform(fs) return vt.get_support(indices=True)
[docs]class KbestFS(BaseFeatureSelector): """ Class for K best feature selection. Select features according to the k-highest scores. """ def __init__(self, k: int = 10, score_func: callable = chi2): """ Initialize this KbestFS Feature Selector. Parameters ---------- k: int Number of top features to select. score_func: callable Function taking two arrays X and y, and returning a pair of arrays (scores, pvalues) or a single array with scores. """ super().__init__() self.k = k self.score_func = score_func def _select_features(self, dataset: Dataset) -> np.ndarray: """ Returns features and indexes of features to keep. Parameters ---------- dataset: Dataset Dataset to perform feature selection on Returns ------- features_to_keep: np.ndarray Array containing the indexes of the features to keep. """ fs = np.stack(dataset.X, axis=0) y = dataset.y kb = SelectKBest(self.score_func, k=self.k) kb.fit_transform(fs, y) return kb.get_support(indices=True)
[docs]class PercentilFS(BaseFeatureSelector): """ Class for percentil feature selection. Select features according to a percentile of the highest scores. """ def __init__(self, percentil: int = 10, score_func: callable = chi2): """ Initialize the PercentilFS Feature Selector. Parameters ---------- percentil: int Percent of features to keep. score_func: callable Function taking two arrays X and y, and returning a pair of arrays (scores, pvalues) or a single array with scores. """ super().__init__() self.percentil = percentil self.score_func = score_func def _select_features(self, dataset: Dataset) -> np.ndarray: """ Returns features and indexes of features to keep. Parameters ---------- dataset: Dataset Dataset to perform feature selection on Returns ------- features_to_keep: np.ndarray Array containing the indexes of the features to keep. """ fs = np.stack(dataset.X, axis=0) y = dataset.y sp = SelectPercentile(self.score_func, percentile=self.percentil) sp.fit_transform(fs, y) return sp.get_support(indices=True)
# TODO: takes too long to run, check if its normal or a code problem
[docs]class RFECVFS(BaseFeatureSelector): """ Class for RFECV feature selection. Feature ranking with recursive feature elimination and cross-validated selection of the best number of features. """ def __init__(self, estimator: callable = None, step: Union[int, float] = 1, min_features_to_select: int = 1, cv: Union[int, callable, Iterable] = None, scoring: Union[str, callable] = None, verbose: int = 0, n_jobs: int = -1): """ Initialize the RFECVFS Feature Selector. Parameters ---------- estimator: callable A supervised learning estimator with a fit method that provides information about feature importance either through a coef_ attribute or through a feature_importances_ attribute. step: Union[int, float] If greater than or equal to 1, then step corresponds to the (integer) number of features to remove at each iteration. If within (0.0, 1.0), then step corresponds to the percentage (rounded down) of features to remove at each iteration. Note that the last iteration may remove fewer than step features in order to reach min_features_to_select. min_features_to_select: int The minimum number of features to be selected. This number of features will always be scored, even if the difference between the original feature count and min_features_to_select isn’t divisible by step. cv: Union[int, callable, Iterable] Determines the cross-validation splitting strategy. Possible inputs for cv are: - None, to use the default 5-fold cross-validation, - integer, to specify the number of folds. - CV splitter, - An iterable yielding (train, test) splits as arrays of indices. scoring: Union[str, callable] A string (see model evaluation documentation) or a scorer callable object / function with signature scorer(estimator, X, y). verbose: in Controls verbosity of output. n_jobs: int Number of cores to run in parallel while fitting across folds. None means 1 unless in a joblib.parallel_backend context. -1 means using all processors. """ super().__init__() if estimator is None: self.estimator = RandomForestClassifier(n_jobs=n_jobs) else: self.estimator = estimator self.step = step self.min_features_to_select = min_features_to_select self.cv = cv self.scoring = scoring self.verbose = verbose def _select_features(self, dataset: Dataset) -> np.ndarray: """ Returns features and indexes of features to keep. Parameters ---------- dataset: Dataset Dataset to perform feature selection on Returns ------- features_to_keep: np.ndarray Array containing the indexes of the features to keep. """ fs = np.stack(dataset.X, axis=0) y = dataset.y rfe = RFECV(self.estimator, step=self.step, cv=self.cv, min_features_to_select=self.min_features_to_select, scoring=self.scoring, verbose=self.verbose) rfe.fit_transform(fs, y) return rfe.get_support(indices=True)
[docs]class SelectFromModelFS(BaseFeatureSelector): """ Class for Select From Model feature selection. Meta-transformer for selecting features based on importance weights. """ def __init__(self, estimator: callable = None, threshold: Union[str, float] = None, prefit: bool = False, norm_order: int = 1, max_features: int = None): """ Initialize this SelectFromModelFS Feature Selector. Parameters ---------- estimator: callable The base estimator from which the transformer is built. This can be both a fitted (if prefit is set to True) or a non-fitted estimator. The estimator must have either a feature_importances_ or coef_ attribute after fitting. threshold: Union[str, float] The threshold value to use for feature selection. Features whose importance is greater or equal are kept while the others are discarded. If “median” (resp. “mean”), then the threshold value is the median (resp. the mean) of the feature importances. A scaling factor (e.g., “1.25*mean”) may also be used. If None and if the estimator has a parameter penalty set to l1, either explicitly or implicitly (e.g, Lasso), the threshold used is 1e-5. Otherwise, “mean” is used by default. prefit: bool Whether a prefit model is expected to be passed into the constructor directly or not. If True, transform must be called directly and SelectFromModel cannot be used with cross_val_score, GridSearchCV and similar utilities that clone the estimator. Otherwise, train the model using fit and then transform to do feature selection. norm_order: int Order of the norm used to filter the vectors of coefficients below threshold in the case where the coef_ attribute of the estimator is of dimension 2. max_features: int The maximum number of features to select. To only select based on max_features, set threshold=-np.inf """ super().__init__() if estimator is None: self.estimator = RandomForestClassifier(n_jobs=-1) else: self.estimator = estimator self.threshold = threshold self.prefit = prefit self.norm_order = norm_order self.max_features = max_features def _select_features(self, dataset: Dataset) -> np.ndarray: """ Returns features and indexes of features to keep. Parameters ---------- dataset: Dataset Dataset to perform feature selection on Returns ------- features_to_keep: np.ndarray Array containing the indexes of the features to keep. """ fs = np.stack(dataset.X, axis=0) y = dataset.y sfm = SelectFromModel(self.estimator, threshold=self.threshold, prefit=self.prefit, norm_order=self.norm_order, max_features=self.max_features) sfm.fit_transform(fs, y) return sfm.get_support(indices=True)
[docs]class BorutaAlgorithm(BaseFeatureSelector): """ Class for Boruta feature selection. Boruta is an all-relevant feature selection method. It is based on the idea that all features are relevant until proven irrelevant. The algorithm is an iterative procedure that consists of two phases: the first phase randomly permutes the feature values and evaluates the performance of the classifier. The second phase eliminates the features that are less important than their shadow features. The shadow features are copies of the original features that are randomly permuted. The algorithm stops when all features are either declared important or declared irrelevant. """ def __init__(self, estimator: callable = None, task: str = "classification", support_weak: bool = False, n_estimators: Union[int, str] = 1000, perc: int = 100, alpha: float = 0.05, two_step: bool = True, max_iter: int = 100, random_state: int = None, verbose: int = 0): """ Initialize this BorutaAlgorithm Feature Selector. Parameters ---------- estimator: callable A supervised learning estimator, with a 'fit' method that returns the feature_importances_ attribute. Important features must correspond to high absolute values in the feature_importances_. task: str The task to perform. Either "classification" or "regression". support_weak: bool Whether to support weak features or not. If True, weak features are also selected. n_estimators: Union[int, str] If int sets the number of estimators in the chosen ensemble method. If 'auto' this is determined automatically based on the size of the dataset. The other parameters of the used estimators need to be set with initialisation. perc: int Instead of the max we use the percentile defined by the user, to pick our threshold for comparison between shadow and real features. The max tend to be too stringent. This provides a finer control over this. The lower perc is the more false positives will be picked as relevant but also the less relevant features will be left out. The usual trade-off. The default is essentially the vanilla Boruta corresponding to the max. alpha: float Level at which the corrected p-values will get rejected in both correction steps. two_step: bool If you want to use the original implementation of Boruta with Bonferroni correction only set this to False. max_iter: int Maximum number of iterations to perform. random_state: int Random state to use. verbose: int Controls verbosity of output. - 0: no output - 1: displays iteration number - 2: which features have been selected already """ super().__init__() self.support_weak = support_weak if estimator is None: if task == "classification": self.estimator = RandomForestClassifier( n_jobs=-1, max_depth=5 ) elif task == "regression": self.estimator = RandomForestRegressor( n_jobs=-1, max_depth=5 ) else: self.estimator = estimator self.boruta = BorutaPy( self.estimator, n_estimators, perc, alpha, two_step, max_iter, random_state, verbose ) def _select_features(self, dataset: Dataset) -> np.ndarray: """ Returns features and indexes of features to keep. Parameters ---------- dataset: Dataset Dataset to perform feature selection on Returns ------- features_to_keep: np.ndarray Array containing the indexes of the features to keep. """ fs = np.stack(dataset.X, axis=0) y = dataset.y self.boruta.fit(fs, y) self.boruta.transform(fs, weak=self.support_weak) support = [i for i, boolean in enumerate(self.boruta.support_) if boolean] if self.support_weak: weak_support = [i for i, boolean in enumerate(self.boruta.support_weak_) if boolean] features_to_keep = list(set.union(set(support), set(weak_support))) else: features_to_keep = support return np.array(features_to_keep)