Source code for library.phases.phases_implementation.feature_analysis.feature_selection.automatic

from library.phases.phases_implementation.dataset.dataset import Dataset

from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from boruta import BorutaPy

from abc import ABC, abstractmethod

class AutomaticFeatureSelection:
    """Entry point that dispatches automatic feature selection to a named strategy ("L1" or "Boruta")."""

    def __init__(self, dataset: Dataset) -> None:
        self.dataset = dataset
        self.options = {
            "L1": L1AutomaticFeatureSelection,
            "Boruta": BorutaAutomaticFeatureSelection
        }
    def fit(self, type: str, max_iter: int, delete_features: bool, save_plots: bool = False, save_path: str = ""):
        print(f"Running {type} feature selection")
        return self.options[type](self.dataset).fit(max_iter, delete_features, save_plots, save_path)
    def speak(self, message: str):
        print(f"{message} from {id(self)}. You are at automatic feature selection!")
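# Illustrative usage (a sketch, not part of this module; ``my_dataset`` is a
# hypothetical, already-split Dataset instance): the dispatcher looks up the
# strategy class by name in ``self.options``, instantiates it with the dataset,
# and delegates ``fit`` to it.
#
#     selector = AutomaticFeatureSelection(my_dataset)
#     kept, dropped, coefs = selector.fit(
#         type="L1", max_iter=1000, delete_features=True
#     )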
class AutomaticFeatureSelectionFactory(ABC):
    def __init__(self, dataset: Dataset):
        self.dataset = dataset
    @abstractmethod
    def fit(self, max_iter: int, delete_features: bool, save_plots: bool = False, save_path: str = ""):
        pass
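# Extension sketch (hypothetical, not part of this module): a new strategy
# subclasses AutomaticFeatureSelectionFactory, implements ``fit`` with the same
# signature, and becomes selectable once registered under a new key in
# ``AutomaticFeatureSelection.options``. ``MyCustomSelection`` and ``my_dataset``
# are illustrative names only.
#
#     class MyCustomSelection(AutomaticFeatureSelectionFactory):
#         def fit(self, max_iter: int, delete_features: bool,
#                 save_plots: bool = False, save_path: str = ""):
#             kept = set(self.dataset.X_train.columns)   # placeholder logic
#             return kept, set()
#
#     selector = AutomaticFeatureSelection(my_dataset)
#     selector.options["Custom"] = MyCustomSelection
#     selector.fit(type="Custom", max_iter=10, delete_features=False)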
class L1AutomaticFeatureSelection(AutomaticFeatureSelectionFactory):
    def __init__(self, dataset: Dataset):
        super().__init__(dataset)
    def fit(self, max_iter: int = 1000, delete_features: bool = True, save_plots: bool = False, save_path: str = ""):
        """
        Automatically selects the features that are most predictive of the target
        variable using L1 regularization.

        Parameters
        ----------
        max_iter : int
            Maximum number of solver iterations.
        delete_features : bool
            Whether to drop the excluded features from the train, validation and test splits.
        save_plots : bool
            Whether to save plots (not used by this strategy).
        save_path : str
            Where to save plots (not used by this strategy).

        Returns
        -------
        tuple
            The predictive power features, the excluded features and the fitted coefficients.
        """
        if self.dataset.modelTask == "regression":
            model = Lasso(max_iter=max_iter)
        else:
            model = LogisticRegression(n_jobs=-1, max_iter=max_iter)

        model.fit(self.dataset.X_train, self.dataset.y_train)

        coefficients = model.coef_
        # LogisticRegression returns a 2D array of coefficients with one row per
        # class; flatten the binary case so we can index by feature.
        # Multiclass is currently not supported.
        if getattr(coefficients, "ndim", 1) > 1:
            coefficients = coefficients[0]

        predictivePowerFeatures = set()
        for i in range(len(coefficients)):
            if abs(coefficients[i]) > 0:
                predictivePowerFeatures.add(self.dataset.X_train.columns[i])

        excludedFeatures = set(self.dataset.X_train.columns) - predictivePowerFeatures

        print(f"Number of predictive power variables: {len(predictivePowerFeatures)}")
        print(f"Number of excluded variables: {len(excludedFeatures)}")

        if delete_features:
            self.dataset.X_train.drop(columns=excludedFeatures, inplace=True)
            self.dataset.X_val.drop(columns=excludedFeatures, inplace=True)
            self.dataset.X_test.drop(columns=excludedFeatures, inplace=True)

        return predictivePowerFeatures, excludedFeatures, coefficients
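# A minimal, self-contained sketch of the L1 selection idea on synthetic data
# (illustrative only; it bypasses the Dataset wrapper used above):
#
#     import numpy as np
#     import pandas as pd
#     from sklearn.linear_model import Lasso
#
#     rng = np.random.default_rng(0)
#     X = pd.DataFrame(rng.normal(size=(200, 5)), columns=list("abcde"))
#     y = 3 * X["a"] - 2 * X["c"] + rng.normal(scale=0.1, size=200)
#     lasso = Lasso(alpha=0.1, max_iter=1000).fit(X, y)
#     kept = {col for col, w in zip(X.columns, lasso.coef_) if abs(w) > 0}
#     # ``kept`` should contain "a" and "c"; the remaining columns are excluded.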
class BorutaAutomaticFeatureSelection(AutomaticFeatureSelectionFactory):
    def __init__(self, dataset: Dataset):
        super().__init__(dataset)
    def fit(self, max_iter: int = 100, delete_features: bool = True, save_plots: bool = False, save_path: str = ""):
        """
        Automatically selects the features that are most predictive of the target
        variable using the Boruta method.

        Parameters
        ----------
        max_iter : int
            Maximum number of Boruta iterations.
        delete_features : bool
            Whether to drop the excluded features from the train, validation and test splits.
        save_plots : bool
            Whether to save plots (not used by this strategy).
        save_path : str
            Where to save plots (not used by this strategy).

        Returns
        -------
        tuple
            The predictive power features and the excluded features.
        """
        RANDOM_STATE = 99

        if self.dataset.modelTask == "regression":
            rf = RandomForestRegressor(
                n_estimators=100,
                n_jobs=-1,
                random_state=RANDOM_STATE
            )
        else:
            rf = RandomForestClassifier(
                n_estimators=100,
                n_jobs=-1,
                class_weight='balanced',
                random_state=RANDOM_STATE
            )

        boruta_model = BorutaPy(
            rf,
            n_estimators='auto',
            verbose=3,
            random_state=RANDOM_STATE,
            max_iter=max_iter,
        )

        # BorutaPy expects numpy arrays rather than DataFrames.
        boruta_model.fit(self.dataset.X_train.values, self.dataset.y_train.values)

        selected_mask = boruta_model.support_
        selected_features = set(self.dataset.X_train.columns[selected_mask])
        excludedFeatures = set(self.dataset.X_train.columns) - selected_features

        print(f"Number of predictive power variables: {len(selected_features)}")
        print(f"Number of excluded variables: {len(excludedFeatures)}")

        if delete_features:
            self.dataset.X_train.drop(columns=excludedFeatures, inplace=True)
            self.dataset.X_val.drop(columns=excludedFeatures, inplace=True)
            self.dataset.X_test.drop(columns=excludedFeatures, inplace=True)

        return selected_features, excludedFeatures
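# A minimal, self-contained Boruta sketch on synthetic data (illustrative only,
# assuming the ``boruta`` package is installed alongside this module; it shows
# the confirmed-feature mask that the method above converts into column names):
#
#     import numpy as np
#     from sklearn.ensemble import RandomForestClassifier
#     from boruta import BorutaPy
#
#     rng = np.random.default_rng(0)
#     X = rng.normal(size=(300, 6))
#     y = (X[:, 0] + X[:, 1] > 0).astype(int)
#     rf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=99)
#     boruta = BorutaPy(rf, n_estimators="auto", random_state=99, max_iter=50)
#     boruta.fit(X, y)
#     print(boruta.support_)   # boolean mask of confirmed features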