Source code for library.phases.phases_implementation.feature_analysis.feature_selection.automatic

from library.phases.phases_implementation.dataset.dataset import Dataset

from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from boruta import BorutaPy

from abc import ABC, abstractmethod


[docs]
class AutomaticFeatureSelection():
      """
      """
      def __init__(self, dataset: Dataset) -> None:
            self.dataset = dataset
            self.options = {
                  "L1": L1AutomaticFeatureSelection,
                  "Boruta": BorutaAutomaticFeatureSelection
            }
      

[docs]
      def fit(self, type: str, max_iter: int, delete_features: bool, save_plots: bool = False, save_path: str = ""):
            print(f"Running {type} feature selection")
            return self.options[type](self.dataset).fit(max_iter, delete_features, save_plots, save_path)

      

[docs]
      def speak(self, message: str):
            print(f"{message} from {id(self)}. You are at automatic feature selection!")





[docs]
class AutomaticFeatureSelectionFactory(ABC):
      def __init__(self, dataset: Dataset):
            self.dataset = dataset


[docs]
      @abstractmethod
      def fit(self, max_iter: int, delete_features: bool, save_plots: bool = False, save_path: str = ""):
            pass





[docs]
class L1AutomaticFeatureSelection(AutomaticFeatureSelectionFactory):
      def __init__(self, dataset: Dataset):
            super().__init__(dataset)


[docs]
      def fit(self, max_iter: int = 1000, delete_features: bool = True, save_plots: bool = False, save_path: str = ""):
            """
            Automatically selects the features that are most predictive of the target variable using the L1 regularization method

            Parameters
            ----------
            isRegression : bool
                  Whether the model is a regression model
            print_results : bool
                  Whether to print the results

            Returns
            -------
            tuple
            The predictive power features and the excluded features
            """
            if self.dataset.modelTask == "regression":
                  model = Lasso(max_iter=max_iter)
            else:
                  model = LogisticRegression(n_jobs=-1, max_iter=max_iter)

            model.fit(self.dataset.X_train, self.dataset.y_train)
            coefficients = model.coef_ # FOR MULTICLASS RETURNS A LIST OF COEFFICEINTS. CURRENTLY NOT SUPPORTED.

            predictivePowerFeatures = set()
            for i in range(len(coefficients)):
                  if abs(coefficients[i]) > 0:
                        predictivePowerFeatures.add(self.dataset.X_train.columns[i])
                        excludedFeatures = set(self.dataset.X_train.columns) - predictivePowerFeatures
            print(f"Number of predictive power variables: {len(predictivePowerFeatures)}")
            print(f"Number of excluded variables: {len(excludedFeatures)}")
            if delete_features:
                        self.dataset.X_train.drop(columns=excludedFeatures, inplace=True)
                        self.dataset.X_val.drop(columns=excludedFeatures, inplace=True)
                        self.dataset.X_test.drop(columns=excludedFeatures, inplace=True)
            return predictivePowerFeatures, excludedFeatures, coefficients




[docs]
class BorutaAutomaticFeatureSelection(AutomaticFeatureSelectionFactory):
      def __init__(self, dataset: Dataset):
            super().__init__(dataset)     


[docs]
      def fit(self, max_iter: int = 100, delete_features: bool = True, save_plots: bool = False, save_path: str = ""):
            """
            Automatically selects the features that are most predictive of the target variable using the Boruta method

            Parameters
            ----------
            boruta_model : BorutaPy
                  The Boruta model

            Returns
            -------
            tuple
            The predictive power features and the excluded features
            """
            RANDOM_STATE = 99
            if self.dataset.modelTask == "regression":
                  rf = RandomForestRegressor(
                  n_estimators=100,    
                  n_jobs=-1, 
                  random_state=RANDOM_STATE
                  )
            else:
                  rf = RandomForestClassifier(
                  n_estimators=100,    
                  n_jobs=-1, 
                  class_weight='balanced',
                  random_state=RANDOM_STATE
                  )
            boruta_model = BorutaPy(rf, 
                                    n_estimators='auto',
                                    verbose=3, 
                                    random_state=RANDOM_STATE, 
                                    max_iter=max_iter,
                                    
                                    )
            boruta_model.fit(self.dataset.X_train.values, 
                              self.dataset.y_train.values)
            selected_mask = boruta_model.support_
            selected_features = set(self.dataset.X_train.columns[selected_mask])
            excludedFeatures = set(self.dataset.X_train.columns) - selected_features
            print(f"Number of predictive power variables: {len(selected_features)}")
            print(f"Number of excluded variables: {len(excludedFeatures)}") 
            if delete_features:     
                  self.dataset.X_train.drop(columns=excludedFeatures, inplace=True)
                  self.dataset.X_val.drop(columns=excludedFeatures, inplace=True)
                  self.dataset.X_test.drop(columns=excludedFeatures, inplace=True)
            return selected_features, excludedFeatures
Source code for library.phases.phases_implementation.feature_analysis.feature_selection.automatic

Efficient Malware Classfier

Navigation

Related Topics