Source code for library.phases.phases_implementation.data_preprocessing.feature_scaling

from library.phases.phases_implementation.dataset.dataset import Dataset
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from library.utils.miscellaneous.save_or_store_plot import save_or_store_plot

import yaml


[docs]
class FeatureScaling:
    """
    Scales selected columns of a dataset's ``X_train``, ``X_val`` and ``X_test``
    splits with a scikit-learn scaler and can plot the training distribution of
    each column before and after scaling.
    """

    def __init__(self, dataset: Dataset) -> None:
        """
        Stores the dataset and loads the project configuration.

        Parameters
        ----------
        dataset : Dataset
            Object whose ``X_train``, ``X_val`` and ``X_test`` attributes are
            read and overwritten by :meth:`scale_features`.
        """
        self.dataset = dataset
        # The path is relative to the current working directory. The context
        # manager guarantees the file handle is closed once parsing finishes.
        # NOTE(review): FullLoader is kept for backward compatibility; if the
        # configuration only holds plain YAML, yaml.safe_load would be enough.
        with open("library/configurations.yaml") as config_file:
            self.variables = yaml.load(config_file, Loader=yaml.FullLoader)

    def scale_features(
        self,
        scaler: str,
        columnsToScale: list[str],
        save_plots: bool = False,
        save_path: str = None
    ) -> str:
        """
        Scales the features in the dataset

        The scaler is fitted on ``X_train`` only and then applied to ``X_val``
        and ``X_test``. The three splits are modified in place, and only after
        all of them have been transformed successfully.

        Parameters
        ----------
        scaler : str
            The scaler to use ('minmax', 'robust', 'standard')
        columnsToScale : list[str]
            The columns to scale
        save_plots : bool
            Whether to save plots before and after scaling
        save_path : str
            The path to save the plots. Required when ``save_plots`` is True;
            plots are written under ``<save_path>/feature_scaling``.

        Returns
        -------
        str
            Message indicating the number of features scaled and, when plots
            were requested, the number of features plotted.

        Raises
        ------
        TypeError
            If ``scaler`` is not a string or ``save_plots`` is not a boolean.
        ValueError
            If ``columnsToScale`` is empty, ``save_plots`` is True without a
            ``save_path``, a column is missing from one of the splits, or the
            scaler name is unknown.
        AttributeError
            If the dataset lacks ``X_train``, ``X_val`` or ``X_test``.
        RuntimeError
            If copying the original data, scaling, or plotting fails.
        """

        # --- Input validation ---
        if not isinstance(scaler, str):
            raise TypeError("Parameter 'scaler' must be a string.")
        if len(columnsToScale) == 0:
            raise ValueError("At least one column must be provided for scaling.")
        if not isinstance(save_plots, bool):
            raise TypeError("Parameter 'save_plots' must be a boolean.")
        # Fail before the dataset is modified: without a path the plotting step
        # could only crash after the scaling had already been applied.
        if save_plots and save_path is None:
            raise ValueError("Parameter 'save_path' must be provided when 'save_plots' is True.")

        # --- Dataset validation ---
        for attr in ['X_train', 'X_val', 'X_test']:
            if not hasattr(self.dataset, attr):
                raise AttributeError(f"The dataset is missing the attribute '{attr}'.")
            missing_cols = [col for col in columnsToScale if col not in getattr(self.dataset, attr).columns]
            if missing_cols:
                raise ValueError(f"The following columns are missing in '{attr}': {missing_cols}")

        # --- Scaler selection ---
        if scaler == "minmax":
            scaler_obj = MinMaxScaler()
        elif scaler == "robust":
            scaler_obj = RobustScaler()
        elif scaler == "standard":
            scaler_obj = StandardScaler()
        else:
            raise ValueError(f"Invalid scaler: {scaler}. Choose from 'minmax', 'robust', or 'standard'.")

        # --- Optional: store original data for plotting ---
        if save_plots:
            try:
                original_data = self.dataset.X_train[columnsToScale].copy()
            except Exception as e:
                raise RuntimeError(f"Failed to copy original data for plotting: {e}") from e

        # --- Apply scaling ---
        try:
            # Transform every split first and assign afterwards, so a failure on
            # one split cannot leave the dataset partially scaled.
            train_scaled = scaler_obj.fit_transform(self.dataset.X_train[columnsToScale])
            val_scaled = scaler_obj.transform(self.dataset.X_val[columnsToScale])
            test_scaled = scaler_obj.transform(self.dataset.X_test[columnsToScale])

            self.dataset.X_train[columnsToScale] = train_scaled
            self.dataset.X_val[columnsToScale] = val_scaled
            self.dataset.X_test[columnsToScale] = test_scaled
        except Exception as e:
            raise RuntimeError(f"An error occurred during scaling: {e}") from e

        # --- Optional: plot distributions ---
        plotted_count = 0
        if save_plots:
            try:
                max_plots = self.variables["PIPELINE_RUNNER"]["max_plots_per_function"]
                # A non-positive limit means "plot every scaled column".
                plot_columns = columnsToScale[:max_plots] if max_plots > 0 else columnsToScale
                plot_dir = save_path + "/feature_scaling"
                for col in plot_columns:
                    fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharey=True)
                    sns.histplot(original_data[col], kde=True, ax=axes[0])
                    axes[0].set_title(f"{col} - Before Scaling")
                    sns.histplot(self.dataset.X_train[col], kde=True, ax=axes[1])
                    axes[1].set_title(f"{col} - After Scaling")
                    plt.tight_layout()
                    save_or_store_plot(fig, save_plots, plot_dir, f"{col}.png")
                plotted_count = len(plot_columns)
            except Exception as e:
                raise RuntimeError(f"An error occurred while plotting: {e}") from e

        # Report what actually happened: the plot sentence appears only when
        # plots were requested and carries the real number of plotted columns.
        summary = f"Successfully scaled {len(columnsToScale)} features."
        if save_plots:
            summary += f" Plotted distributions for the first {plotted_count} features."
        return (
            f"{summary}\n"
            f"To check the results run:\n your_pipeline.dataset.X_train.head()"
        )
Source code for library.phases.phases_implementation.data_preprocessing.feature_scaling

Efficient Malware Classifier

Navigation

Related Topics