Source code for library.phases.phases_implementation.data_preprocessing.feature_scaling
from library.phases.phases_implementation.dataset.dataset import Dataset
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from library.utils.miscellaneous.save_or_store_plot import save_or_store_plot
import yaml
[docs]
class FeatureScaling:
    """
    Scales selected feature columns of a dataset's train/validation/test splits.

    Attributes
    ----------
    dataset : Dataset
        The dataset whose ``X_train``, ``X_val`` and ``X_test`` frames are
        scaled in place.
    variables : dict
        Parsed contents of ``library/configurations.yaml``.
    """

    def __init__(self, dataset: "Dataset") -> None:
        """
        Stores the dataset and loads the project configuration.

        Parameters
        ----------
        dataset : Dataset
            The dataset to operate on
        """
        self.dataset = dataset
        # NOTE: the path is relative to the current working directory.
        # The context manager guarantees the file handle is closed.
        # NOTE(review): FullLoader is kept for compatibility; if the
        # configuration only holds plain scalars/lists/maps, yaml.safe_load
        # would be the safer choice.
        with open("library/configurations.yaml") as config_file:
            self.variables = yaml.load(config_file, Loader=yaml.FullLoader)

    @staticmethod
    def _create_scaler(scaler: str):
        """Returns a new scaler instance for the given scaler name."""
        if scaler == "minmax":
            return MinMaxScaler()
        if scaler == "robust":
            return RobustScaler()
        if scaler == "standard":
            return StandardScaler()
        raise ValueError(f"Invalid scaler: {scaler}. Choose from 'minmax', 'robust', or 'standard'.")

    def _plot_before_after(self, original_data, columns: list[str], save_path: str) -> int:
        """Saves before/after histograms for the columns and returns how many were plotted."""
        max_plots = self.variables["PIPELINE_RUNNER"]["max_plots_per_function"]
        # A non-positive limit means "plot every column".
        plot_columns = columns[:max_plots] if max_plots > 0 else columns
        for col in plot_columns:
            fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharey=True)
            sns.histplot(original_data[col], kde=True, ax=axes[0])
            axes[0].set_title(f"{col} - Before Scaling")
            sns.histplot(self.dataset.X_train[col], kde=True, ax=axes[1])
            axes[1].set_title(f"{col} - After Scaling")
            plt.tight_layout()
            # This helper is only reached when plots were requested, hence True.
            # NOTE(review): figures are not closed here; confirm that
            # save_or_store_plot releases them, otherwise close them after saving.
            save_or_store_plot(fig, True, f"{save_path}/feature_scaling", f"{col}.png")
        return len(plot_columns)

    def scale_features(
        self,
        scaler: str,
        columnsToScale: list[str],
        save_plots: bool = False,
        save_path: str = None
    ) -> str:
        """
        Scales the features in the dataset

        The scaler is fitted on ``X_train`` only and then applied to
        ``X_val`` and ``X_test``. The three frames are modified in place,
        and only after all three transformations have succeeded.

        Parameters
        ----------
        scaler : str
            The scaler to use ('minmax', 'robust', 'standard')
        columnsToScale : list[str]
            The columns to scale
        save_plots : bool
            Whether to save plots before and after scaling
        save_path : str
            The path to save the plots (required when save_plots is True)

        Returns
        -------
        str
            Message indicating the number of features scaled and, when
            plots were requested, the number of features plotted

        Raises
        ------
        TypeError
            If 'scaler' is not a string, 'columnsToScale' is a bare string,
            or 'save_plots' is not a boolean
        ValueError
            If no columns are given, 'save_path' is missing while plots are
            requested, a column is missing from a split, or the scaler name
            is unknown
        AttributeError
            If the dataset lacks 'X_train', 'X_val' or 'X_test'
        RuntimeError
            If copying the original data, scaling, or plotting fails
        """
        # --- Input validation ---
        if not isinstance(scaler, str):
            raise TypeError("Parameter 'scaler' must be a string.")
        if isinstance(columnsToScale, str):
            # A bare string would otherwise be iterated character by character.
            raise TypeError("Parameter 'columnsToScale' must be a list of column names, not a string.")
        # Normalise to a list so that DataFrame indexing selects several columns.
        columns = list(columnsToScale)
        if not columns:
            raise ValueError("At least one column must be provided for scaling.")
        if not isinstance(save_plots, bool):
            raise TypeError("Parameter 'save_plots' must be a boolean.")
        if save_plots and save_path is None:
            # Fail before touching the dataset instead of after it was scaled.
            raise ValueError("Parameter 'save_path' must be provided when 'save_plots' is True.")

        # --- Dataset validation ---
        for attr in ['X_train', 'X_val', 'X_test']:
            if not hasattr(self.dataset, attr):
                raise AttributeError(f"The dataset is missing the attribute '{attr}'.")
            available = getattr(self.dataset, attr).columns
            missing_cols = [col for col in columns if col not in available]
            if missing_cols:
                raise ValueError(f"The following columns are missing in '{attr}': {missing_cols}")

        # --- Scaler selection ---
        scaler_obj = self._create_scaler(scaler)

        # --- Optional: store original data for plotting ---
        original_data = None
        if save_plots:
            try:
                original_data = self.dataset.X_train[columns].copy()
            except Exception as e:
                raise RuntimeError(f"Failed to copy original data for plotting: {e}") from e

        # --- Apply scaling ---
        try:
            # Compute every transformation first so that a failure on one
            # split does not leave the dataset partially scaled.
            scaled_train = scaler_obj.fit_transform(self.dataset.X_train[columns])
            scaled_val = scaler_obj.transform(self.dataset.X_val[columns])
            scaled_test = scaler_obj.transform(self.dataset.X_test[columns])
            self.dataset.X_train[columns] = scaled_train
            self.dataset.X_val[columns] = scaled_val
            self.dataset.X_test[columns] = scaled_test
        except Exception as e:
            raise RuntimeError(f"An error occurred during scaling: {e}") from e

        # --- Optional: plot distributions ---
        plotted_count = 0
        if save_plots:
            try:
                plotted_count = self._plot_before_after(original_data, columns, save_path)
            except Exception as e:
                raise RuntimeError(f"An error occurred while plotting: {e}") from e

        # --- Result message (reports what was actually plotted) ---
        summary = f"Successfully scaled {len(columns)} features."
        if plotted_count:
            summary += f" Plotted distributions for the first {plotted_count} features."
        return summary + "\nTo check the results run:\n your_pipeline.dataset.X_train.head()"