Source code for library.phases.phases_implementation.data_preprocessing.data_preprocessing

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import scipy.stats as stats
from library.phases.phases_implementation.dataset.dataset import Dataset
from library.phases.phases_implementation.data_preprocessing.uncomplete_data import UncompleteData
from library.phases.phases_implementation.data_preprocessing.class_imbalance import ClassImbalance
from library.phases.phases_implementation.data_preprocessing.feature_scaling import FeatureScaling
from library.phases.phases_implementation.data_preprocessing.outliers_bounds import OutliersBounds
import random

class Preprocessing:
    """Preprocessing phase bound to a single dataset.

    Stores the dataset and builds four helper objects (``UncompleteData``,
    ``ClassImbalance``, ``FeatureScaling`` and ``OutliersBounds``), each
    constructed with that same dataset instance.
    """

    # Dataset attributes holding the feature splits; column deletion is
    # applied to all of them, in this order, so they keep the same columns.
    _FEATURE_SPLITS = ("X_train", "X_val", "X_test")

    def __init__(self, dataset: Dataset) -> None:
        """Store ``dataset`` and create the helper objects that share it.

        Parameters
        ----------
        dataset : Dataset
            The dataset this preprocessing phase operates on.
        """
        self.dataset = dataset
        self.uncomplete_data_obj = UncompleteData(dataset=self.dataset)
        self.class_imbalance_obj = ClassImbalance(dataset=self.dataset)
        self.feature_scaling_obj = FeatureScaling(dataset=self.dataset)
        self.outliers_bounds_obj = OutliersBounds(dataset=self.dataset)

    def delete_columns(self, columnsToDelete: list[str]) -> str:
        """Delete columns, in place, from ``X_train``, ``X_val`` and ``X_test``.

        All validation happens before any split is modified. The drops
        themselves are applied split by split, so if a drop fails midway the
        splits processed earlier remain modified.

        Parameters
        ----------
        columnsToDelete : list[str]
            Names of the columns to delete. Repeated names are deleted (and
            counted) only once.

        Returns
        -------
        str
            Message indicating the number of distinct columns deleted.

        Raises
        ------
        TypeError
            If ``columnsToDelete`` is not a list of strings.
        AttributeError
            If the dataset lacks ``X_train``, ``X_val`` or ``X_test``.
        ValueError
            If any requested column is absent from any of the splits.
        RuntimeError
            If dropping the columns fails; the underlying exception is
            attached as ``__cause__``.
        """
        # Validate input type
        if not isinstance(columnsToDelete, list) or not all(
            isinstance(col, str) for col in columnsToDelete
        ):
            raise TypeError("columnsToDelete must be a list of strings.")

        # Validate dataset attributes
        for attr in self._FEATURE_SPLITS:
            if not hasattr(self.dataset, attr):
                raise AttributeError(
                    f"The dataset is missing the attribute '{attr}'."
                )

        # Collapse repeated names (keeping first-seen order) so the reported
        # count matches the number of columns that are actually removed.
        unique_columns = list(dict.fromkeys(columnsToDelete))

        # Check that all columns exist in all datasets
        missing_cols = {
            attr: [
                col for col in unique_columns
                if col not in getattr(self.dataset, attr).columns
            ]
            for attr in self._FEATURE_SPLITS
        }
        errors = [
            f"{attr} is missing columns: {cols}"
            for attr, cols in missing_cols.items()
            if cols
        ]
        if errors:
            raise ValueError(
                "Some columns to delete are missing:\n" + "\n".join(errors)
            )

        # Drop in place so existing references to the split objects see the
        # change, exactly as before.
        try:
            for attr in self._FEATURE_SPLITS:
                getattr(self.dataset, attr).drop(
                    columns=unique_columns, inplace=True
                )
        except Exception as e:
            # Chain explicitly so the original failure stays in the traceback.
            raise RuntimeError(
                f"An error occurred while deleting columns: {e}"
            ) from e

        return (
            f"Successfully deleted {len(unique_columns)} columns. "
            "To check the results, run: baseline_pipeline.dataset.X_train.head()"
        )