Source code for library.phases.phases_implementation.data_preprocessing.data_preprocessing

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import scipy.stats as stats
from library.phases.phases_implementation.dataset.dataset import Dataset
from library.phases.phases_implementation.data_preprocessing.uncomplete_data import UncompleteData
from library.phases.phases_implementation.data_preprocessing.class_imbalance import ClassImbalance
from library.phases.phases_implementation.data_preprocessing.feature_scaling import FeatureScaling
from library.phases.phases_implementation.data_preprocessing.outliers_bounds import OutliersBounds
import random


[docs]
class Preprocessing:
    """Entry point that bundles the preprocessing helpers for one dataset.

    Every helper object created in ``__init__`` receives the same ``dataset``
    instance that is stored on ``self.dataset``.
    """

    # Names of the feature-matrix attributes that ``delete_columns`` expects
    # on the dataset and keeps in sync with each other.
    _FEATURE_SPLITS = ("X_train", "X_val", "X_test")

    def __init__(self, dataset: Dataset) -> None:
        """
        Stores the dataset and builds one helper object per preprocessing step

        Parameters:
        -----------
        dataset : Dataset
            The dataset shared by this object and all of its helpers
        """
        self.dataset = dataset
        self.uncomplete_data_obj = UncompleteData(dataset=self.dataset)
        self.class_imbalance_obj = ClassImbalance(dataset=self.dataset)
        self.feature_scaling_obj = FeatureScaling(dataset=self.dataset)
        self.outliers_bounds_obj = OutliersBounds(dataset=self.dataset)

    def delete_columns(self, columnsToDelete: list[str]) -> str:
        """
        Deletes the given columns, in place, from X_train, X_val and X_test

        All validation happens before anything is dropped, so a rejected
        request leaves the three feature matrices untouched. Column names
        listed more than once are dropped (and counted) only once.

        Parameters:
        -----------
        columnsToDelete : list[str]
            The columns to delete

        Returns:
        --------
        str
            Message indicating the number of columns deleted

        Raises:
        -------
        TypeError
            If columnsToDelete is not a list of strings
        AttributeError
            If the dataset lacks X_train, X_val or X_test
        ValueError
            If any requested column is absent from any of the three splits
        RuntimeError
            If dropping the columns fails; the original error is chained
        """
        # Validate input type
        if not isinstance(columnsToDelete, list) or not all(
            isinstance(col, str) for col in columnsToDelete
        ):
            raise TypeError("columnsToDelete must be a list of strings.")

        # Collapse repeated names (keeping first-seen order) so the reported
        # count matches the number of columns that are actually removed.
        unique_columns = list(dict.fromkeys(columnsToDelete))

        # Validate dataset attributes
        for attr in self._FEATURE_SPLITS:
            if not hasattr(self.dataset, attr):
                raise AttributeError(f"The dataset is missing the attribute '{attr}'.")

        # Check that all columns exist in all datasets
        missing_cols = {
            attr: [
                col
                for col in unique_columns
                if col not in getattr(self.dataset, attr).columns
            ]
            for attr in self._FEATURE_SPLITS
        }
        errors = [
            f"{attr} is missing columns: {cols}"
            for attr, cols in missing_cols.items()
            if cols
        ]
        if errors:
            raise ValueError("Some columns to delete are missing:\n" + "\n".join(errors))

        # Drop in place so that any other holder of these DataFrame objects
        # observes the change as well.
        try:
            for attr in self._FEATURE_SPLITS:
                getattr(self.dataset, attr).drop(columns=unique_columns, inplace=True)
        except Exception as e:
            raise RuntimeError(f"An error occurred while deleting columns: {e}") from e

        return (f"Successfully deleted {len(unique_columns)} columns. "
                "To check the results, run: baseline_pipeline.dataset.X_train.head()")
Source code for library.phases.phases_implementation.data_preprocessing.data_preprocessing

Efficient Malware Classifier

Navigation

Related Topics