Source code for library.phases.phases_implementation.dataset.split.strategies.noTimeSeries

import time

import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
import matplotlib.dates as mdates

from sklearn.model_selection import train_test_split

import seaborn as sns
import numpy as np
import pandas as pd
import os

from library.phases.phases_implementation.dataset.split.strategies.base import Split
from library.utils.miscellaneous.save_or_store_plot import save_or_store_plot


[docs]
class NoTimeSeries(Split):
      def __init__(self, dataset) -> None:
            """
            Create the strategy by forwarding ``dataset`` to the ``Split`` initialiser.

            Parameters
            ----------
            dataset
                  Object handed unchanged to ``Split.__init__``. The other methods of
                  this class read and write ``self.dataset`` (presumably set by the
                  base class from this argument; confirm in ``Split``).
            """
            super().__init__(dataset)


[docs]
      def split_data(self,
                 y_column: str,
                 otherColumnsToDrop: list[str] = None,
                 train_size: float = 0.8,
                 validation_size: float = 0.1,
                 test_size: float = 0.1,
                 save_plots: bool = False,
                 save_path: str = None
                 ) -> dict:
            """
            Split the dataset into training, validation and test sets.

            The hold-out share (validation + test) is first separated from the
            training data and then divided into validation and test. Both splits use
            ``self.dataset.random_state``. The six resulting partitions are stored on
            ``self.dataset`` as ``X_train``, ``X_val``, ``X_test``, ``y_train``,
            ``y_val`` and ``y_test``, and their shapes are printed.

            Parameters
            ----------
            y_column : str
                  The column name of the target variable
            otherColumnsToDrop : list[str], optional
                  The columns to drop from the dataframe (e.g: record identifiers).
                  ``None`` (the default) means no extra columns are dropped
            train_size : float
                  The fraction of rows assigned to the training set
            validation_size : float
                  The fraction of rows assigned to the validation set
            test_size : float
                  The fraction of rows assigned to the test set
            save_plots : bool
                  If True, ``plot_per_set_distribution`` of the base class is called
                  with the feature columns, ``save_plots`` and ``save_path``
            save_path : str
                  Forwarded to ``plot_per_set_distribution`` when ``save_plots`` is True

            Returns
            -------
            dict
                  ``self.dataset.X_train.head().to_dict()``, i.e. a preview of the first
                  rows of the training features

            Raises
            ------
            ValueError
                  If the three sizes do not add up to 1 (within floating point tolerance)
            """
            # Validate before touching the dataset so that a bad call has no side effects.
            # A tolerance is needed because e.g. 0.7 + 0.2 + 0.1 != 1.0 in binary floats.
            sizes_total = train_size + validation_size + test_size
            if not np.isclose(sizes_total, 1.0):
                  raise ValueError("The sum of the sizes must be 1")

            # A fresh list per call avoids sharing one mutable default between calls.
            columns_to_drop = [] if otherColumnsToDrop is None else otherColumnsToDrop
            X, y = self.__get_X_y__(y_column, columns_to_drop)

            hold_out_size = validation_size + test_size
            X_train, X_hold, y_train, y_hold = train_test_split(
                  X, y, test_size=hold_out_size, random_state=self.dataset.random_state
            )
            # The second split works on the hold-out only, so the test share has to be
            # expressed relative to the hold-out instead of the full dataset.
            X_val, X_test, y_val, y_test = train_test_split(
                  X_hold, y_hold, test_size=test_size / hold_out_size, random_state=self.dataset.random_state
            )

            partitions = (
                  ("X_train", X_train),
                  ("X_val", X_val),
                  ("X_test", X_test),
                  ("y_train", y_train),
                  ("y_val", y_val),
                  ("y_test", y_test),
            )
            for label, part in partitions:
                  print(f"{label}: {part.shape}")

            self.dataset.X_train, self.dataset.X_val, self.dataset.X_test = X_train, X_val, X_test
            self.dataset.y_train, self.dataset.y_val, self.dataset.y_test = y_train, y_val, y_test

            if save_plots:
                  super().plot_per_set_distribution(X.columns, save_plots, save_path)
            return self.dataset.X_train.head().to_dict()




[docs]
      def asses_split_classifier(self, p: float, step: float, upper_bound: float = .50, save_plots: bool = False, save_path: str = None) -> pd.DataFrame:
            """
            Tabulate how the hold-out size affects the standard error of a proportion.

            For every hold-out fraction ``step, 2 * step, ...`` up to ``upper_bound`` the
            hold-out is divided evenly between validation and test, and the standard
            error ``sqrt(p * (1 - p) / n_test)`` is computed for the resulting number of
            test rows. The table is also stored on ``self.df_split_assesment``.

            Parameters
            ----------
            p : float
                  Proportion used in the standard error formula (presumably the expected
                  accuracy of the classifier; confirm with callers)
            step : float
                  Increment of the hold-out fraction between consecutive rows; must be
                  positive
            upper_bound : float
                  Largest hold-out fraction to evaluate (inclusive)
            save_plots : bool
                  If True, the trade-off figure is built and handed to
                  ``save_or_store_plot``
            save_path : str
                  Base path for the figure; required when ``save_plots`` is True

            Returns
            -------
            df_split_assesment : pd.DataFrame
                  One row per hold-out fraction with the set sizes, the standard error
                  and its absolute and relative change versus the previous row. The
                  first row has no previous row, so both change columns are NaN

            Raises
            ------
            ValueError
                  If the model task is not classification, if ``step`` is not positive,
                  if a hold-out fraction would reach 1, or if ``save_plots`` is True
                  without a ``save_path``
            """
            if self.dataset.modelTask != "classification":
                  raise ValueError("The model task must be classification")
            if step <= 0:
                  # A non-positive step would never move past upper_bound.
                  raise ValueError("step must be a positive number")
            if save_plots and save_path is None:
                  raise ValueError("save_path must be provided when save_plots is True")

            # Build the fractions from integer multiples of step instead of repeated
            # addition: accumulated float error otherwise skips upper_bound itself
            # (0.1 + 0.1 + 0.1 > 0.3). The small epsilon absorbs division round-off.
            n_steps = int(np.floor(upper_bound / step + 1e-9))
            hold_out_sizes = [round(i * step, 10) for i in range(1, n_steps + 1)]
            if hold_out_sizes and hold_out_sizes[-1] >= 1:
                  raise ValueError("The hold-out size must be smaller than 1")

            columns = [
                  "train_size (%)",
                  "train_size_count",
                  "validation_size (%)",
                  "validation_size_count",
                  "test_size (%)",
                  # NOTE: misspelled key kept on purpose; it is part of the returned frame.
                  "test_size_coount",
                  "currentSE",
                  "differenceToPriorSE",
                  "differenceToPriorSE (%)",
            ]

            n_records = self.dataset.df.shape[0]
            rows = []
            prior_se = None
            for hold_out_size in hold_out_sizes:
                  train_size_percentage = 1 - hold_out_size
                  train_size_count = round(n_records * train_size_percentage, 0)

                  # Validation and test each receive half of the hold-out.
                  half_hold_out = hold_out_size / 2
                  val_size_count = round(n_records * half_hold_out, 0)
                  test_size_count = round(n_records * half_hold_out, 0)

                  # Without test rows the standard error is undefined.
                  if test_size_count > 0:
                        current_se = np.sqrt((p * (1 - p)) / test_size_count)
                  else:
                        current_se = np.nan

                  if prior_se is None:
                        # First row: there is nothing to compare against.
                        difference = np.nan
                        difference_percentage = np.nan
                  else:
                        difference = current_se - prior_se
                        difference_percentage = difference / prior_se if prior_se != 0 else np.nan
                  prior_se = current_se

                  rows.append({
                        "train_size (%)": train_size_percentage,
                        "train_size_count": train_size_count,
                        "validation_size (%)": half_hold_out,
                        "validation_size_count": val_size_count,
                        "test_size (%)": half_hold_out,
                        "test_size_coount": test_size_count,
                        "currentSE": current_se,
                        "differenceToPriorSE": difference,
                        "differenceToPriorSE (%)": difference_percentage,
                  })

            # Building the frame once avoids the quadratic cost of concatenating per row.
            df_split_assesment = pd.DataFrame(rows, columns=columns)

            if save_plots:
                  fig, ax1 = plt.subplots()

                  color = 'tab:blue'
                  ax1.set_xlabel('Training Set Percentage')
                  ax1.set_ylabel('Current SE', color=color)
                  ax1.plot(df_split_assesment["train_size (%)"], df_split_assesment["currentSE"], marker='o', color=color)
                  ax1.tick_params(axis='y', labelcolor=color)

                  ax1.xaxis.set_major_locator(MultipleLocator(0.05))

                  # Second y-axis for the relative change; the first row is skipped
                  # because it has no previous standard error.
                  ax2 = ax1.twinx()
                  color = 'tab:red'
                  ax2.set_ylabel('Difference to Prior SE (%)', color=color)
                  ax2.plot(df_split_assesment["train_size (%)"][1:], df_split_assesment["differenceToPriorSE (%)"][1:], marker='x', linestyle='--', color=color)
                  ax2.tick_params(axis='y', labelcolor=color)

                  plt.title('Holdout Split Trade-Off: Training Set vs SE')
                  save_or_store_plot(fig, save_plots, save_path + "/split/split_trade_off", "split_trade_off.png")

            self.df_split_assesment = df_split_assesment
            return df_split_assesment
Source code for library.phases.phases_implementation.dataset.split.strategies.noTimeSeries

Efficient Malware Classifier

Navigation

Related Topics