import time
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
import matplotlib.dates as mdates
from sklearn.model_selection import train_test_split
import seaborn as sns
import numpy as np
import pandas as pd
import os
from library.phases.phases_implementation.dataset.split.strategies.base import Split
from library.utils.miscellaneous.save_or_store_plot import save_or_store_plot
# ("[docs]" source-link marker from the rendered documentation page; not part of the module.)
class NoTimeSeries(Split):
    """Hold-out splitting strategy for data without a temporal ordering.

    The train/validation/test sets are produced with two successive calls to
    scikit-learn's ``train_test_split`` and stored on ``self.dataset``.
    """

    def __init__(self, dataset) -> None:
        """Forward ``dataset`` to the ``Split`` base class."""
        super().__init__(dataset)

    def split_data(self,
                   y_column: str,
                   otherColumnsToDrop: list[str] = None,
                   train_size: float = 0.8,
                   validation_size: float = 0.1,
                   test_size: float = 0.1,
                   save_plots: bool = False,
                   save_path: str = None
                   ) -> dict:
        """
        Splits the dataframe into training, validation and test sets.

        The six resulting objects are stored on ``self.dataset`` as
        ``X_train``, ``X_val``, ``X_test``, ``y_train``, ``y_val`` and
        ``y_test``; their shapes are printed to stdout.

        Parameters
        ----------
        y_column : str
            The column name of the target variable
        otherColumnsToDrop : list[str], optional
            The columns to drop from the dataframe (e.g: record identifiers).
            ``None`` (the default) means "drop nothing else".
        train_size : float
            The fraction of rows assigned to the training set
        validation_size : float
            The fraction of rows assigned to the validation set
        test_size : float
            The fraction of rows assigned to the test set
        save_plots : bool
            If True, the per-set feature distributions are plotted through
            ``plot_per_set_distribution`` of the base class
        save_path : str
            Location forwarded to ``plot_per_set_distribution``

        Returns
        -------
        dict
            ``X_train.head().to_dict()``, i.e. a small preview of the
            training features (not the split sets themselves).

        Raises
        ------
        ValueError
            If the three sizes do not add up to 1 (within floating-point
            tolerance).
        """
        # Validate before doing any work. A tolerance is required because sums
        # such as 0.7 + 0.2 + 0.1 are not exactly 1.0 in binary floating point.
        if not np.isclose(train_size + validation_size + test_size, 1.0):
            raise ValueError("The sum of the sizes must be 1")

        # A fresh list per call avoids the shared-mutable-default pitfall.
        columns_to_drop = [] if otherColumnsToDrop is None else otherColumnsToDrop
        X, y = self.__get_X_y__(y_column, columns_to_drop)

        seed = self.dataset.random_state
        hold_out_size = validation_size + test_size

        # First cut: training set vs. everything that is held out.
        X_train, X_temp, y_train, y_temp = train_test_split(
            X, y, test_size=hold_out_size, random_state=seed
        )
        # Second cut: divide the held-out rows into validation and test. The
        # test fraction is re-expressed relative to the held-out portion.
        X_val, X_test, y_val, y_test = train_test_split(
            X_temp, y_temp, test_size=test_size / hold_out_size, random_state=seed
        )

        print(f"X_train: {X_train.shape}")
        print(f"X_val: {X_val.shape}")
        print(f"X_test: {X_test.shape}")
        print(f"y_train: {y_train.shape}")
        print(f"y_val: {y_val.shape}")
        print(f"y_test: {y_test.shape}")

        self.dataset.X_train = X_train
        self.dataset.X_val = X_val
        self.dataset.X_test = X_test
        self.dataset.y_train = y_train
        self.dataset.y_val = y_val
        self.dataset.y_test = y_test

        if save_plots:
            super().plot_per_set_distribution(X.columns, save_plots, save_path)
        return self.dataset.X_train.head().to_dict()

    def asses_split_classifier(self, p: float, step: float, upper_bound: float = .50, save_plots: bool = False, save_path: str = None) -> pd.DataFrame:
        """
        Assesses the split of the dataframe for classification tasks.

        For every hold-out size ``step, 2*step, ...`` up to ``upper_bound``
        the hold-out is divided evenly between validation and test, and the
        standard error ``sqrt(p * (1 - p) / n_test)`` of the test set is
        computed together with its change relative to the previous row.

        Parameters
        ----------
        p : float
            The proportion used in the standard-error formula (presumably the
            expected accuracy of the classifier; confirm with callers)
        step : float
            The increment of the hold-out size between rows; must be positive
        upper_bound : float
            The largest hold-out size to evaluate (inclusive)
        save_plots : bool
            If True, the trade-off between training size and SE is plotted
            and handed to ``save_or_store_plot``
        save_path : str
            Base directory for the plot; must be a string when ``save_plots``
            is True because it is concatenated with a sub-path

        Returns
        -------
        df_split_assesment : pd.DataFrame
            One row per hold-out size. Columns labelled ``(%)`` hold
            fractions in [0, 1]. The two ``differenceToPriorSE`` columns are
            NaN in the first row, which has no predecessor. The frame is also
            stored on ``self.df_split_assesment``.

        Raises
        ------
        ValueError
            If the model task is not classification, if ``step`` is not
            positive, or if a hold-out size reaches 1.
        """
        if self.dataset.modelTask != "classification":
            raise ValueError("The model task must be classification")
        if step <= 0:
            # A non-positive step would never reach upper_bound.
            raise ValueError("step must be a positive number")

        # NOTE: "test_size_coount" is misspelled, but it is part of the
        # returned schema, so it is kept for backward compatibility.
        columns = [
            "train_size (%)",
            "train_size_count",
            "validation_size (%)",
            "validation_size_count",
            "test_size (%)",
            "test_size_coount",
            "currentSE",
            "differenceToPriorSE",
            "differenceToPriorSE (%)",
        ]

        n_rows = self.dataset.df.shape[0]
        # Count the steps up front instead of accumulating ``+= step``:
        # repeated float addition drifts and can skip the last grid point
        # (e.g. 0.1 + 0.1 + 0.1 > 0.3). The epsilon absorbs division rounding.
        n_steps = int(np.floor(upper_bound / step + 1e-9))

        rows = []
        prior_se = None  # no predecessor before the first row
        for i in range(1, n_steps + 1):
            # ``min`` clamps a product that overshoots upper_bound by rounding.
            hold_out_size = min(i * step, upper_bound)
            if hold_out_size >= 1:
                raise ValueError("The hold-out size must be smaller than 1")

            half_hold_out = hold_out_size / 2
            train_size_percentage = 1 - hold_out_size
            train_size_count = round(n_rows * train_size_percentage, 0)
            val_size_count = round(n_rows * half_hold_out, 0)
            test_size_count = round(n_rows * half_hold_out, 0)

            # The SE is undefined for an empty test set; report NaN instead
            # of failing with a ZeroDivisionError.
            if test_size_count > 0:
                current_se = np.sqrt((p * (1 - p)) / test_size_count)
            else:
                current_se = np.nan

            if prior_se is None:
                difference = np.nan
                difference_percentage = np.nan
            else:
                difference = current_se - prior_se
                difference_percentage = difference / prior_se
            prior_se = current_se

            rows.append({
                "train_size (%)": train_size_percentage,
                "train_size_count": train_size_count,
                "validation_size (%)": half_hold_out,
                "validation_size_count": val_size_count,
                "test_size (%)": half_hold_out,
                "test_size_coount": test_size_count,
                "currentSE": current_se,
                "differenceToPriorSE": difference,
                "differenceToPriorSE (%)": difference_percentage,
            })

        # Build the frame once rather than concatenating inside the loop.
        df_split_assesment = pd.DataFrame(rows, columns=columns)

        if save_plots:
            fig, ax1 = plt.subplots()

            color = 'tab:blue'
            ax1.set_xlabel('Training Set Percentage')
            ax1.set_ylabel('Current SE', color=color)
            ax1.plot(df_split_assesment["train_size (%)"], df_split_assesment["currentSE"], marker='o', color=color)
            ax1.tick_params(axis='y', labelcolor=color)
            ax1.xaxis.set_major_locator(MultipleLocator(0.05))

            # Second y-axis for the relative change; the first row is skipped
            # because it has no prior SE to compare against.
            ax2 = ax1.twinx()
            color = 'tab:red'
            ax2.set_ylabel('Difference to Prior SE (%)', color=color)
            ax2.plot(df_split_assesment["train_size (%)"][1:], df_split_assesment["differenceToPriorSE (%)"][1:], marker='x', linestyle='--', color=color)
            ax2.tick_params(axis='y', labelcolor=color)

            plt.title('Holdout Split Trade-Off: Training Set vs SE')
            save_or_store_plot(fig, save_plots, save_path + "/split/split_trade_off", "split_trade_off.png")

        self.df_split_assesment = df_split_assesment
        return df_split_assesment