Source code for library.phases.phases_implementation.dataset.split.strategies.timeSeries


from library.phases.phases_implementation.dataset.split.strategies.base import Split

import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt
import matplotlib.dates as mdates


class TimeSeries(Split):
    """Chronological train/validation/test split strategy.

    The dataset's dataframe is sorted by caller-supplied order columns and
    then cut into three contiguous positional slices (train, validation,
    test), so no shuffling takes place.
    """

    def __init__(self, dataset) -> None:
        super().__init__(dataset)

    def split_data(
        self,
        y_column: str,
        otherColumnsToDrop: "list[str] | None" = None,
        train_size: float = 0.8,
        validation_size: float = 0.1,
        test_size: float = 0.1,
        plot_distribution: bool = True,
        **kwargs,
    ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, pd.Series]:
        """
        Splits the dataframe into training, validation and test sets for time series data

        The dataframe stored in ``self.dataset.df`` is replaced by a copy sorted
        by ``orderColumns``; the resulting splits are stored on ``self.dataset``
        (``X_train``, ``X_val``, ``X_test``, ``y_train``, ``y_val``, ``y_test``)
        and also returned.

        Parameters
        ----------
        y_column : str
            The column name of the target variable
        otherColumnsToDrop : list[str] | None
            The columns to drop from the dataframe (e.g: record identifiers).
            ``None`` (the default) means no extra columns are dropped.
        train_size : float
            The proportion of data to use for training
        validation_size : float
            The proportion of data to use for validation
        test_size : float
            The proportion of data to use for testing. Only used to validate
            that the three proportions sum to 1; the test set is whatever
            remains after the train and validation slices.
        plot_distribution : bool
            Whether to plot the distribution of the features
        **kwargs
            orderColumns : list[str]
                Required. The columns to order the dataframe by
                (e.g., date, timestamp).
            plot_time_splits : bool
                Whether to plot the chronological layout of the splits
                (default ``True``).
            date_column : str
                Feature column used on the x-axis of the time-split plot
                (default ``'dteday'``).

        Returns
        -------
        X_train : pd.DataFrame
            The training set features
        X_val : pd.DataFrame
            The validation set features
        X_test : pd.DataFrame
            The test set features
        y_train : pd.Series
            The training set target
        y_val : pd.Series
            The validation set target
        y_test : pd.Series
            The test set target

        Raises
        ------
        AssertionError
            If the three sizes do not sum to 1 or no order columns are given.
        """
        # Compare with a tolerance: exact float equality rejects valid inputs
        # such as 0.7 + 0.2 + 0.1, which evaluates to 0.9999999999999999.
        total_size = train_size + validation_size + test_size
        assert np.isclose(total_size, 1.0, rtol=0.0, atol=1e-9), "The sum of the sizes must be 1"

        orderColumns = kwargs.get("orderColumns", [])
        plot_time_splits = kwargs.get("plot_time_splits", True)
        date_column = kwargs.get("date_column", "dteday")
        assert len(orderColumns) > 0, "The order columns must be provided"

        # Fresh list per call so no shared default can be mutated downstream.
        columns_to_drop = list(otherColumnsToDrop or [])

        # Order the dataframe by the order columns
        self.dataset.df = self.dataset.df.sort_values(by=orderColumns)

        X, y = super().__get_X_y__(y_column, columns_to_drop)

        # Calculate split indices
        n = len(X)
        train_end = int(n * train_size)
        val_end = train_end + int(n * validation_size)

        # Split the dataframe into training, validation and test sets
        X_train, y_train = X.iloc[:train_end], y.iloc[:train_end]
        X_val, y_val = X.iloc[train_end:val_end], y.iloc[train_end:val_end]
        X_test, y_test = X.iloc[val_end:], y.iloc[val_end:]

        self.dataset.X_train, self.dataset.X_val, self.dataset.X_test = X_train, X_val, X_test
        self.dataset.y_train, self.dataset.y_val, self.dataset.y_test = y_train, y_val, y_test

        if plot_distribution:
            super().plot_per_set_distribution(X.columns)

        if plot_time_splits:
            self.plot_time_splits(date_column=date_column)

        # The signature and docstring promise the six splits; return them.
        return X_train, X_val, X_test, y_train, y_val, y_test

    def plot_time_splits(self, date_column: str = 'dteday') -> None:
        """Plots the time splits of the dataframe

        Draws one row of tick marks per split (train, validation, test)
        against ``date_column`` so that the chronological ordering of the
        splits can be checked visually. Reads ``X_train``, ``X_val`` and
        ``X_test`` from ``self.dataset``.

        Parameters
        ----------
        date_column : str
            Name of the feature column plotted on the x-axis. The axis is
            formatted with a ``'%Y-%m'`` date formatter.
        """
        plt.figure(figsize=(20, 3))

        # (legend label, feature frame, vertical offset of the marker row)
        rows = (
            ('Train', self.dataset.X_train, 1),
            ('Val', self.dataset.X_val, 1.5),
            ('Test', self.dataset.X_test, 2),
        )
        for label, features, height in rows:
            plt.plot(features[date_column], [height] * len(features), '|', label=label)

        plt.legend()
        plt.yticks([])

        ax = plt.gca()
        # Set locator for ticks (more dense)
        ax.xaxis.set_major_locator(mdates.AutoDateLocator())
        # Date format (year-month)
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

        plt.xticks(rotation=45, fontsize=8)  # smaller font
        plt.xlabel('Date')
        plt.title('Chronological Order Check of Train/Val/Test Splits')
        plt.tight_layout()
        plt.show()