Source code for library.phases.phases_implementation.dataset.dataset

import time

import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
import matplotlib.dates as mdates


import seaborn as sns
import numpy as np
import pandas as pd


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report

from imblearn.over_sampling import SMOTENC
from boruta import BorutaPy
from statsmodels.stats.outliers_influence import variance_inflation_factor


from library.phases.phases_implementation.dataset.split.strategies.noTimeSeries import NoTimeSeries
from library.phases.phases_implementation.dataset.split.strategies.timeSeries import TimeSeries


# Default seed for the dataset pipeline; used as the default ``random_state``
# of ``Dataset``.
RANDOM_STATE = 99


class Dataset:
    """Load a CSV file into a dataframe and attach a split strategy to it.

    Attributes
    ----------
    df : pd.DataFrame
        The dataframe read from ``dataset_path``.
    random_state : int
        The seed passed to the constructor.
    modelTask : str
        The part of ``model_task`` before the first underscore
        (``"classification"`` or ``"regression"``).
    isTimeSeries : bool
        True when ``model_task`` ends with the ``timeSeries`` suffix.
    split
        The strategy object returned by ``create_split_strategy``.
    """

    # Accepted values for ``model_task``. The ``_timeSeries`` suffix marks the
    # dataset as a time series.
    _VALID_MODEL_TASKS = (
        "classification_timeSeries",
        "regression_timeSeries",
        "classification",
        "regression",
    )

    def __init__(self, dataset_path: str, model_task: str, random_state: int = RANDOM_STATE) -> None:
        """Create the dataframe from a CSV file and select the split strategy.

        Parameters
        ----------
        dataset_path : str
            Path of the CSV file to load with ``pd.read_csv``.
        model_task : str
            One of ``"classification"``, ``"regression"``,
            ``"classification_timeSeries"`` or ``"regression_timeSeries"``.
        random_state : int
            The random state to store on the instance.

        Raises
        ------
        AssertionError
            If ``model_task`` is not one of the accepted values.
        """
        if model_task not in self._VALID_MODEL_TASKS:
            # Raised explicitly instead of through an ``assert`` statement so
            # the check is not removed when Python runs with ``-O``. The
            # exception type and message are kept for existing callers.
            raise AssertionError("The model task must be either classification or regression")

        self.df = pd.read_csv(dataset_path)
        self.random_state = random_state

        task_parts = model_task.split("_")
        self.modelTask = task_parts[0]
        self.isTimeSeries = len(task_parts) > 1 and (task_parts[-1] == "timeSeries")
        self.split = create_split_strategy(self, self.isTimeSeries)

    def eliminate_variables(self, variables_to_eliminate: list[str], after_split: bool = False) -> None:
        """Drop the given columns in place.

        Parameters
        ----------
        variables_to_eliminate : list[str]
            Names of the columns to drop.
        after_split : bool
            When False (default) the columns are dropped from ``self.df``.
            When True they are dropped from ``self.X_train``, ``self.X_val``
            and ``self.X_test`` instead. Those attributes are not created in
            ``__init__``; presumably the split step assigns them (confirm
            against the split strategies).

        Raises
        ------
        AttributeError
            If ``after_split`` is True and one of the split frames has not
            been assigned yet. All three are looked up before any of them is
            modified, so this error leaves the instance unchanged.
        """
        if after_split:
            frames = (self.X_train, self.X_val, self.X_test)
        else:
            frames = (self.df,)

        for frame in frames:
            frame.drop(columns=variables_to_eliminate, inplace=True)
def create_split_strategy(dataset, is_time_series: bool = False):
    """Build the split strategy that matches the kind of dataset.

    Parameters
    ----------
    dataset : Dataset
        The dataset handed to the strategy's constructor.
    is_time_series : bool
        Truthy selects ``TimeSeries``; otherwise ``NoTimeSeries`` is used.

    Returns
    -------
    Split
        A ``TimeSeries`` or ``NoTimeSeries`` instance built from ``dataset``.
    """
    # Pick the class first, then instantiate it once.
    strategy_cls = TimeSeries if is_time_series else NoTimeSeries
    return strategy_cls(dataset)