# Source code for library.phases.phases_implementation.data_preprocessing.uncomplete_data
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import scipy.stats as stats
from library.phases.phases_implementation.dataset.dataset import Dataset
from library.utils.miscellaneous.save_or_store_plot import save_or_store_plot
# [docs]
class UncompleteData:
def __init__(self, dataset: Dataset) -> None:
self.dataset = dataset
[docs]
def analyze_duplicates(self, save_plots: bool = False, save_path: str = None) -> str:
"""Report and optionally visualise duplicate rows.
Parameters
----------
save_plots : bool, default=False
If *True*, a barplot of duplicate counts per column is displayed.
save_path : str
The path to save the plot.
Returns
-------
str
Diagnostic string with the number of duplicate rows found.
"""
# --- Input validation ---
if not isinstance(save_plots, bool):
raise TypeError("Parameter 'save_plots' must be a boolean.")
# --- Dataset structure check ---
if not hasattr(self.dataset, "df"):
raise AttributeError("The dataset does not contain an attribute named 'df'.")
if not hasattr(self.dataset.df, "duplicated"):
raise TypeError("self.dataset.df must be a pandas DataFrame.")
# --- Duplicate analysis ---
try:
duplicates = self.dataset.df.duplicated()
duplicates_sum = duplicates.sum()
except Exception as e:
raise RuntimeError(f"Error checking for duplicates: {e}")
# --- Plotting if requested ---
if save_plots:
if duplicates_sum > 0:
try:
duplicates_by_column = self.dataset.df[duplicates].count()
feature_names = [f'{i+1}' for i in range(len(duplicates_by_column))]
fig, ax = plt.figure(figsize=(15, 4))
sns.barplot(x=feature_names, y=duplicates_by_column.values)
plt.title("Number of Duplicates by Column")
plt.xlabel("Features")
plt.ylabel("Number of Duplicates")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
save_or_store_plot(fig, save_plots, save_path + "/uncomplete_data/duplicates", "duplicates_by_column.png")
except Exception as e:
raise RuntimeError(f"An error occurred while plotting: {e}")
else:
print("No duplicates found in the dataset, no need to plot")
else:
if duplicates_sum == 0:
print("No duplicates found in the dataset")
return f"There are {duplicates_sum} duplicates in the dataset"
[docs]
def remove_duplicates(self) -> str:
"""
Removes duplicates from the dataset
Returns
-------
str
Message indicating the number of duplicates removed
"""
# --- Check that dataset has a DataFrame ---
if not hasattr(self.dataset, "df"):
raise AttributeError("The dataset does not contain an attribute named 'df'.")
if not hasattr(self.dataset.df, "duplicated"):
raise TypeError("self.dataset.df must be a pandas DataFrame.")
try:
duplicates = self.dataset.df.duplicated()
duplicates_sum = duplicates.sum()
except Exception as e:
raise RuntimeError(f"An error occurred while checking for duplicates: {e}")
if duplicates_sum > 0:
try:
print(f"Dataset duplicates:\n{self.dataset.df[duplicates]}")
print(f"There are {duplicates_sum} duplicates in the dataset")
self.dataset.df.drop_duplicates(inplace=True)
except Exception as e:
raise RuntimeError(f"An error occurred while removing duplicates: {e}")
return "Successfully removed duplicates from the dataset"
else:
return "No duplicates found in the dataset"
[docs]
def get_missing_values(self, placeholders: list[str] | None = None, save_plots: bool = False, save_path: str = None):
"""
Return the subset of rows that contain *any* missing value.
Parameters
----------
placeholders : list[str] | None
Additional strings that should be considered *NA* (e.g., "N/A", "-1").
save_plots : bool, default=False
When *True*, show a barplot of missing counts per column.
save_path : str
The path to save the plot.
Returns
-------
pandas.DataFrame | None
Rows that include at least one missing value or *None* if the dataset is complete.
"""
# --- Validation ---
if not hasattr(self.dataset, "df"):
raise AttributeError("The dataset does not contain an attribute named 'df'.")
if not hasattr(self.dataset.df, "isnull"):
raise TypeError("self.dataset.df must be a pandas DataFrame.")
if not isinstance(save_plots, bool):
raise TypeError("Parameter 'plot' must be a boolean.")
try:
# Count native NaNs
missing_values_sum = self.dataset.df.isnull().sum().sum()
# Include custom placeholders
if placeholders:
for placeholder in placeholders:
missing_values_sum += (self.dataset.df == placeholder).sum().sum()
if missing_values_sum > 0:
print(f"Dataset contains {missing_values_sum} missing values")
# Identify rows with missing values or placeholders
condition = self.dataset.df.isnull().any(axis=1)
if placeholders:
for placeholder in placeholders:
condition |= self.dataset.df.eq(placeholder).any(axis=1)
rows_with_missing = self.dataset.df[condition]
print(f"Rows with missing values:\n{rows_with_missing}")
if save_plots:
try:
missing_values_by_column = self.dataset.df.isnull().sum()
fig, ax = plt.figure(figsize=(15, 4))
sns.barplot(x=self.dataset.df.columns, y=missing_values_by_column.values)
plt.title("Missing Values by Column")
plt.xlabel("Features")
plt.ylabel("Number of Missing Values")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
save_or_store_plot(fig, save_plots, save_path + "/uncomplete_data/missing_values", "missing_values_by_column.png")
except Exception as e:
raise RuntimeError(f"An error occurred while plotting: {e}")
return rows_with_missing
else:
msg = "No missing values found in the dataset"
print(msg if not save_plots else msg + ", no need to plot")
return msg
except Exception as e:
raise RuntimeError(f"An error occurred while analyzing missing values: {e}")