Source code for library.phases.runners.dataset_runner
import numpy as np
from library.utils.phase_runner_definition.phase_runner import PhaseRunner
from library.pipeline.pipeline_manager import PipelineManager
[docs]
class DatasetRunner(PhaseRunner):
    """Phase runner for the dataset stage.

    Applies pre-split feature engineering to the non-baseline pipelines,
    then assesses and performs the train/validation/test split on the
    default (first) pipeline's dataset.
    """

    def __init__(self, pipeline_manager: PipelineManager, include_plots: bool = False, save_path: str = "") -> None:
        """Initialize the runner.

        Args:
            pipeline_manager: Manager holding the pipelines and the
                ``variables`` configuration this phase reads.
            include_plots: Whether split-assessment plots are saved.
            save_path: Directory where plots are written when enabled.
        """
        super().__init__(pipeline_manager, include_plots, save_path)

    def _ensembled_pipeline_feature_engineering(self) -> None:
        """Feature engineering for the "ensembled" pipeline (placeholder).

        Mutates the pipeline's DataFrame in place once transformations are
        added.
        """
        # FEDE (expand here)
        ensembled_pipeline = self.pipeline_manager.pipelines["not_baseline"]["ensembled"]
        df = ensembled_pipeline.dataset.df  # noqa: F841 — placeholder until transformations are added
        # Example transformation, kept for reference:
        # df["Name_of_some_column"] = np.log(df["Name_of_some_column"])

    def _run_feature_engineering_pre_split(self) -> None:
        """Run all pre-split feature engineering steps.

        TO READ BY FEDE:
        - Each pipeline has its own function (just to make it easier to read).
        - Above you have an example of applying a log transformation to a
          specific column.
        - Note that currently all pipelines are the same because divergence
          happens later, in modelling (and Juan will probably move it up to
          data preprocessing). If you want to do something here that is
          pipeline-specific, you need to create the divergence first.
        """
        # FEDE (expand here)
        # Pre-split transformations only; post-split steps belong elsewhere.
        self._ensembled_pipeline_feature_engineering()

    def run(self) -> tuple:
        """Execute the dataset phase.

        Returns:
            tuple: ``(split_df, encoding_df)`` — the split-assessment result
            and the result of performing the data split.
        """
        # Feature engineering mutates the shared dataset in place; its
        # return value is None by contract, so it is not captured.
        self._run_feature_engineering_pre_split()

        # All non-baseline pipelines currently share the same dataset, so
        # the first one is used as the default for splitting.
        pipelines = list(self.pipeline_manager.pipelines["not_baseline"].values())
        default_pipeline = pipelines[0]

        split_df = default_pipeline.dataset.split.asses_split_classifier(
            p=self.pipeline_manager.variables["dataset_runner"]["split_df"]["p"],
            step=self.pipeline_manager.variables["dataset_runner"]["split_df"]["step"],
            save_plots=self.include_plots,
            save_path=self.save_path
        )
        encoding_df = default_pipeline.dataset.split.split_data(
            y_column=self.pipeline_manager.variables["dataset_runner"]["encoding"]["y_column"],
            train_size=self.pipeline_manager.variables["dataset_runner"]["encoding"]["train_size"],
            validation_size=self.pipeline_manager.variables["dataset_runner"]["encoding"]["validation_size"],
            test_size=self.pipeline_manager.variables["dataset_runner"]["encoding"]["test_size"],
            # NOTE(review): hard-coded True here vs self.include_plots above —
            # preserved as-is, but confirm whether this should also honour
            # self.include_plots.
            save_plots=True,
            save_path=self.save_path
        )
        return split_df, encoding_df