Source code for library.phases.runners.dataset_runner

import numpy as np

from library.utils.phase_runner_definition.phase_runner import PhaseRunner
from library.pipeline.pipeline_manager import PipelineManager


[docs]
class DatasetRunner(PhaseRunner):
      def __init__(self, pipeline_manager: PipelineManager, include_plots: bool = False, save_path: str = "") -> None:
            """Create the dataset-phase runner.

            All three arguments are forwarded unchanged, positionally, to
            ``PhaseRunner.__init__``; this subclass stores nothing itself here.

            Args:
                  pipeline_manager: Manager whose ``pipelines`` and ``variables``
                        mappings are read by the methods of this class.
                  include_plots: Forwarded to the base class. Presumably stored as
                        ``self.include_plots`` (which ``run`` reads) - confirm in
                        ``PhaseRunner``.
                  save_path: Forwarded to the base class. Presumably stored as
                        ``self.save_path`` (which ``run`` reads) - confirm in
                        ``PhaseRunner``.
            """
            super().__init__(pipeline_manager, include_plots, save_path)

            

            
      def _ensembled_pipeline_feature_engineering(self) -> None:
            """Feature-engineering hook for the "ensembled" pipeline (placeholder).

            Looks up the pipeline stored under
            ``pipeline_manager.pipelines["not_baseline"]["ensembled"]`` and fetches
            its ``dataset.df``. No transformation is applied yet.

            Example of a step that could be added here (log-transform one column):

                  df["Name_of_some_column"] = np.log(df["Name_of_some_column"])
            """
            # FEDE (expand here)
            not_baseline = self.pipeline_manager.pipelines["not_baseline"]
            # Fetched up front so that the transformations added later can use it.
            df = not_baseline["ensembled"].dataset.df
      
      def _run_feature_engineering_pre_split(self) -> None:
            """Run the feature-engineering steps that happen before the data split.

            At the moment this only calls ``_ensembled_pipeline_feature_engineering``.

            Notes for Fede:
                  - Each pipeline has its own function (just to make it easier to read).
                  - ``_ensembled_pipeline_feature_engineering`` contains an example of
                    applying a log transformation to a specific column.
                  - Currently all pipelines are the same, because divergence is done
                    later, in modelling (Juan will probably move it up to data
                    preprocessing). To do something pipeline-specific here, the
                    divergence has to be created first. The original note refers to
                    a fully commented-out function further down that shows how to
                    diverge all pipelines.
                    NOTE(review): that example function is not present in the
                    visible source - confirm it still exists.
            """
            # FEDE (expand here)
            # pre split
            self._ensembled_pipeline_feature_engineering()
      



[docs]
      def run(self) -> tuple:
            """Run the dataset phase on the first "not_baseline" pipeline.

            Steps, in order:
                  1. Print the ``pipeline_manager.pipelines`` mapping.
                  2. Pick the first value of ``pipelines["not_baseline"]``.
                  3. Run the pre-split feature engineering.
                  4. Call ``dataset.split.asses_split_classifier`` with the ``p`` and
                     ``step`` settings from ``variables["dataset_runner"]["split_df"]``.
                  5. Call ``dataset.split.split_data`` with the ``y_column``,
                     ``train_size``, ``validation_size`` and ``test_size`` settings
                     from ``variables["dataset_runner"]["encoding"]``.

            Returns:
                  A 2-tuple ``(split_df, encoding_df)`` holding whatever
                  ``asses_split_classifier`` and ``split_data`` returned, in that
                  order.

            Raises:
                  KeyError: If a required key is missing from ``pipelines`` or
                        ``variables``.
                  IndexError: If ``pipelines["not_baseline"]`` is empty.
            """
            # NOTE(review): looks like leftover debug output; kept so that stdout
            # is unchanged - consider switching to logging.
            print(self.pipeline_manager.pipelines)

            # Select the first pipeline.
            candidates = list(self.pipeline_manager.pipelines["not_baseline"].values())
            default_pipeline = candidates[0]

            # Returns None, so there is no result to keep.
            self._run_feature_engineering_pre_split()

            runner_cfg = self.pipeline_manager.variables["dataset_runner"]

            split_cfg = runner_cfg["split_df"]
            split_df = default_pipeline.dataset.split.asses_split_classifier(
                  p=split_cfg["p"],
                  step=split_cfg["step"],
                  save_plots=self.include_plots,
                  save_path=self.save_path,
            )

            # Looked up only after the first call, as in the original ordering.
            encoding_cfg = runner_cfg["encoding"]
            # NOTE(review): save_plots is hard-coded to True here, whereas the call
            # above honours self.include_plots - confirm that this is intentional.
            encoding_df = default_pipeline.dataset.split.split_data(
                  y_column=encoding_cfg["y_column"],
                  train_size=encoding_cfg["train_size"],
                  validation_size=encoding_cfg["validation_size"],
                  test_size=encoding_cfg["test_size"],
                  save_plots=True,
                  save_path=self.save_path,
            )
            return split_df, encoding_df
Source code for library.phases.runners.dataset_runner

Efficient Malware Classifier

Navigation

Related Topics