"""Source code for library.phases.runners.modelling.modelling_runner."""
import numpy as np
# Scikit-learn models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import StackingClassifier
from sklearn.utils.class_weight import compute_class_weight
# Self-developed models
from library.utils.ownModels.majorityClassModel import MajorityClassClassifier
from library.utils.ownModels.neuralNets.feedForward import FeedForwardNeuralNetwork
from library.utils.phase_runner_definition.phase_runner import PhaseRunner
from library.pipeline.pipeline_manager import PipelineManager
from library.phases.runners.modelling.utils.states.modelling_runner_states_pre import PreTuningRunner
from library.phases.runners.modelling.utils.states.modelling_runner_states_in import InTuningRunner
from library.phases.runners.modelling.utils.states.modelling_runner_states_post import PostTuningRunner
class ModellingRunner(PhaseRunner):
    """Runner for the modelling phase of the pipeline.

    Registers the candidate models on each diverged pipeline (with class
    weights taken from the config or computed as balanced weights), applies
    the configured per-family model exclusions, then drives the pre-tuning,
    in-tuning and post-tuning stages. Optionally serializes the resulting
    models/pipelines according to the config.
    """

    def __init__(self, pipeline_manager: PipelineManager, include_plots: bool = False, save_path: str = "", serialize_results: bool = False) -> None:
        """Store the pipeline manager, plot/save options and serialization flag.

        Args:
            pipeline_manager: Shared manager holding all pipelines and config variables.
            include_plots: Whether the tuning stages should save their plots.
            save_path: Base path for saved artifacts (plots, serialized models).
            serialize_results: Whether to serialize models/pipelines after post-tuning.
        """
        super().__init__(pipeline_manager, include_plots, save_path)
        self.serialize_results = serialize_results
        self.variables = pipeline_manager.variables

    def _model_initializers(self) -> None:
        """Register every candidate model on its dedicated pipeline.

        We diverge all pipelines first (assuming it has not been done before,
        delete if it has @Juan or @Fede or @Cate). Then add to each independent
        pipeline the given models. Finally we call the function that excludes
        all the models that we do not want the training to run (either because
        we are trying to debug and want to run as fast as possible or because
        we have observed that a certain model is not performing well and taking
        too long to fit/predict).
        """
        nn_pipeline = self.pipeline_manager.pipelines["not_baseline"]["feed_forward_neural_network"]

        # Check for predefined weights in config file
        cw = self.variables["modelling_runner"]["class_weights"]
        y = nn_pipeline.dataset.y_train.values
        if cw is not None:
            print("----\nSpecific weights have been provided! Will use them to fit the models\n----")
            class_weight_dict = {int(k): v for k, v in cw.items()}
            # FIX: GaussianNB aligns `priors` with its (sorted) `classes_`
            # attribute at fit time, so the prior vector must follow sorted
            # class labels rather than the config dict's insertion order.
            classes = np.array(sorted(class_weight_dict), dtype=y.dtype)
            priors = np.array([class_weight_dict[int(c)] for c in classes], dtype=float)
            priors = priors / priors.sum()  # priors must sum to 1
        else:
            # Compute weights with sklearn function (balanced)
            print("----\nNo specific weights have been provided. Will create balanced weights automatically.\n----")
            classes = np.unique(y)  # np.unique returns sorted labels
            weights_array = compute_class_weight('balanced', classes=classes, y=y)
            class_weight_dict = dict(zip(classes, weights_array))  # to dict for scikit and keras
            priors = weights_array / weights_array.sum()  # For Naive Bayes (must add to 1)

        not_baseline = self.pipeline_manager.pipelines["not_baseline"]

        # Ensembled models
        not_baseline["ensembled"].modelling.add_model("Gradient Boosting",
                                                      GradientBoostingClassifier())
        not_baseline["ensembled"].modelling.add_model("Random Forest",
                                                      RandomForestClassifier(
                                                          class_weight=class_weight_dict
                                                      ))
        # Tree-based models
        not_baseline["tree_based"].modelling.add_model("Decision Tree",
                                                       DecisionTreeClassifier(
                                                           class_weight=class_weight_dict
                                                       ))
        # Support Vector Machines models
        not_baseline["support_vector_machine"].modelling.add_model("Non-linear Support Vector Machine",
                                                                   SVC())
        not_baseline["support_vector_machine"].modelling.add_model("Linear Support Vector Machine",
                                                                   LinearSVC())
        # Naive Bayes model
        not_baseline["naive_bayes"].modelling.add_model("Naive Bayes",
                                                        GaussianNB(
                                                            priors=priors
                                                        ))
        # Neural Network model — hoist the repeated config lookups once.
        nn_arch = self.pipeline_manager.variables["modelling_runner"]["neural_network"]["initial_architecture"]
        not_baseline["feed_forward_neural_network"].modelling.add_model("Feed Forward Neural Network",
                                                                        FeedForwardNeuralNetwork(
                                                                            num_features=nn_pipeline.dataset.X_train.shape[1],
                                                                            num_classes=nn_pipeline.dataset.y_train.value_counts().shape[0],
                                                                            batch_size=nn_arch["batch_size"],
                                                                            epochs=nn_arch["epochs"],
                                                                            n_layers=nn_arch["n_layers"],
                                                                            units_per_layer=nn_arch["units_per_layer"],
                                                                            learning_rate=nn_arch["learning_rate"],
                                                                            activations=nn_arch["activations"],
                                                                            kernel_initializer=nn_arch["kernel_initializer"],
                                                                            class_weights=class_weight_dict
                                                                        ),
                                                                        model_type="neural_network")
        # Baseline models
        baselines = self.pipeline_manager.pipelines["baseline"]["baselines"]
        baselines.modelling.add_model("Logistic Regression (baseline)",
                                      LogisticRegression(max_iter=1000))
        baselines.modelling.add_model("Majority Class (baseline)",
                                      MajorityClassClassifier())

        self._exclude_models()

    def _exclude_models(self) -> None:
        """Propagate the configured per-family model exclusions to each pipeline."""
        exclusions = self.pipeline_manager.variables["modelling_runner"]["models_to_exclude"]
        # Non-baseline families share the same key name in config and pipelines.
        for family in ("ensembled", "tree_based", "support_vector_machine",
                       "naive_bayes", "feed_forward_neural_network"):
            self.pipeline_manager.pipelines["not_baseline"][family].modelling.models_to_exclude = exclusions[family]
        # Baseline models live under a separate top-level key.
        self.pipeline_manager.pipelines["baseline"]["baselines"].modelling.models_to_exclude = exclusions["baselines"]

    def run(self) -> dict:
        """Run the whole modelling phase: pre-tuning, in-tuning and post-tuning.

        Returns:
            dict with keys ``pre_tuning_runner``, ``in_tuning_runner`` and
            ``post_tuning_runner`` mapping to each stage's results.
        """
        self._model_initializers()

        pre_tuning_runner = PreTuningRunner(self.pipeline_manager,
                                            save_plots=self.include_plots,
                                            save_path=self.save_path)
        pre_results = pre_tuning_runner.run()

        print("-" * 30)
        print("STARTING IN TUNING")
        print("-" * 30)
        in_tuning_runner = InTuningRunner(self.pipeline_manager,
                                          save_plots=self.include_plots,
                                          save_path=self.save_path)
        in_results = in_tuning_runner.run()

        print("-" * 30)
        print("STARTING POST TUNING")
        print("-" * 30)
        post_tuning_runner = PostTuningRunner(self.pipeline_manager,
                                              save_plots=self.include_plots,
                                              save_path=self.save_path)
        post_results = post_tuning_runner.run()

        if self.serialize_results:
            serialize_cfg = self.pipeline_manager.variables["modelling_runner"]["serialize_models"]
            if serialize_cfg["serialize_best_performing_model"]:
                self.pipeline_manager.serialize_models(models_to_serialize=self.pipeline_manager.best_performing_model["modelName"])
            self.pipeline_manager.serialize_models(models_to_serialize=serialize_cfg["models_to_serialize"])
            self.pipeline_manager.serialize_pipelines(pipelines_to_serialize=serialize_cfg["pipelines_to_serialize"])

        # FIX: the in-tuning and post-tuning results were previously computed
        # but discarded (the dict returned None for both keys).
        return {"pre_tuning_runner": pre_results,
                "in_tuning_runner": in_results,
                "post_tuning_runner": post_results}