Commit 4ad6e725 authored by Charly Lamothe's avatar Charly Lamothe

Merge branch '15-integration-sota' into 'master'

Resolve "integration-sota"

See merge request !20
parents 1db36b5d 462e76fa
* Fix model results loading in compute_results.py.
* Check that the OMP multiclass classifier is working as expected.
* Fix the dataset fetcher error when job_number > 1.
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator
from sklearn.tree import DecisionTreeRegressor
from abc import abstractmethod, ABCMeta
import numpy as np
from tqdm import tqdm


class EnsembleSelectionForestRegressor(BaseEstimator, metaclass=ABCMeta):
    """
    Forward stepwise ensemble selection, following 'Ensemble Selection from
    Libraries of Models' by Rich Caruana et al.
    """

    def __init__(self, models_parameters, library, score_metric=mean_squared_error):
        self._models_parameters = models_parameters
        self._library = library
        self._extracted_forest_size = self._models_parameters.extracted_forest_size
        self._score_metric = score_metric

    @property
    def models_parameters(self):
        return self._models_parameters

    @property
    def library(self):
        return self._library

    def fit(self, X_train, y_train, X_val, y_val):
        # Score every model of the library on the validation set.
        scores_list = list()
        for estimator in self._library:
            val_score = self._score_metric(estimator.predict(X_val), y_val)
            scores_list.append(val_score)

        class_list = list(self._library)
        # The score metric is an error measure (lower is better), so the
        # ensemble is seeded with the model of lowest validation error.
        m = np.argmin(np.asarray(scores_list))
        self._ensemble_selected = [class_list[m]]
        temp_pred = class_list[m].predict(X_val)
        del class_list[m]

        # Greedily add the candidate that most improves the averaged ensemble
        # prediction; each selected model is removed from the candidate pool.
        for k in range(self._extracted_forest_size - 1):
            candidate_index = 0
            best_score = float('inf')
            for j in range(len(class_list)):
                temp_pred = np.vstack((temp_pred, class_list[j].predict(X_val)))
                temp_mean = np.mean(temp_pred, axis=0)
                temp_score = self._score_metric(temp_mean, y_val)
                if temp_score < best_score:
                    candidate_index = j
                    best_score = temp_score
                # Drop the candidate's predictions before trying the next one.
                temp_pred = np.delete(temp_pred, -1, 0)
            self._ensemble_selected.append(class_list[candidate_index])
            temp_pred = np.vstack((temp_pred, class_list[candidate_index].predict(X_val)))
            del class_list[candidate_index]

    def score(self, X, y):
        predictions = self.predict_base_estimator(X)
        return self._score_metric(predictions, y)

    def predict_base_estimator(self, X):
        predictions = list()
        for tree in self._ensemble_selected:
            predictions.append(tree.predict(X))
        mean_predictions = np.mean(np.array(predictions), axis=0)
        return mean_predictions

    @staticmethod
    def generate_library(X_train, y_train, random_state=None):
        criterion_arr = ["mse"]  # , "friedman_mse", "mae"]
        splitter_arr = ["best"]  # , "random"]
        depth_arr = [i for i in range(5, 20, 1)]
        min_samples_split_arr = [i for i in range(2, 20, 1)]
        min_samples_leaf_arr = [i for i in range(2, 20, 1)]
        max_features_arr = ["sqrt"]  # ["auto", "sqrt", "log2"]

        library = list()
        with tqdm(total=len(criterion_arr) * len(splitter_arr) * len(depth_arr) *
                  len(min_samples_split_arr) * len(min_samples_leaf_arr) *
                  len(max_features_arr)) as bar:
            bar.set_description('Generating library')
            for criterion in criterion_arr:
                for splitter in splitter_arr:
                    for depth in depth_arr:
                        for min_samples_split in min_samples_split_arr:
                            for min_samples_leaf in min_samples_leaf_arr:
                                for max_features in max_features_arr:
                                    t = DecisionTreeRegressor(
                                        criterion=criterion, splitter=splitter, max_depth=depth,
                                        min_samples_split=min_samples_split,
                                        min_samples_leaf=min_samples_leaf,
                                        max_features=max_features, random_state=random_state)
                                    t.fit(X_train, y_train)
                                    library.append(t)
                                    bar.update(1)
        return library
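A minimal usage sketch of the class above (illustrative, not part of the commit); it assumes the
scikit-learn version targeted by the project (which still accepts criterion 'mse') and fakes the
models_parameters object that ModelParameters provides elsewhere in the repository:

    from types import SimpleNamespace
    from sklearn.datasets import make_regression
    from sklearn.model_selection import train_test_split

    X, y = make_regression(n_samples=500, n_features=8, random_state=0)
    X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)

    library = EnsembleSelectionForestRegressor.generate_library(X_train, y_train, random_state=0)
    models_parameters = SimpleNamespace(extracted_forest_size=10)  # hypothetical stand-in
    model = EnsembleSelectionForestRegressor(models_parameters, library)
    model.fit(X_train, y_train, X_val, y_val)
    print(model.score(X_val, y_val))  # mean squared error of the selected sub-ensemble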
@@ -3,6 +3,7 @@ from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
from bolsonaro.models.model_parameters import ModelParameters
from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor
from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor
from bolsonaro.data.task import Task
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
@@ -13,7 +14,7 @@ import pickle
class ModelFactory(object):
@staticmethod
-def build(task, model_parameters):
+def build(task, model_parameters, library=None):
if task not in [Task.BINARYCLASSIFICATION, Task.REGRESSION, Task.MULTICLASSIFICATION]:
raise ValueError("Unsupported task '{}'".format(task))
@@ -21,10 +22,10 @@ class ModelFactory(object):
if model_parameters.extraction_strategy == 'omp':
return OmpForestBinaryClassifier(model_parameters)
elif model_parameters.extraction_strategy == 'random':
-return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size,
+return RandomForestClassifier(**model_parameters.hyperparameters,
random_state=model_parameters.seed)
elif model_parameters.extraction_strategy == 'none':
-return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'],
+return RandomForestClassifier(**model_parameters.hyperparameters,
random_state=model_parameters.seed)
else:
raise ValueError('Invalid extraction strategy')
@@ -32,14 +33,16 @@ class ModelFactory(object):
if model_parameters.extraction_strategy == 'omp':
return OmpForestRegressor(model_parameters)
elif model_parameters.extraction_strategy == 'random':
-return RandomForestRegressor(n_estimators=model_parameters.extracted_forest_size,
+return RandomForestRegressor(**model_parameters.hyperparameters,
random_state=model_parameters.seed)
elif model_parameters.extraction_strategy == 'similarity':
return SimilarityForestRegressor(model_parameters)
elif model_parameters.extraction_strategy == 'kmeans':
return KMeansForestRegressor(model_parameters)
elif model_parameters.extraction_strategy == 'ensemble':
return EnsembleSelectionForestRegressor(model_parameters, library=library)
elif model_parameters.extraction_strategy == 'none':
-return RandomForestRegressor(n_estimators=model_parameters.hyperparameters['n_estimators'],
+return RandomForestRegressor(**model_parameters.hyperparameters,
random_state=model_parameters.seed)
else:
raise ValueError('Invalid extraction strategy')
@@ -47,10 +50,10 @@ class ModelFactory(object):
if model_parameters.extraction_strategy == 'omp':
return OmpForestMulticlassClassifier(model_parameters)
elif model_parameters.extraction_strategy == 'random':
-return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size,
+return RandomForestClassifier(**model_parameters.hyperparameters,
random_state=model_parameters.seed)
elif model_parameters.extraction_strategy == 'none':
-return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'],
+return RandomForestClassifier(**model_parameters.hyperparameters,
random_state=model_parameters.seed)
else:
raise ValueError('Invalid extraction strategy')
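A small illustration of the recurring change in this factory (hypothetical hyperparameter values):
the single n_estimators keyword is replaced by unpacking the full hyperparameter dict, so the
'random' and 'none' strategies now build identically configured full-size forests, and the random
pruning down to extracted_forest_size happens afterwards in Trainer.train().

    from sklearn.ensemble import RandomForestRegressor

    hyperparameters = {'n_estimators': 1000, 'max_depth': 20, 'min_samples_leaf': 1}
    rf = RandomForestRegressor(**hyperparameters, random_state=42)
    # equivalent to:
    # RandomForestRegressor(n_estimators=1000, max_depth=20, min_samples_leaf=1, random_state=42)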
@@ -123,7 +123,9 @@ class SingleOmpForest(OmpForest):
forest_predictions = self._base_estimator_predictions(X)
if self._models_parameters.normalize_D:
forest_predictions = forest_predictions.T
forest_predictions /= self._forest_norms
forest_predictions = forest_predictions.T
return self._make_omp_weighted_prediction(forest_predictions, self._omp, self._models_parameters.normalize_weights)
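Broadcasting sketch for the normalize_D blocks added in this file (hypothetical shapes; the real
code keeps the per-tree norms in self._forest_norms): predictions arrive as (n_samples, n_trees)
columns of the dictionary D, so the code transposes to (n_trees, n_samples), divides each row by
its tree's norm, and transposes back.

    import numpy as np

    forest_predictions = np.random.rand(32, 100)  # (n_samples, n_trees)
    forest_norms = np.linalg.norm(forest_predictions, axis=0)[:, np.newaxis]  # (n_trees, 1)
    normalized = (forest_predictions.T / forest_norms).T  # same layout, unit-norm columns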
@@ -134,16 +136,15 @@ class SingleOmpForest(OmpForest):
Make all the base tree predictions
:param X: a Forest
-:return: a np.array of the predictions of the entire forest
+:return: a np.array of the predictions of the trees selected by OMP, without applying the weights
"""
-forest_predictions = self._base_estimator_predictions(X).T
+forest_predictions = np.array([tree.predict(X) for tree in self._base_forest_estimator.estimators_])
if self._models_parameters.normalize_D:
forest_predictions = forest_predictions.T
forest_predictions /= self._forest_norms
forest_predictions = forest_predictions.T
weights = self._omp.coef_
-omp_trees_indices = np.nonzero(weights)[0]
-select_trees = np.mean(forest_predictions[omp_trees_indices], axis=0)
-print(len(omp_trees_indices))
+select_trees = np.mean(forest_predictions[weights != 0], axis=0)
return select_trees
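A standalone sketch of that selection (toy numbers, not part of the commit): trees with a non-zero
OMP coefficient are kept and their unweighted predictions averaged; the boolean mask weights != 0
selects exactly the rows that np.nonzero(weights)[0] would.

    import numpy as np

    weights = np.array([0.0, 0.7, 0.0, -0.2])    # one OMP coefficient per tree
    forest_predictions = np.array([[1.0, 2.0],   # shape (n_trees, n_samples)
                                   [3.0, 4.0],
                                   [5.0, 6.0],
                                   [7.0, 8.0]])
    select_trees = np.mean(forest_predictions[weights != 0], axis=0)  # -> [5., 6.]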
@@ -37,12 +37,12 @@ class OmpForestBinaryClassifier(SingleOmpForest):
forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_])
if self._models_parameters.normalize_D:
forest_predictions = forest_predictions.T
forest_predictions /= self._forest_norms
forest_predictions = forest_predictions.T
weights = self._omp.coef_
-omp_trees_indices = np.nonzero(weights)
-omp_trees_predictions = forest_predictions[omp_trees_indices].T[1]
+omp_trees_predictions = forest_predictions[weights != 0].T[1]
# Here forest_pred is the probability of being class 1.
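Shape sketch for the binary case above (hypothetical sizes): stacking predict_proba over the trees
gives (n_trees, n_samples, n_classes); masking by non-zero weights keeps (n_selected, n_samples, 2),
and .T[1] is then the class-1 probability block of shape (n_samples, n_selected).

    import numpy as np

    forest_predictions = np.random.rand(100, 32, 2)  # 100 trees, 32 samples, 2 classes
    weights = np.zeros(100)
    weights[:10] = 0.5                               # pretend OMP kept 10 trees
    omp_trees_predictions = forest_predictions[weights != 0].T[1]
    print(omp_trees_predictions.shape)               # (32, 10)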
@@ -119,7 +119,9 @@ class OmpForestMulticlassClassifier(OmpForest):
forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_]).T
if self._models_parameters.normalize_D:
forest_predictions = forest_predictions.T
forest_predictions /= self._forest_norms
forest_predictions = forest_predictions.T
label_names = []
preds = []
@@ -2,6 +2,8 @@ from bolsonaro.models.model_raw_results import ModelRawResults
from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor
from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor
from bolsonaro.error_handling.logger_factory import LoggerFactory
from bolsonaro.data.task import Task
from . import LOG_PATH
@@ -72,20 +74,25 @@ class Trainer(object):
else:
raise ValueError("Unknown specified subsets_used parameter '{}'".format(model.models_parameters.subsets_used))
-def train(self, model):
+def train(self, model, extracted_forest_size=None):
"""
:param model: An instance of either RandomForestRegressor, RandomForestClassifier, OmpForestRegressor,
OmpForestBinaryClassifier, OmpForestMulticlassClassifier.
:return:
"""
self._logger.debug('Training model using train set...')
self._begin_time = time.time()
if type(model) in [RandomForestRegressor, RandomForestClassifier]:
-model.fit(
-    X=self._X_forest,
-    y=self._y_forest
-)
+if extracted_forest_size is not None:
+    # 'random' strategy: the forest is assumed already fitted by the caller;
+    # keep a random subset of extracted_forest_size of its trees.
+    estimators_index = np.arange(len(model.estimators_))
+    np.random.shuffle(estimators_index)
+    chosen_estimators = estimators_index[:extracted_forest_size]
+    model.estimators_ = np.array(model.estimators_)[chosen_estimators]
+else:
+    model.fit(
+        X=self._X_forest,
+        y=self._y_forest
+    )
else:
model.fit(
self._X_forest,
@@ -96,7 +103,7 @@ class Trainer(object):
self._end_time = time.time()
def __score_func(self, model, X, y_true, weights=True):
-if type(model) in [OmpForestRegressor, RandomForestRegressor, SimilarityForestRegressor]:
+if type(model) in [OmpForestRegressor, RandomForestRegressor]:
if weights:
y_pred = model.predict(X)
else:
@@ -109,12 +116,14 @@ class Trainer(object):
y_pred = model.predict_no_weights(X)
if type(model) is OmpForestBinaryClassifier:
y_pred = np.sign(y_pred)
-y_pred = np.where(y_pred==0, 1, y_pred)
+y_pred = np.where(y_pred == 0, 1, y_pred)
result = self._classification_score_metric(y_true, y_pred)
elif type(model) in [SimilarityForestRegressor, KMeansForestRegressor, EnsembleSelectionForestRegressor]:
result = model.score(X, y_true)
return result
def __score_func_base(self, model, X, y_true):
-if type(model) == OmpForestRegressor:
+if type(model) in [OmpForestRegressor, SimilarityForestRegressor, KMeansForestRegressor, EnsembleSelectionForestRegressor]:
y_pred = model.predict_base_estimator(X)
result = self._base_regression_score_metric(y_true, y_pred)
elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier]:
@@ -123,7 +132,7 @@ class Trainer(object):
elif type(model) == RandomForestClassifier:
y_pred = model.predict(X)
result = self._base_classification_score_metric(y_true, y_pred)
-elif type(model) in [RandomForestRegressor, SimilarityForestRegressor]:
+elif type(model) is RandomForestRegressor:
y_pred = model.predict(X)
result = self._base_regression_score_metric(y_true, y_pred)
return result
@@ -133,10 +133,11 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--stage', nargs='?', type=int, required=True, help='Specify the stage number among [1, 5].')
-parser.add_argument('--experiment_ids', nargs='+', type=int, required=True, help='Compute the results of the specified experiment id(s).' + \
+parser.add_argument('--experiment_ids', nargs='+', type=str, required=True, help='Compute the results of the specified experiment id(s).' + \
'stage=1: {{base_with_params}} {{random_with_params}} {{omp_with_params}} {{base_wo_params}} {{random_wo_params}} {{omp_wo_params}}' + \
'stage=2: {{no_normalization}} {{normalize_D}} {{normalize_weights}} {{normalize_D_and_weights}}' + \
-'stage=3: {{train-dev_subset}} {{train-dev_train-dev_subset}} {{train-train-dev_subset}}')
+'stage=3: {{train-dev_subset}} {{train-dev_train-dev_subset}} {{train-train-dev_subset}}' + \
+'stage=5: {{base_with_params}} {{random_with_params}} {{omp_with_params}} [ensemble={{id}}] [similarity={{id}}] [kmean={{id}}]')
parser.add_argument('--dataset_name', nargs='?', type=str, required=True, help='Specify the dataset name. TODO: read it from models dir directly.')
parser.add_argument('--results_dir', nargs='?', type=str, default=DEFAULT_RESULTS_DIR, help='The output directory of the results.')
parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
@@ -159,7 +160,7 @@ if __name__ == "__main__":
raise ValueError('In the case of stage 1, the number of specified experiment ids must be 6.')
# Retrieve the number of extracted forest sizes used, in order to have a base forest axis as long as necessary
-extracted_forest_sizes_number = retreive_extracted_forest_sizes_number(args.models_dir, args.experiment_ids[1])
+extracted_forest_sizes_number = retreive_extracted_forest_sizes_number(args.models_dir, int(args.experiment_ids[1]))
# Experiments that used the best hyperparameters found for this dataset
@@ -167,18 +168,18 @@ if __name__ == "__main__":
logger.info('Loading base_with_params experiment scores...')
base_with_params_train_scores, base_with_params_dev_scores, base_with_params_test_scores, \
base_with_params_experiment_score_metric = \
-extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, args.experiment_ids[0],
+extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, int(args.experiment_ids[0]),
extracted_forest_sizes_number)
# random_with_params
logger.info('Loading random_with_params experiment scores...')
random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores, \
with_params_extracted_forest_sizes, random_with_params_experiment_score_metric = \
-extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[1])
+extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, int(args.experiment_ids[1]))
# omp_with_params
logger.info('Loading omp_with_params experiment scores...')
omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \
omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
-args.models_dir, args.results_dir, args.experiment_ids[2])
+args.models_dir, args.results_dir, int(args.experiment_ids[2]))
# Experiments that didn't use the best hyperparameters found for this dataset
@@ -186,19 +187,19 @@ if __name__ == "__main__":
logger.info('Loading base_wo_params experiment scores...')
base_wo_params_train_scores, base_wo_params_dev_scores, base_wo_params_test_scores, \
base_wo_params_experiment_score_metric = extract_scores_across_seeds_and_forest_size(
-args.models_dir, args.results_dir, args.experiment_ids[3],
+args.models_dir, args.results_dir, int(args.experiment_ids[3]),
extracted_forest_sizes_number)
# random_wo_params
logger.info('Loading random_wo_params experiment scores...')
random_wo_params_train_scores, random_wo_params_dev_scores, random_wo_params_test_scores, \
wo_params_extracted_forest_sizes, random_wo_params_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(
-args.models_dir, args.results_dir, args.experiment_ids[4])
+args.models_dir, args.results_dir, int(args.experiment_ids[4]))
# omp_wo_params
logger.info('Loading omp_wo_params experiment scores...')
omp_wo_params_train_scores, omp_wo_params_dev_scores, omp_wo_params_test_scores, _, \
omp_wo_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
-args.models_dir, args.results_dir, args.experiment_ids[5])
+args.models_dir, args.results_dir, int(args.experiment_ids[5]))
# Sanity check on the metrics retrieved
if not (base_with_params_experiment_score_metric == random_with_params_experiment_score_metric ==
@@ -243,25 +244,25 @@ if __name__ == "__main__":
logger.info('Loading no_normalization experiment scores...')
_, _, no_normalization_test_scores, extracted_forest_sizes, no_normalization_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
-args.experiment_ids[0])
+int(args.experiment_ids[0]))
# normalize_D
logger.info('Loading normalize_D experiment scores...')
_, _, normalize_D_test_scores, _, normalize_D_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
-args.experiment_ids[1])
+int(args.experiment_ids[1]))
# normalize_weights
logger.info('Loading normalize_weights experiment scores...')
_, _, normalize_weights_test_scores, _, normalize_weights_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
-args.experiment_ids[2])
+int(args.experiment_ids[2]))
# normalize_D_and_weights
logger.info('Loading normalize_D_and_weights experiment scores...')
_, _, normalize_D_and_weights_test_scores, _, normalize_D_and_weights_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
-args.experiment_ids[3])
+int(args.experiment_ids[3]))
# Sanity check on the metrics retrieved
if not (no_normalization_experiment_score_metric == normalize_D_experiment_score_metric
@@ -290,21 +291,21 @@ if __name__ == "__main__":
train_dev_subset_train_scores, train_dev_subset_dev_scores, train_dev_subset_test_scores, \
extracted_forest_sizes, train_dev_subset_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
-args.experiment_ids[0])
+int(args.experiment_ids[0]))
# train-dev_train-dev_subset
logger.info('Loading train-dev_train-dev_subset experiment scores...')
train_dev_train_dev_subset_train_scores, train_dev_train_dev_subset_dev_scores, train_dev_train_dev_subset_test_scores, \
_, train_dev_train_dev_subset_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
-args.experiment_ids[1])
+int(args.experiment_ids[1]))
# train-train-dev_subset
logger.info('Loading train-train-dev_subset experiment scores...')
train_train_dev_subset_train_scores, train_train_dev_subset_dev_scores, train_train_dev_subset_test_scores, \
_, train_train_dev_subset_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
-args.experiment_ids[2])
+int(args.experiment_ids[2]))
# Sanity check on the metrics retrieved
if not (train_dev_subset_experiment_score_metric == train_dev_train_dev_subset_experiment_score_metric
@@ -349,13 +350,13 @@ if __name__ == "__main__":
logger.info('Loading base_with_params experiment scores...')
base_with_params_train_scores, base_with_params_dev_scores, base_with_params_test_scores, \
base_with_params_experiment_score_metric = \
-extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, args.experiment_ids[0],
+extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, int(args.experiment_ids[0]),
extracted_forest_sizes_number)
# random_with_params
logger.info('Loading random_with_params experiment scores...')
random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores, \
with_params_extracted_forest_sizes, random_with_params_experiment_score_metric = \
-extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[1])
+extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, int(args.experiment_ids[1]))
# omp_with_params
logger.info('Loading omp_with_params experiment scores...')
"""omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \
@@ -363,12 +364,12 @@ if __name__ == "__main__":
args.models_dir, args.results_dir, args.experiment_ids[2])"""
omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \
omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
-args.models_dir, args.results_dir, args.experiment_ids[2])
+args.models_dir, args.results_dir, int(args.experiment_ids[2]))
#omp_with_params_without_weights
-logger.info('Loading omp_with_params experiment scores...')
+logger.info('Loading omp_no_weights experiment scores...')
omp_with_params_without_weights_train_scores, omp_with_params_without_weights_dev_scores, omp_with_params_without_weights_test_scores, _, \
omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
-args.models_dir, args.results_dir, args.experiment_ids[2], weights=False)
+args.models_dir, args.results_dir, int(args.experiment_ids[2]), weights=False)
"""# base_with_params
logger.info('Loading base_with_params experiment scores 2...')
@@ -388,7 +389,7 @@ if __name__ == "__main__":
raise ValueError('Score metrics of all experiments must be the same.')
experiments_score_metric = base_with_params_experiment_score_metric
-output_path = os.path.join(args.results_dir, args.dataset_name, 'stage4_fix')
+output_path = os.path.join(args.results_dir, args.dataset_name, 'stage4')
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
Plotter.plot_stage2_losses(
@@ -402,47 +403,63 @@ if __name__ == "__main__":
title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name))
elif args.stage == 5:
# Retrieve the number of extracted forest sizes used, in order to have a base forest axis as long as necessary
-extracted_forest_sizes_number = retreive_extracted_forest_sizes_number(args.models_dir, args.experiment_ids[1])
+extracted_forest_sizes_number = retreive_extracted_forest_sizes_number(args.models_dir, int(args.experiment_ids[1]))
all_labels = list()
all_scores = list()
# base_with_params
logger.info('Loading base_with_params experiment scores...')
base_with_params_train_scores, base_with_params_dev_scores, base_with_params_test_scores, \
base_with_params_experiment_score_metric = \
-extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, args.experiment_ids[0],
+extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, int(args.experiment_ids[0]),
extracted_forest_sizes_number)
# random_with_params
logger.info('Loading random_with_params experiment scores...')
random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores, \
with_params_extracted_forest_sizes, random_with_params_experiment_score_metric = \
-extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[1])
+extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, int(args.experiment_ids[1]))
# omp_with_params
logger.info('Loading omp_with_params experiment scores...')
omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \
omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
-args.models_dir, args.results_dir, args.experiment_ids[2])
-# omp_with_params
-logger.info('Loading kmeans_with_params experiment scores...')
-kmeans_with_params_train_scores, kmeans_with_params_dev_scores, kmeans_with_params_test_scores, _, \
-kmeans_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
-args.models_dir, args.results_dir, args.experiment_ids[3])
-# Sanity check on the metrics retreived
-if not (base_with_params_experiment_score_metric == random_with_params_experiment_score_metric
-== omp_with_params_experiment_score_metric == kmeans_with_params_experiment_score_metric):
-raise ValueError('Score metrics of all experiments must be the same.')
-experiments_score_metric = base_with_params_experiment_score_metric
+args.models_dir, args.results_dir, int(args.experiment_ids[2]))
#omp_with_params_without_weights
logger.info('Loading omp_no_weights experiment scores...')
omp_with_params_without_weights_train_scores, omp_with_params_without_weights_dev_scores, omp_with_params_without_weights_test_scores, _, \
omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
args.models_dir, args.results_dir, int(args.experiment_ids[2]), weights=False)
all_labels = ['base', 'random', 'omp', 'omp_without_weights']
all_scores = [base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores,
omp_with_params_without_weights_test_scores]
for i in range(3, len(args.experiment_ids)):
if 'kmeans' in args.experiment_ids[i]:
label = 'kmeans'
elif 'similarity' in args.experiment_ids[i]:
label = 'similarity'
elif 'ensemble' in args.experiment_ids[i]:
label = 'ensemble'
else:
logger.error('Invalid value encountered')
continue
-output_path = os.path.join(args.results_dir, args.dataset_name, 'stage5_kmeans')
logger.info(f'Loading {label} experiment scores...')
_, _, current_test_scores, _, _ = extract_scores_across_seeds_and_extracted_forest_sizes(
args.models_dir, args.results_dir, int(args.experiment_ids[i].split('=')[1]))
all_labels.append(label)
all_scores.append(current_test_scores)
+output_path = os.path.join(args.results_dir, args.dataset_name, 'stage5')
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
Plotter.plot_stage2_losses(
-file_path=output_path + os.sep + 'losses.png',
-all_experiment_scores=[base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores,
-kmeans_with_params_test_scores],
-all_labels=['base', 'random', 'omp', 'kmeans'],
+file_path=output_path + os.sep + f"losses_{'-'.join(all_labels)}.png",
+all_experiment_scores=all_scores,
+all_labels=all_labels,
x_value=with_params_extracted_forest_sizes,
xlabel='Number of trees extracted',
-ylabel=experiments_score_metric,
+ylabel=base_with_params_experiment_score_metric,
title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name))
else:
raise ValueError('This stage number is not supported yet, but it will be!')
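A quick illustration of the stage-5 id convention handled above (hypothetical ids): the first
three experiment ids are plain integers, while additional methods are passed as 'name=id' pairs
and recovered with split('='), which is why --experiment_ids switched from type=int to type=str.

    experiment_ids = ['1', '2', '3', 'ensemble=7', 'similarity=8']
    for arg in experiment_ids[3:]:
        label, experiment_id = arg.split('=')  # e.g. ('ensemble', '7')
        print(label, int(experiment_id))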
@@ -2,6 +2,8 @@ from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.dataset_loader import DatasetLoader
from bolsonaro.models.model_factory import ModelFactory
from bolsonaro.models.model_parameters import ModelParameters
from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor
from bolsonaro.trainer import Trainer
from bolsonaro.utils import resolve_experiment_id, tqdm_joblib
from bolsonaro import LOG_PATH
@@ -9,6 +10,7 @@ from bolsonaro.error_handling.logger_factory import LoggerFactory
from dotenv import find_dotenv, load_dotenv
import argparse
import copy
import json
import pathlib
import random
@@ -53,10 +55,37 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb
trainer = Trainer(dataset)
if parameters['extraction_strategy'] == 'ensemble':
library = EnsembleSelectionForestRegressor.generate_library(dataset.X_train, dataset.y_train, random_state=seed)
else:
library = None
if parameters['extraction_strategy'] == 'random':
pretrained_model_parameters = ModelParameters(
extracted_forest_size=parameters['forest_size'],
normalize_D=parameters['normalize_D'],
subsets_used=parameters['subsets_used'],
normalize_weights=parameters['normalize_weights'],
seed=seed,
hyperparameters=hyperparameters,
extraction_strategy=parameters['extraction_strategy']
)
pretrained_estimator = ModelFactory.build(dataset.task, pretrained_model_parameters, library=library)
pretrained_trainer = Trainer(dataset)
pretrained_trainer.init(pretrained_estimator, subsets_used=parameters['subsets_used'])
pretrained_estimator.fit(
X=pretrained_trainer._X_forest,
y=pretrained_trainer._y_forest
)
else:
pretrained_estimator = None
pretrained_model_parameters = None
if parameters['extraction_strategy'] != 'none':
with tqdm_joblib(tqdm(total=len(parameters['extracted_forest_size']), disable=not verbose)) as extracted_forest_size_job_pb:
Parallel(n_jobs=-1)(delayed(extracted_forest_size_job)(extracted_forest_size_job_pb, parameters['extracted_forest_size'][i],
models_dir, seed, parameters, dataset, hyperparameters, experiment_id, trainer)