Commit f077de3c authored by Charly Lamothe's avatar Charly Lamothe
Browse files

Merge branch 'master' into 19-add-some-tests

parents 5cf23c9b c04845a9
...@@ -16,6 +16,7 @@ class EnsembleSelectionForestRegressor(BaseEstimator, metaclass=ABCMeta): ...@@ -16,6 +16,7 @@ class EnsembleSelectionForestRegressor(BaseEstimator, metaclass=ABCMeta):
self._library = library self._library = library
self._extracted_forest_size = self._models_parameters.extracted_forest_size self._extracted_forest_size = self._models_parameters.extracted_forest_size
self._score_metric = score_metric self._score_metric = score_metric
self._selected_trees = list()
@property @property
def models_parameters(self): def models_parameters(self):
...@@ -25,6 +26,10 @@ class EnsembleSelectionForestRegressor(BaseEstimator, metaclass=ABCMeta): ...@@ -25,6 +26,10 @@ class EnsembleSelectionForestRegressor(BaseEstimator, metaclass=ABCMeta):
def library(self): def library(self):
return self._library return self._library
@property
def selected_trees(self):
return self._selected_trees
def fit(self, X_train, y_train, X_val, y_val): def fit(self, X_train, y_train, X_val, y_val):
scores_list = list() scores_list = list()
for estimator in self._library: for estimator in self._library:
...@@ -33,7 +38,7 @@ class EnsembleSelectionForestRegressor(BaseEstimator, metaclass=ABCMeta): ...@@ -33,7 +38,7 @@ class EnsembleSelectionForestRegressor(BaseEstimator, metaclass=ABCMeta):
class_list = list(self._library) class_list = list(self._library)
m = np.argmax(np.asarray(scores_list)) m = np.argmax(np.asarray(scores_list))
self._ensemble_selected = [class_list[m]] self._selected_trees = [class_list[m]]
temp_pred = class_list[m].predict(X_val) temp_pred = class_list[m].predict(X_val)
del class_list[m] del class_list[m]
for k in range(self._extracted_forest_size - 1): for k in range(self._extracted_forest_size - 1):
...@@ -47,17 +52,17 @@ class EnsembleSelectionForestRegressor(BaseEstimator, metaclass=ABCMeta): ...@@ -47,17 +52,17 @@ class EnsembleSelectionForestRegressor(BaseEstimator, metaclass=ABCMeta):
candidate_index = j candidate_index = j
best_score = temp_score best_score = temp_score
temp_pred = np.delete(temp_pred, -1, 0) temp_pred = np.delete(temp_pred, -1, 0)
self._ensemble_selected.append(class_list[candidate_index]) self._selected_trees.append(class_list[candidate_index])
temp_pred = np.vstack((temp_pred, class_list[candidate_index].predict(X_val))) temp_pred = np.vstack((temp_pred, class_list[candidate_index].predict(X_val)))
del class_list[candidate_index] del class_list[candidate_index]
def score(self, X, y): def score(self, X, y):
predictions = self._predict_base_estimator(X) predictions = self.predict_base_estimator(X)
return self._score_metric(predictions, y) return self._score_metric(predictions, y)
def predict_base_estimator(self, X): def predict_base_estimator(self, X):
predictions = list() predictions = list()
for tree in self._ensemble_selected: for tree in self._selected_trees:
predictions.append(tree.predict(X)) predictions.append(tree.predict(X))
mean_predictions = np.mean(np.array(predictions), axis=0) mean_predictions = np.mean(np.array(predictions), axis=0)
return mean_predictions return mean_predictions
......
...@@ -22,11 +22,16 @@ class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta): ...@@ -22,11 +22,16 @@ class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
random_state=self._models_parameters.seed, n_jobs=-1) random_state=self._models_parameters.seed, n_jobs=-1)
self._extracted_forest_size = self._models_parameters.extracted_forest_size self._extracted_forest_size = self._models_parameters.extracted_forest_size
self._score_metric = score_metric self._score_metric = score_metric
self._selected_trees = list()
@property @property
def models_parameters(self): def models_parameters(self):
return self._models_parameters return self._models_parameters
@property
def selected_trees(self):
return self._selected_trees
def fit(self, X_train, y_train, X_val, y_val): def fit(self, X_train, y_train, X_val, y_val):
self._estimator.fit(X_train, y_train) self._estimator.fit(X_train, y_train)
...@@ -45,6 +50,7 @@ class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta): ...@@ -45,6 +50,7 @@ class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
extracted_forest_sizes[i], labels, X_val, y_val, self._score_metric) extracted_forest_sizes[i], labels, X_val, y_val, self._score_metric)
for i in range(self._extracted_forest_size)) for i in range(self._extracted_forest_size))
self._selected_trees = pruned_forest
self._estimator.estimators_ = pruned_forest self._estimator.estimators_ = pruned_forest
def _prune_forest_job(self, prune_forest_job_pb, c, labels, X_val, y_val, score_metric): def _prune_forest_job(self, prune_forest_job_pb, c, labels, X_val, y_val, score_metric):
......
from bolsonaro import LOG_PATH from bolsonaro import LOG_PATH
from bolsonaro.error_handling.logger_factory import LoggerFactory from bolsonaro.error_handling.logger_factory import LoggerFactory
from bolsonaro.utils import omp_premature_warning
from abc import abstractmethod, ABCMeta from abc import abstractmethod, ABCMeta
import numpy as np import numpy as np
from sklearn.linear_model import OrthogonalMatchingPursuit from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn.base import BaseEstimator from sklearn.base import BaseEstimator
import warnings
class OmpForest(BaseEstimator, metaclass=ABCMeta): class OmpForest(BaseEstimator, metaclass=ABCMeta):
...@@ -13,6 +15,7 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta): ...@@ -13,6 +15,7 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta):
self._base_forest_estimator = base_forest_estimator self._base_forest_estimator = base_forest_estimator
self._models_parameters = models_parameters self._models_parameters = models_parameters
self._logger = LoggerFactory.create(LOG_PATH, __name__) self._logger = LoggerFactory.create(LOG_PATH, __name__)
self._selected_trees = list()
@property @property
def models_parameters(self): def models_parameters(self):
...@@ -109,7 +112,17 @@ class SingleOmpForest(OmpForest): ...@@ -109,7 +112,17 @@ class SingleOmpForest(OmpForest):
super().__init__(models_parameters, base_forest_estimator) super().__init__(models_parameters, base_forest_estimator)
def fit_omp(self, atoms, objective): def fit_omp(self, atoms, objective):
self._omp.fit(atoms, objective) with warnings.catch_warnings(record=True) as caught_warnings:
# Cause all warnings to always be triggered.
warnings.simplefilter("always")
self._omp.fit(atoms, objective)
# ignore any non-custom warnings that may be in the list
caught_warnings = list(filter(lambda i: i.message != RuntimeWarning(omp_premature_warning), caught_warnings))
if len(caught_warnings) > 0:
logger.error(f'number of linear dependences in the dictionary: {len(caught_warnings)}. model parameters: {str(self._models_parameters.__dict__)}')
def predict(self, X): def predict(self, X):
""" """
...@@ -123,9 +136,7 @@ class SingleOmpForest(OmpForest): ...@@ -123,9 +136,7 @@ class SingleOmpForest(OmpForest):
forest_predictions = self._base_estimator_predictions(X) forest_predictions = self._base_estimator_predictions(X)
if self._models_parameters.normalize_D: if self._models_parameters.normalize_D:
forest_predictions = forest_predictions.T
forest_predictions /= self._forest_norms forest_predictions /= self._forest_norms
forest_predictions = forest_predictions.T
return self._make_omp_weighted_prediction(forest_predictions, self._omp, self._models_parameters.normalize_weights) return self._make_omp_weighted_prediction(forest_predictions, self._omp, self._models_parameters.normalize_weights)
...@@ -136,7 +147,7 @@ class SingleOmpForest(OmpForest): ...@@ -136,7 +147,7 @@ class SingleOmpForest(OmpForest):
Make all the base tree predictions Make all the base tree predictions
:param X: a Forest :param X: a Forest
:return: a np.array of the predictions of the trees selected by OMP without applyong the weight :return: a np.array of the predictions of the trees selected by OMP without applying the weight
""" """
forest_predictions = np.array([tree.predict(X) for tree in self._base_forest_estimator.estimators_]) forest_predictions = np.array([tree.predict(X) for tree in self._base_forest_estimator.estimators_])
......
from bolsonaro.models.omp_forest import OmpForest, SingleOmpForest from bolsonaro.models.omp_forest import OmpForest, SingleOmpForest
from bolsonaro.utils import binarize_class_data from bolsonaro.utils import binarize_class_data, omp_premature_warning
import numpy as np import numpy as np
from sklearn.ensemble import RandomForestClassifier from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import OrthogonalMatchingPursuit from sklearn.linear_model import OrthogonalMatchingPursuit
import warnings
class OmpForestBinaryClassifier(SingleOmpForest): class OmpForestBinaryClassifier(SingleOmpForest):
...@@ -92,7 +93,19 @@ class OmpForestMulticlassClassifier(OmpForest): ...@@ -92,7 +93,19 @@ class OmpForestMulticlassClassifier(OmpForest):
omp_class = OrthogonalMatchingPursuit( omp_class = OrthogonalMatchingPursuit(
n_nonzero_coefs=self.models_parameters.extracted_forest_size, n_nonzero_coefs=self.models_parameters.extracted_forest_size,
fit_intercept=True, normalize=False) fit_intercept=True, normalize=False)
omp_class.fit(atoms_binary, objective_binary)
with warnings.catch_warnings(record=True) as caught_warnings:
# Cause all warnings to always be triggered.
warnings.simplefilter("always")
omp_class.fit(atoms_binary, objective_binary)
# ignore any non-custom warnings that may be in the list
caught_warnings = list(filter(lambda i: i.message != RuntimeWarning(omp_premature_warning), caught_warnings))
if len(caught_warnings) > 0:
logger.error(f'number of linear dependences in the dictionary: {len(caught_warnings)}. model parameters: {str(self._models_parameters.__dict__)}')
self._dct_class_omp[class_label] = omp_class self._dct_class_omp[class_label] = omp_class
return self._dct_class_omp return self._dct_class_omp
...@@ -119,9 +132,7 @@ class OmpForestMulticlassClassifier(OmpForest): ...@@ -119,9 +132,7 @@ class OmpForestMulticlassClassifier(OmpForest):
forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_]).T forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_]).T
if self._models_parameters.normalize_D: if self._models_parameters.normalize_D:
forest_predictions = forest_predictions.T
forest_predictions /= self._forest_norms forest_predictions /= self._forest_norms
forest_predictions = forest_predictions.T
label_names = [] label_names = []
preds = [] preds = []
...@@ -149,7 +160,9 @@ class OmpForestMulticlassClassifier(OmpForest): ...@@ -149,7 +160,9 @@ class OmpForestMulticlassClassifier(OmpForest):
forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_]).T forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_]).T
if self._models_parameters.normalize_D: if self._models_parameters.normalize_D:
forest_predictions = forest_predictions.T
forest_predictions /= self._forest_norms forest_predictions /= self._forest_norms
forest_predictions = forest_predictions.T
label_names = [] label_names = []
preds = [] preds = []
......
...@@ -17,18 +17,22 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta): ...@@ -17,18 +17,22 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
random_state=self._models_parameters.seed, n_jobs=-1) random_state=self._models_parameters.seed, n_jobs=-1)
self._extracted_forest_size = self._models_parameters.extracted_forest_size self._extracted_forest_size = self._models_parameters.extracted_forest_size
self._score_metric = score_metric self._score_metric = score_metric
self._selected_trees = list()
@property @property
def models_parameters(self): def models_parameters(self):
return self._models_parameters return self._models_parameters
@property
def selected_trees(self):
return self._selected_trees
def fit(self, X_train, y_train, X_val, y_val): def fit(self, X_train, y_train, X_val, y_val):
self._estimator.fit(X_train, y_train) self._estimator.fit(X_train, y_train)
y_val_pred = self._estimator.predict(X_val) y_val_pred = self._estimator.predict(X_val)
forest_pred = self._score_metric(y_val, y_val_pred) forest_pred = self._score_metric(y_val, y_val_pred)
forest = self._estimator.estimators_ forest = self._estimator.estimators_
selected_trees = list()
tree_list = list(self._estimator.estimators_) tree_list = list(self._estimator.estimators_)
val_scores = list() val_scores = list()
...@@ -57,12 +61,13 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta): ...@@ -57,12 +61,13 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
tree_list.insert(j, lonely_tree) tree_list.insert(j, lonely_tree)
val_scores.insert(j, lonely_tree.predict(X_val)) val_scores.insert(j, lonely_tree.predict(X_val))
tree_list_bar.update(1) tree_list_bar.update(1)
selected_trees.append(tree_list[found_index]) self._selected_trees.append(tree_list[found_index])
del tree_list[found_index] del tree_list[found_index]
del val_scores[found_index] del val_scores[found_index]
pruning_forest_bar.update(1) pruning_forest_bar.update(1)
pruned_forest = list(set(forest) - set(selected_trees)) self._selected_trees = set(self._selected_trees)
pruned_forest = list(set(forest) - self._selected_trees)
self._estimator.estimators_ = pruned_forest self._estimator.estimators_ = pruned_forest
def score(self, X, y): def score(self, X, y):
......
...@@ -13,6 +13,8 @@ from sklearn.metrics import mean_squared_error, accuracy_score ...@@ -13,6 +13,8 @@ from sklearn.metrics import mean_squared_error, accuracy_score
import time import time
import datetime import datetime
import numpy as np import numpy as np
import os
import pickle
class Trainer(object): class Trainer(object):
...@@ -36,6 +38,7 @@ class Trainer(object): ...@@ -36,6 +38,7 @@ class Trainer(object):
else classification_score_metric.__name__ else classification_score_metric.__name__
self._base_score_metric_name = base_regression_score_metric.__name__ if dataset.task == Task.REGRESSION \ self._base_score_metric_name = base_regression_score_metric.__name__ if dataset.task == Task.REGRESSION \
else base_classification_score_metric.__name__ else base_classification_score_metric.__name__
self._selected_trees = ''
@property @property
def score_metric_name(self): def score_metric_name(self):
...@@ -93,6 +96,7 @@ class Trainer(object): ...@@ -93,6 +96,7 @@ class Trainer(object):
X=self._X_forest, X=self._X_forest,
y=self._y_forest y=self._y_forest
) )
self._selected_trees = model.estimators_
else: else:
model.fit( model.fit(
self._X_forest, self._X_forest,
...@@ -151,6 +155,13 @@ class Trainer(object): ...@@ -151,6 +155,13 @@ class Trainer(object):
elif type(model) == OmpForestBinaryClassifier: elif type(model) == OmpForestBinaryClassifier:
model_weights = model._omp model_weights = model._omp
if type(model) in [SimilarityForestRegressor, EnsembleSelectionForestRegressor, KMeansForestRegressor]:
self._selected_trees = model.selected_trees
if len(self._selected_trees) > 0:
with open(os.path.join(models_dir, 'selected_trees.pickle'), 'wb') as output_file:
pickle.dump(self._selected_trees, output_file)
results = ModelRawResults( results = ModelRawResults(
model_weights=model_weights, model_weights=model_weights,
training_time=self._end_time - self._begin_time, training_time=self._end_time - self._begin_time,
......
...@@ -124,3 +124,7 @@ def is_float(value): ...@@ -124,3 +124,7 @@ def is_float(value):
return True return True
except ValueError: except ValueError:
return False return False
omp_premature_warning = """ Orthogonal matching pursuit ended prematurely due to linear
dependence in the dictionary. The requested precision might not have been met.
"""
...@@ -156,6 +156,7 @@ if __name__ == "__main__": ...@@ -156,6 +156,7 @@ if __name__ == "__main__":
DEFAULT_RESULTS_DIR = os.environ["project_dir"] + os.sep + 'results' DEFAULT_RESULTS_DIR = os.environ["project_dir"] + os.sep + 'results'
DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models' DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models'
DEFAULT_PLOT_WEIGHT_DENSITY = False DEFAULT_PLOT_WEIGHT_DENSITY = False
DEFAULT_WO_LOSS_PLOTS = False
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--stage', nargs='?', type=int, required=True, help='Specify the stage number among [1, 5].') parser.add_argument('--stage', nargs='?', type=int, required=True, help='Specify the stage number among [1, 5].')
...@@ -168,6 +169,7 @@ if __name__ == "__main__": ...@@ -168,6 +169,7 @@ if __name__ == "__main__":
parser.add_argument('--results_dir', nargs='?', type=str, default=DEFAULT_RESULTS_DIR, help='The output directory of the results.') parser.add_argument('--results_dir', nargs='?', type=str, default=DEFAULT_RESULTS_DIR, help='The output directory of the results.')
parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.') parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
parser.add_argument('--plot_weight_density', action='store_true', default=DEFAULT_PLOT_WEIGHT_DENSITY, help='Plot the weight density. Only working for regressor models for now.') parser.add_argument('--plot_weight_density', action='store_true', default=DEFAULT_PLOT_WEIGHT_DENSITY, help='Plot the weight density. Only working for regressor models for now.')
parser.add_argument('--wo_loss_plots', action='store_true', default=DEFAULT_WO_LOSS_PLOTS, help='Do not compute the loss plots.')
args = parser.parse_args() args = parser.parse_args()
if args.stage not in list(range(1, 6)): if args.stage not in list(range(1, 6)):
...@@ -181,7 +183,7 @@ if __name__ == "__main__": ...@@ -181,7 +183,7 @@ if __name__ == "__main__":
# Create recursively the results dir tree # Create recursively the results dir tree
pathlib.Path(args.results_dir).mkdir(parents=True, exist_ok=True) pathlib.Path(args.results_dir).mkdir(parents=True, exist_ok=True)
if args.stage == 1: if args.stage == 1 and not args.wo_loss_plots:
if len(args.experiment_ids) != 6: if len(args.experiment_ids) != 6:
raise ValueError('In the case of stage 1, the number of specified experiment ids must be 6.') raise ValueError('In the case of stage 1, the number of specified experiment ids must be 6.')
...@@ -221,8 +223,8 @@ if __name__ == "__main__": ...@@ -221,8 +223,8 @@ if __name__ == "__main__":
wo_params_extracted_forest_sizes, random_wo_params_experiment_score_metric = \ wo_params_extracted_forest_sizes, random_wo_params_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes( extract_scores_across_seeds_and_extracted_forest_sizes(
args.models_dir, args.results_dir, int(args.experiment_ids[4])) args.models_dir, args.results_dir, int(args.experiment_ids[4]))
# base_wo_params # omp_wo_params
logger.info('Loading base_wo_params experiment scores...') logger.info('Loading omp_wo_params experiment scores...')
omp_wo_params_train_scores, omp_wo_params_dev_scores, omp_wo_params_test_scores, _, \ omp_wo_params_train_scores, omp_wo_params_dev_scores, omp_wo_params_test_scores, _, \
omp_wo_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes( omp_wo_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
args.models_dir, args.results_dir, int(args.experiment_ids[5])) args.models_dir, args.results_dir, int(args.experiment_ids[5]))
...@@ -262,7 +264,7 @@ if __name__ == "__main__": ...@@ -262,7 +264,7 @@ if __name__ == "__main__":
ylabel=experiments_score_metric, ylabel=experiments_score_metric,
title='Loss values of {}\nusing best and default hyperparameters'.format(args.dataset_name) title='Loss values of {}\nusing best and default hyperparameters'.format(args.dataset_name)
) )
elif args.stage == 2: elif args.stage == 2 and not args.wo_loss_plots:
if len(args.experiment_ids) != 4: if len(args.experiment_ids) != 4:
raise ValueError('In the case of stage 2, the number of specified experiment ids must be 4.') raise ValueError('In the case of stage 2, the number of specified experiment ids must be 4.')
...@@ -308,7 +310,7 @@ if __name__ == "__main__": ...@@ -308,7 +310,7 @@ if __name__ == "__main__":
xlabel='Number of trees extracted', xlabel='Number of trees extracted',
ylabel=experiments_score_metric, ylabel=experiments_score_metric,
title='Loss values of {}\nusing different normalizations'.format(args.dataset_name)) title='Loss values of {}\nusing different normalizations'.format(args.dataset_name))
elif args.stage == 3: elif args.stage == 3 and not args.wo_loss_plots:
if len(args.experiment_ids) != 3: if len(args.experiment_ids) != 3:
raise ValueError('In the case of stage 3, the number of specified experiment ids must be 3.') raise ValueError('In the case of stage 3, the number of specified experiment ids must be 3.')
...@@ -365,7 +367,7 @@ if __name__ == "__main__": ...@@ -365,7 +367,7 @@ if __name__ == "__main__":
xlabel='Number of trees extracted', xlabel='Number of trees extracted',
ylabel=experiments_score_metric, ylabel=experiments_score_metric,
title='Loss values of {}\nusing different training subsets'.format(args.dataset_name))""" title='Loss values of {}\nusing different training subsets'.format(args.dataset_name))"""
elif args.stage == 4: elif args.stage == 4 and not args.wo_loss_plots:
if len(args.experiment_ids) != 3: if len(args.experiment_ids) != 3:
raise ValueError('In the case of stage 4, the number of specified experiment ids must be 3.') raise ValueError('In the case of stage 4, the number of specified experiment ids must be 3.')
...@@ -427,11 +429,7 @@ if __name__ == "__main__": ...@@ -427,11 +429,7 @@ if __name__ == "__main__":
xlabel='Number of trees extracted', xlabel='Number of trees extracted',
ylabel=experiments_score_metric, ylabel=experiments_score_metric,
title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name)) title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name))
elif args.stage == 5 and not args.wo_loss_plots:
experiment_weights = extract_weights_across_seeds(args.models_dir, args.results_dir, args.experiment_ids[2])
Plotter.weight_density(experiment_weights, os.path.join(output_path, 'weight_density.png'))
elif args.stage == 5:
# Retreive the extracted forest sizes number used in order to have a base forest axis as long as necessary # Retreive the extracted forest sizes number used in order to have a base forest axis as long as necessary
extracted_forest_sizes_number = retreive_extracted_forest_sizes_number(args.models_dir, int(args.experiment_ids[1])) extracted_forest_sizes_number = retreive_extracted_forest_sizes_number(args.models_dir, int(args.experiment_ids[1]))
all_labels = list() all_labels = list()
...@@ -475,8 +473,9 @@ if __name__ == "__main__": ...@@ -475,8 +473,9 @@ if __name__ == "__main__":
continue continue
logger.info(f'Loading {label} experiment scores...') logger.info(f'Loading {label} experiment scores...')
current_experiment_id = int(args.experiment_ids[i].split('=')[1])
_, _, current_test_scores, _, _ = extract_scores_across_seeds_and_extracted_forest_sizes( _, _, current_test_scores, _, _ = extract_scores_across_seeds_and_extracted_forest_sizes(
args.models_dir, args.results_dir, int(args.experiment_ids[i].split('=')[1])) args.models_dir, args.results_dir, current_experiment_id)
all_labels.append(label) all_labels.append(label)
all_scores.append(current_test_scores) all_scores.append(current_test_scores)
...@@ -491,7 +490,42 @@ if __name__ == "__main__": ...@@ -491,7 +490,42 @@ if __name__ == "__main__":
xlabel='Number of trees extracted', xlabel='Number of trees extracted',
ylabel=base_with_params_experiment_score_metric, ylabel=base_with_params_experiment_score_metric,
title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name)) title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name))
else:
raise ValueError('This stage number is not supported yet, but it will be!') if args.plot_weight_density:
root_output_path = os.path.join(args.results_dir, args.dataset_name, f'stage{args.stage}')
if args.stage == 1:
omp_experiment_ids = [('omp_with_params', args.experiment_ids[2]), ('omp_wo_params', args.experiment_ids[2])]
elif args.stage == 2:
omp_experiment_ids = [('no_normalization', args.experiment_ids[0]),
('normalize_D', args.experiment_ids[1]),
('normalize_weights', args.experiment_ids[2]),
('normalize_D_and_weights', args.experiment_ids[3])]
elif args.stage == 3:
omp_experiment_ids = [('train-dev_subset', args.experiment_ids[0]),
('train-dev_train-dev_subset', args.experiment_ids[1]),
('train-train-dev_subset', args.experiment_ids[2])]
elif args.stage == 4:
omp_experiment_ids = [('omp_with_params', args.experiment_ids[2])]
elif args.stage == 5:
omp_experiment_ids = [('omp_with_params', args.experiment_ids[2])]
for i in range(3, len(args.experiment_ids)):
if 'kmeans' in args.experiment_ids[i]:
label = 'kmeans'
elif 'similarity' in args.experiment_ids[i]:
label = 'similarity'
elif 'ensemble' in args.experiment_ids[i]:
label = 'ensemble'
else:
logger.error('Invalid value encountered')
continue
current_experiment_id = int(args.experiment_ids[i].split('=')[1])
omp_experiment_ids.append((label, current_experiment_id))
for (experiment_label, experiment_id) in omp_experiment_ids:
logger.info(f'Computing weight density plot for experiment {experiment_label}...')
experiment_weights = extract_weights_across_seeds(args.models_dir, args.results_dir, experiment_id)
Plotter.weight_density(experiment_weights, os.path.join(root_output_path, f'weight_density_{experiment_label}.png'))
logger.info('Done.') logger.info('Done.')
...@@ -141,7 +141,7 @@ def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_siz ...@@ -141,7 +141,7 @@ def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_siz
sub_models_dir_files = os.listdir(sub_models_dir) sub_models_dir_files = os.listdir(sub_models_dir)
for file_name in sub_models_dir_files: for file_name in sub_models_dir_files: