diff --git a/code/bolsonaro/models/kmeans_forest_regressor.py b/code/bolsonaro/models/kmeans_forest_regressor.py new file mode 100644 index 0000000000000000000000000000000000000000..a1a3dee940844a1e48a5fbd5df416bdea6eae903 --- /dev/null +++ b/code/bolsonaro/models/kmeans_forest_regressor.py @@ -0,0 +1,78 @@ +from bolsonaro.utils import tqdm_joblib + +from sklearn.ensemble import RandomForestRegressor +from sklearn.metrics import mean_squared_error +from sklearn.base import BaseEstimator +from sklearn.cluster import KMeans +from abc import abstractmethod, ABCMeta +import numpy as np +from scipy.stats import mode +from joblib import Parallel, delayed +from tqdm import tqdm + + +class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta): + """ + 'On extreme pruning of random forest ensembles for real-time predictive applications', by Khaled Fawagreh, Mohamed Medhat Gaber and Eyad Elyan. + """ + + def __init__(self, models_parameters, score_metric=mean_squared_error): + self._models_parameters = models_parameters + self._estimator = RandomForestRegressor(**self._models_parameters.hyperparameters, + random_state=self._models_parameters.seed, n_jobs=-1) + self._extracted_forest_size = self._models_parameters.extracted_forest_size + self._score_metric = score_metric + + @property + def models_parameters(self): + return self._models_parameters + + def fit(self, X_train, y_train, X_val, y_val): + self._estimator.fit(X_train, y_train) + + predictions = list() + for tree in self._estimator.estimators_: + predictions.append(tree.predict(X_train)) + predictions = np.array(predictions) + + kmeans = KMeans(n_clusters=self._extracted_forest_size, random_state=self._models_parameters.seed).fit(predictions) + labels = np.array(kmeans.labels_) + + # For each cluster select the best tree on the validation set + extracted_forest_sizes = list(range(self._extracted_forest_size)) + with tqdm_joblib(tqdm(total=self._extracted_forest_size, disable=True)) as prune_forest_job_pb: + pruned_forest = Parallel(n_jobs=-1)(delayed(self._prune_forest_job)(prune_forest_job_pb, + extracted_forest_sizes[i], labels, X_val, y_val, self._score_metric) + for i in range(self._extracted_forest_size)) + + self._estimator.estimators_ = pruned_forest + + def _prune_forest_job(self, prune_forest_job_pb, c, labels, X_val, y_val, score_metric): + index = np.where(labels == c)[0] + with tqdm_joblib(tqdm(total=len(index), disable=True)) as cluster_job_pb: + cluster = Parallel(n_jobs=-1)(delayed(self._cluster_job)(cluster_job_pb, index[i], X_val, + y_val, score_metric) for i in range(len(index))) + best_tree_index = np.argmax(cluster) + prune_forest_job_pb.update() + return self._estimator.estimators_[index[best_tree_index]] + + def _cluster_job(self, cluster_job_pb, i, X_val, y_val, score_metric): + y_val_pred = self._estimator.estimators_[i].predict(X_val) + tree_pred = score_metric(y_val, y_val_pred) + cluster_job_pb.update() + return tree_pred + + def predict(self, X): + return self._estimator.predict(X) + + def score(self, X, y): + predictions = list() + for tree in self._estimator.estimators_: + predictions.append(tree.predict(X)) + predictions = np.array(predictions) + mean_predictions = np.mean(predictions, axis=0) + score = self._score_metric(mean_predictions, y) + return score + + def predict_base_estimator(self, X): + return self._estimator.predict(X) diff --git a/code/bolsonaro/models/model_factory.py b/code/bolsonaro/models/model_factory.py index 74993cc0a30b754595a490de40d69e064687bc24..bbda6cae89d218c7831780f71b9fc6a7bc022d54
100644 --- a/code/bolsonaro/models/model_factory.py +++ b/code/bolsonaro/models/model_factory.py @@ -2,6 +2,7 @@ from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, Om from bolsonaro.models.omp_forest_regressor import OmpForestRegressor from bolsonaro.models.model_parameters import ModelParameters from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor +from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor from bolsonaro.data.task import Task from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier @@ -22,9 +23,11 @@ class ModelFactory(object): elif model_parameters.extraction_strategy == 'random': return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size, random_state=model_parameters.seed) - else: + elif model_parameters.extraction_strategy == 'none': return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'], random_state=model_parameters.seed) + else: + raise ValueError('Invalid extraction strategy') elif task == Task.REGRESSION: if model_parameters.extraction_strategy == 'omp': return OmpForestRegressor(model_parameters) @@ -33,15 +36,21 @@ class ModelFactory(object): random_state=model_parameters.seed) elif model_parameters.extraction_strategy == 'similarity': return SimilarityForestRegressor(model_parameters) - else: + elif model_parameters.extraction_strategy == 'kmeans': + return KMeansForestRegressor(model_parameters) + elif model_parameters.extraction_strategy == 'none': return RandomForestRegressor(n_estimators=model_parameters.hyperparameters['n_estimators'], random_state=model_parameters.seed) + else: + raise ValueError('Invalid extraction strategy') elif task == Task.MULTICLASSIFICATION: if model_parameters.extraction_strategy == 'omp': return OmpForestMulticlassClassifier(model_parameters) elif model_parameters.extraction_strategy == 'random': return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size, random_state=model_parameters.seed) - else: + elif model_parameters.extraction_strategy == 'none': return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'], random_state=model_parameters.seed) + else: + raise ValueError('Invalid extraction strategy') diff --git a/code/bolsonaro/models/similarity_forest_regressor.py b/code/bolsonaro/models/similarity_forest_regressor.py index f8d9c3ed349cf8c9e27acbcd7982694a65e11636..647e8695da88c0f84817a602471fd90f9bd1f1b0 100644 --- a/code/bolsonaro/models/similarity_forest_regressor.py +++ b/code/bolsonaro/models/similarity_forest_regressor.py @@ -3,6 +3,7 @@ from sklearn.metrics import mean_squared_error from sklearn.base import BaseEstimator from abc import abstractmethod, ABCMeta import numpy as np +from tqdm import tqdm class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta): @@ -10,56 +11,69 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta): https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2822360/ """ - def __init__(self, models_parameters): + def __init__(self, models_parameters, score_metric=mean_squared_error): self._models_parameters = models_parameters - self._regressor = RandomForestRegressor(n_estimators=self._models_parameters.hyperparameters['n_estimators'], - random_state=models_parameters.seed) + self._estimator = RandomForestRegressor(**self._models_parameters.hyperparameters, + random_state=self._models_parameters.seed, n_jobs=-1) self._extracted_forest_size = 
self._models_parameters.extracted_forest_size + self._score_metric = score_metric @property def models_parameters(self): return self._models_parameters - def fit(self, X_train, y_train, X_val, y_val, score_metric=mean_squared_error): + def fit(self, X_train, y_train, X_val, y_val): + self._estimator.fit(X_train, y_train) - self._regressor.fit(X_train, y_train) - - y_val_pred = self._regressor.predict(X_val) - forest_pred = score_metric(y_val, y_val_pred) - forest = self._regressor.estimators_ + y_val_pred = self._estimator.predict(X_val) + forest_pred = self._score_metric(y_val, y_val_pred) + forest = self._estimator.estimators_ selected_trees = list() - tree_list = list(self._regressor.estimators_) + tree_list = list(self._estimator.estimators_) + + val_scores = list() + with tqdm(tree_list) as tree_pred_bar: + tree_pred_bar.set_description('[Initial tree predictions]') + for tree in tree_pred_bar: + val_scores.append(tree.predict(X_val)) + tree_pred_bar.update(1) - for _ in range(self._extracted_forest_size): - best_similarity = 100000 - found_index = 0 - for i in range(len(tree_list)): - lonely_tree = tree_list[i] - del tree_list[i] - val_list = list() - for tree in tree_list: - val_pred = tree.predict(X_val) - val_list.append(val_pred) - val_list = np.array(val_list) - val_mean = np.mean(val_list, axis=0) - val_score = score_metric(val_mean, y_val) - temp_similarity = abs(forest_pred - val_score) - if (temp_similarity < best_similarity): - found_index = i - best_similarity = temp_similarity - tree_list.insert(i, lonely_tree) - selected_trees.append(tree_list[found_index]) - del tree_list[found_index] + with tqdm(range(self._extracted_forest_size), disable=True) as pruning_forest_bar: + pruning_forest_bar.set_description(f'[Pruning forest s={self._extracted_forest_size}]') + for i in pruning_forest_bar: + best_similarity = 100000 + found_index = 0 + with tqdm(range(len(tree_list)), disable=True) as tree_list_bar: + tree_list_bar.set_description(f'[Tree selection s={self._extracted_forest_size} #{i}]') + for j in tree_list_bar: + lonely_tree = tree_list[j] + del tree_list[j] + val_mean = np.mean(np.asarray(val_scores), axis=0) + val_score = self._score_metric(val_mean, y_val) + temp_similarity = abs(forest_pred - val_score) + if (temp_similarity < best_similarity): + found_index = j + best_similarity = temp_similarity + tree_list.insert(j, lonely_tree) + val_scores.insert(j, lonely_tree.predict(X_val)) + tree_list_bar.update(1) + selected_trees.append(tree_list[found_index]) + del tree_list[found_index] + del val_scores[found_index] + pruning_forest_bar.update(1) pruned_forest = list(set(forest) - set(selected_trees)) - self._regressor.estimators_ = pruned_forest + self._estimator.estimators_ = pruned_forest def score(self, X, y): test_list = list() - for mod in self._regressor.estimators_: + for mod in self._estimator.estimators_: test_pred = mod.predict(X) test_list.append(test_pred) test_list = np.array(test_list) test_mean = np.mean(test_list, axis=0) - score = mean_squared_error(test_mean, y) + score = self._score_metric(test_mean, y) return score + + def predict_base_estimator(self, X): + return self._estimator.predict(X) diff --git a/code/bolsonaro/trainer.py b/code/bolsonaro/trainer.py index fc289afd1f8301197f5b1dd8be3bb134deca4a91..7070126e2a9a8f449757bdab9381b4bffab99b2d 100644 --- a/code/bolsonaro/trainer.py +++ b/code/bolsonaro/trainer.py @@ -1,192 +1,190 @@ -from bolsonaro.models.model_raw_results import ModelRawResults -from bolsonaro.models.omp_forest_regressor import 
OmpForestRegressor -from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier -from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor -from bolsonaro.error_handling.logger_factory import LoggerFactory -from bolsonaro.data.task import Task -from . import LOG_PATH - -from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier -from sklearn.metrics import mean_squared_error, accuracy_score -import time -import datetime -import numpy as np - - -class Trainer(object): - """ - Class capable of fitting any model object to some prepared data then evaluate and save results through the `train` method. - """ - - def __init__(self, dataset, regression_score_metric=mean_squared_error, classification_score_metric=accuracy_score, - base_regression_score_metric=mean_squared_error, base_classification_score_metric=accuracy_score): - """ - - :param dataset: Object with X_train, y_train, X_dev, y_dev, X_test and Y_test attributes - """ - self._dataset = dataset - self._logger = LoggerFactory.create(LOG_PATH, __name__) - self._regression_score_metric = regression_score_metric - self._classification_score_metric = classification_score_metric - self._base_regression_score_metric = base_regression_score_metric - self._base_classification_score_metric = base_classification_score_metric - self._score_metric_name = regression_score_metric.__name__ if dataset.task == Task.REGRESSION \ - else classification_score_metric.__name__ - self._base_score_metric_name = base_regression_score_metric.__name__ if dataset.task == Task.REGRESSION \ - else base_classification_score_metric.__name__ - - @property - def score_metric_name(self): - return self._score_metric_name - - @property - def base_score_metric_name(self): - return self._base_score_metric_name - - def init(self, model, subsets_used='train,dev'): - if type(model) in [RandomForestRegressor, RandomForestClassifier]: - if subsets_used == 'train,dev': - self._X_forest = self._dataset.X_train - self._y_forest = self._dataset.y_train - else: - self._X_forest = np.concatenate([self._dataset.X_train, self._dataset.X_dev]) - self._y_forest = np.concatenate([self._dataset.y_train, self._dataset.y_dev]) - self._logger.debug('Fitting the forest on train subset') - elif model.models_parameters.subsets_used == 'train,dev': - self._X_forest = self._dataset.X_train - self._y_forest = self._dataset.y_train - self._X_omp = self._dataset.X_dev - self._y_omp = self._dataset.y_dev - self._logger.debug('Fitting the forest on train subset and OMP on dev subset.') - elif model.models_parameters.subsets_used == 'train+dev,train+dev': - self._X_forest = np.concatenate([self._dataset.X_train, self._dataset.X_dev]) - self._X_omp = self._X_forest - self._y_forest = np.concatenate([self._dataset.y_train, self._dataset.y_dev]) - self._y_omp = self._y_forest - self._logger.debug('Fitting both the forest and OMP on train+dev subsets.') - elif model.models_parameters.subsets_used == 'train,train+dev': - self._X_forest = self._dataset.X_train - self._y_forest = self._dataset.y_train - self._X_omp = np.concatenate([self._dataset.X_train, self._dataset.X_dev]) - self._y_omp = np.concatenate([self._dataset.y_train, self._dataset.y_dev]) - else: - raise ValueError("Unknown specified subsets_used parameter '{}'".format(model.models_parameters.subsets_used)) - - def train(self, model): - """ - :param model: An instance of either RandomForestRegressor, RandomForestClassifier, OmpForestRegressor, - 
OmpForestBinaryClassifier, OmpForestMulticlassClassifier. - :return: - """ - - self._logger.debug('Training model using train set...') - self._begin_time = time.time() - if type(model) in [RandomForestRegressor, RandomForestClassifier]: - model.fit( - X=self._X_forest, - y=self._y_forest - ) - else: - model.fit( - self._X_forest, - self._y_forest, - self._X_omp, - self._y_omp - ) - self._end_time = time.time() - - def __score_func(self, model, X, y_true, weights=True): - if type(model) in [OmpForestRegressor, RandomForestRegressor, SimilarityForestRegressor]: - if weights: - y_pred = model.predict(X) - else: - y_pred = model.predict_no_weights(X) - result = self._regression_score_metric(y_true, y_pred) - elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier, RandomForestClassifier]: - if weights: - y_pred = model.predict(X) - else: - y_pred = model.predict_no_weights(X) - if type(model) is OmpForestBinaryClassifier: - y_pred = np.sign(y_pred) - y_pred = np.where(y_pred==0, 1, y_pred) - result = self._classification_score_metric(y_true, y_pred) - return result - - def __score_func_base(self, model, X, y_true): - if type(model) == OmpForestRegressor: - y_pred = model.predict_base_estimator(X) - result = self._base_regression_score_metric(y_true, y_pred) - elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier]: - y_pred = model.predict_base_estimator(X) - result = self._base_classification_score_metric(y_true, y_pred) - elif type(model) == RandomForestClassifier: - y_pred = model.predict(X) - result = self._base_classification_score_metric(y_true, y_pred) - elif type(model) in [RandomForestRegressor, SimilarityForestRegressor]: - y_pred = model.predict(X) - result = self._base_regression_score_metric(y_true, y_pred) - return result - - def compute_results(self, model, models_dir): - """ - :param model: Object with - :param models_dir: Where the results will be saved - """ - - model_weights = '' - if type(model) in [OmpForestRegressor, OmpForestBinaryClassifier]: - model_weights = model._omp.coef_ - elif type(model) == OmpForestMulticlassClassifier: - model_weights = model._dct_class_omp - elif type(model) == OmpForestBinaryClassifier: - model_weights = model._omp - - results = ModelRawResults( - model_weights=model_weights, - training_time=self._end_time - self._begin_time, - datetime=datetime.datetime.now(), - train_score=self.__score_func(model, self._dataset.X_train, self._dataset.y_train), - dev_score=self.__score_func(model, self._dataset.X_dev, self._dataset.y_dev), - test_score=self.__score_func(model, self._dataset.X_test, self._dataset.y_test), - train_score_base=self.__score_func_base(model, self._dataset.X_train, self._dataset.y_train), - dev_score_base=self.__score_func_base(model, self._dataset.X_dev, self._dataset.y_dev), - test_score_base=self.__score_func_base(model, self._dataset.X_test, self._dataset.y_test), - score_metric=self._score_metric_name, - base_score_metric=self._base_score_metric_name - ) - results.save(models_dir) - self._logger.info("Base performance on test: {}".format(results.test_score_base)) - self._logger.info("Performance on test: {}".format(results.test_score)) - - self._logger.info("Base performance on train: {}".format(results.train_score_base)) - self._logger.info("Performance on train: {}".format(results.train_score)) - - self._logger.info("Base performance on dev: {}".format(results.dev_score_base)) - self._logger.info("Performance on dev: {}".format(results.dev_score)) - - if type(model) not in 
[RandomForestRegressor, RandomForestClassifier]: - results = ModelRawResults( - model_weights='', - training_time=self._end_time - self._begin_time, - datetime=datetime.datetime.now(), - train_score=self.__score_func(model, self._dataset.X_train, self._dataset.y_train, False), - dev_score=self.__score_func(model, self._dataset.X_dev, self._dataset.y_dev, False), - test_score=self.__score_func(model, self._dataset.X_test, self._dataset.y_test, False), - train_score_base=self.__score_func_base(model, self._dataset.X_train, self._dataset.y_train), - dev_score_base=self.__score_func_base(model, self._dataset.X_dev, self._dataset.y_dev), - test_score_base=self.__score_func_base(model, self._dataset.X_test, self._dataset.y_test), - score_metric=self._score_metric_name, - base_score_metric=self._base_score_metric_name - ) - results.save(models_dir+'_no_weights') - self._logger.info("Base performance on test without weights: {}".format(results.test_score_base)) - self._logger.info("Performance on test: {}".format(results.test_score)) - - self._logger.info("Base performance on train without weights: {}".format(results.train_score_base)) - self._logger.info("Performance on train: {}".format(results.train_score)) - - self._logger.info("Base performance on dev without weights: {}".format(results.dev_score_base)) - self._logger.info("Performance on dev: {}".format(results.dev_score)) - - +from bolsonaro.models.model_raw_results import ModelRawResults +from bolsonaro.models.omp_forest_regressor import OmpForestRegressor +from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier +from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor +from bolsonaro.error_handling.logger_factory import LoggerFactory +from bolsonaro.data.task import Task +from . import LOG_PATH + +from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier +from sklearn.metrics import mean_squared_error, accuracy_score +import time +import datetime +import numpy as np + + +class Trainer(object): + """ + Class capable of fitting any model object to some prepared data then evaluate and save results through the `train` method. 
+ """ + + def __init__(self, dataset, regression_score_metric=mean_squared_error, classification_score_metric=accuracy_score, + base_regression_score_metric=mean_squared_error, base_classification_score_metric=accuracy_score): + """ + + :param dataset: Object with X_train, y_train, X_dev, y_dev, X_test and Y_test attributes + """ + self._dataset = dataset + self._logger = LoggerFactory.create(LOG_PATH, __name__) + self._regression_score_metric = regression_score_metric + self._classification_score_metric = classification_score_metric + self._base_regression_score_metric = base_regression_score_metric + self._base_classification_score_metric = base_classification_score_metric + self._score_metric_name = regression_score_metric.__name__ if dataset.task == Task.REGRESSION \ + else classification_score_metric.__name__ + self._base_score_metric_name = base_regression_score_metric.__name__ if dataset.task == Task.REGRESSION \ + else base_classification_score_metric.__name__ + + @property + def score_metric_name(self): + return self._score_metric_name + + @property + def base_score_metric_name(self): + return self._base_score_metric_name + + def init(self, model, subsets_used='train,dev'): + if type(model) in [RandomForestRegressor, RandomForestClassifier]: + if subsets_used == 'train,dev': + self._X_forest = self._dataset.X_train + self._y_forest = self._dataset.y_train + else: + self._X_forest = np.concatenate([self._dataset.X_train, self._dataset.X_dev]) + self._y_forest = np.concatenate([self._dataset.y_train, self._dataset.y_dev]) + self._logger.debug('Fitting the forest on train subset') + elif model.models_parameters.subsets_used == 'train,dev': + self._X_forest = self._dataset.X_train + self._y_forest = self._dataset.y_train + self._X_omp = self._dataset.X_dev + self._y_omp = self._dataset.y_dev + self._logger.debug('Fitting the forest on train subset and OMP on dev subset.') + elif model.models_parameters.subsets_used == 'train+dev,train+dev': + self._X_forest = np.concatenate([self._dataset.X_train, self._dataset.X_dev]) + self._X_omp = self._X_forest + self._y_forest = np.concatenate([self._dataset.y_train, self._dataset.y_dev]) + self._y_omp = self._y_forest + self._logger.debug('Fitting both the forest and OMP on train+dev subsets.') + elif model.models_parameters.subsets_used == 'train,train+dev': + self._X_forest = self._dataset.X_train + self._y_forest = self._dataset.y_train + self._X_omp = np.concatenate([self._dataset.X_train, self._dataset.X_dev]) + self._y_omp = np.concatenate([self._dataset.y_train, self._dataset.y_dev]) + else: + raise ValueError("Unknown specified subsets_used parameter '{}'".format(model.models_parameters.subsets_used)) + + def train(self, model): + """ + :param model: An instance of either RandomForestRegressor, RandomForestClassifier, OmpForestRegressor, + OmpForestBinaryClassifier, OmpForestMulticlassClassifier. 
+ :return: + """ + + self._logger.debug('Training model using train set...') + self._begin_time = time.time() + if type(model) in [RandomForestRegressor, RandomForestClassifier]: + model.fit( + X=self._X_forest, + y=self._y_forest + ) + else: + model.fit( + self._X_forest, + self._y_forest, + self._X_omp, + self._y_omp + ) + self._end_time = time.time() + + def __score_func(self, model, X, y_true, weights=True): + if type(model) in [OmpForestRegressor, RandomForestRegressor, SimilarityForestRegressor]: + if weights: + y_pred = model.predict(X) + else: + y_pred = model.predict_no_weights(X) + result = self._regression_score_metric(y_true, y_pred) + elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier, RandomForestClassifier]: + if weights: + y_pred = model.predict(X) + else: + y_pred = model.predict_no_weights(X) + if type(model) is OmpForestBinaryClassifier: + y_pred = np.sign(y_pred) + y_pred = np.where(y_pred==0, 1, y_pred) + result = self._classification_score_metric(y_true, y_pred) + return result + + def __score_func_base(self, model, X, y_true): + if type(model) == OmpForestRegressor: + y_pred = model.predict_base_estimator(X) + result = self._base_regression_score_metric(y_true, y_pred) + elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier]: + y_pred = model.predict_base_estimator(X) + result = self._base_classification_score_metric(y_true, y_pred) + elif type(model) == RandomForestClassifier: + y_pred = model.predict(X) + result = self._base_classification_score_metric(y_true, y_pred) + elif type(model) in [RandomForestRegressor, SimilarityForestRegressor]: + y_pred = model.predict(X) + result = self._base_regression_score_metric(y_true, y_pred) + return result + + def compute_results(self, model, models_dir): + """ + :param model: Object with + :param models_dir: Where the results will be saved + """ + + model_weights = '' + if type(model) in [OmpForestRegressor, OmpForestBinaryClassifier]: + model_weights = model._omp.coef_ + elif type(model) == OmpForestMulticlassClassifier: + model_weights = model._dct_class_omp + elif type(model) == OmpForestBinaryClassifier: + model_weights = model._omp + + results = ModelRawResults( + model_weights=model_weights, + training_time=self._end_time - self._begin_time, + datetime=datetime.datetime.now(), + train_score=self.__score_func(model, self._dataset.X_train, self._dataset.y_train), + dev_score=self.__score_func(model, self._dataset.X_dev, self._dataset.y_dev), + test_score=self.__score_func(model, self._dataset.X_test, self._dataset.y_test), + train_score_base=self.__score_func_base(model, self._dataset.X_train, self._dataset.y_train), + dev_score_base=self.__score_func_base(model, self._dataset.X_dev, self._dataset.y_dev), + test_score_base=self.__score_func_base(model, self._dataset.X_test, self._dataset.y_test), + score_metric=self._score_metric_name, + base_score_metric=self._base_score_metric_name + ) + results.save(models_dir) + self._logger.info("Base performance on test: {}".format(results.test_score_base)) + self._logger.info("Performance on test: {}".format(results.test_score)) + + self._logger.info("Base performance on train: {}".format(results.train_score_base)) + self._logger.info("Performance on train: {}".format(results.train_score)) + + self._logger.info("Base performance on dev: {}".format(results.dev_score_base)) + self._logger.info("Performance on dev: {}".format(results.dev_score)) + + if type(model) not in [RandomForestRegressor, RandomForestClassifier]: + results = 
ModelRawResults( + model_weights='', + training_time=self._end_time - self._begin_time, + datetime=datetime.datetime.now(), + train_score=self.__score_func(model, self._dataset.X_train, self._dataset.y_train, False), + dev_score=self.__score_func(model, self._dataset.X_dev, self._dataset.y_dev, False), + test_score=self.__score_func(model, self._dataset.X_test, self._dataset.y_test, False), + train_score_base=self.__score_func_base(model, self._dataset.X_train, self._dataset.y_train), + dev_score_base=self.__score_func_base(model, self._dataset.X_dev, self._dataset.y_dev), + test_score_base=self.__score_func_base(model, self._dataset.X_test, self._dataset.y_test), + score_metric=self._score_metric_name, + base_score_metric=self._base_score_metric_name + ) + results.save(models_dir+'_no_weights') + self._logger.info("Base performance on test without weights: {}".format(results.test_score_base)) + self._logger.info("Performance on test: {}".format(results.test_score)) + + self._logger.info("Base performance on train without weights: {}".format(results.train_score_base)) + self._logger.info("Performance on train: {}".format(results.train_score)) + + self._logger.info("Base performance on dev without weights: {}".format(results.dev_score_base)) + self._logger.info("Performance on dev: {}".format(results.dev_score)) diff --git a/code/compute_results.py b/code/compute_results.py index f15a7ff80249c538f2a408b564965de125b21cc4..5f7fac2c7718cf887d3d83a5b3a7eb9cdebfb9d9 100644 --- a/code/compute_results.py +++ b/code/compute_results.py @@ -400,23 +400,51 @@ if __name__ == "__main__": xlabel='Number of trees extracted', ylabel=experiments_score_metric, title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name)) + elif args.stage == 5: + # Retrieve the number of extracted forest sizes used, in order to have a base forest axis as long as necessary + extracted_forest_sizes_number = retreive_extracted_forest_sizes_number(args.models_dir, args.experiment_ids[1]) + + # base_with_params + logger.info('Loading base_with_params experiment scores...') + base_with_params_train_scores, base_with_params_dev_scores, base_with_params_test_scores, \ + base_with_params_experiment_score_metric = \ + extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, args.experiment_ids[0], + extracted_forest_sizes_number) + # random_with_params + logger.info('Loading random_with_params experiment scores...') + random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores, \ + with_params_extracted_forest_sizes, random_with_params_experiment_score_metric = \ + extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[1]) + # omp_with_params + logger.info('Loading omp_with_params experiment scores...') + omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \ + omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes( + args.models_dir, args.results_dir, args.experiment_ids[2]) + # kmeans_with_params + logger.info('Loading kmeans_with_params experiment scores...') + kmeans_with_params_train_scores, kmeans_with_params_dev_scores, kmeans_with_params_test_scores, _, \ + kmeans_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes( + args.models_dir, args.results_dir, args.experiment_ids[3]) + + # Sanity check on the metrics retrieved + if not (base_with_params_experiment_score_metric ==
random_with_params_experiment_score_metric + == omp_with_params_experiment_score_metric == kmeans_with_params_experiment_score_metric): + raise ValueError('Score metrics of all experiments must be the same.') + experiments_score_metric = base_with_params_experiment_score_metric + + output_path = os.path.join(args.results_dir, args.dataset_name, 'stage5_kmeans') + pathlib.Path(output_path).mkdir(parents=True, exist_ok=True) - # experiment_weights - #Plotter.weight_density(experiment_weights, output_path + os.sep + 'weight_density.png') + Plotter.plot_stage2_losses( + file_path=output_path + os.sep + 'losses.png', + all_experiment_scores=[base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores, + kmeans_with_params_test_scores], + all_labels=['base', 'random', 'omp', 'kmeans'], + x_value=with_params_extracted_forest_sizes, + xlabel='Number of trees extracted', + ylabel=experiments_score_metric, + title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name)) else: raise ValueError('This stage number is not supported yet, but it will be!') logger.info('Done.') - - """ - TODO: - For each dataset: - Stage 1) [DONE for california_housing] A figure for the selection of the best base forest model hyperparameters (best vs default/random hyperparams) - Stage 2) [DONE for california_housing] A figure for the selection of the best combination of normalization: D normalization vs weights normalization (4 combinations) - Stage 3) [DONE for california_housing] A figure for the selection of the most relevant subsets combination: train,dev vs train+dev,train+dev vs train,train+dev - Stage 4) A figure to finally compare the perf of our approach using the previous selected - parameters vs the baseline vs other papers using different extracted forest size - (percentage of the tree size found previously in best hyperparams search) on the abscissa. - - IMPORTANT: Compare experiments that used the same seeds among them (except for stage 1). - """ diff --git a/code/train.py b/code/train.py index 8a7ccebed390e318ad74019aed2e14d704fe67dd..1d75e98b9044165abb075a346761a910d8479a83 100644 --- a/code/train.py +++ b/code/train.py @@ -21,7 +21,7 @@ import numpy as np import shutil -def process_job(seed, parameters, experiment_id, hyperparameters): +def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verbose): """ Experiment function. 
@@ -34,7 +34,6 @@ def process_job(seed, parameters, experiment_id, hyperparameters): """ logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_ti{}'.format( seed, threading.get_ident())) - logger.info('seed={}'.format(seed)) seed_str = str(seed) experiment_id_str = str(experiment_id) @@ -55,13 +54,31 @@ def process_job(seed, parameters, experiment_id, hyperparameters): trainer = Trainer(dataset) if parameters['extraction_strategy'] != 'none': - for extracted_forest_size in parameters['extracted_forest_size']: - logger.info('extracted_forest_size={}'.format(extracted_forest_size)) - sub_models_dir = models_dir + os.sep + 'extracted_forest_sizes' + os.sep + str(extracted_forest_size) - pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True) + with tqdm_joblib(tqdm(total=len(parameters['extracted_forest_size']), disable=not verbose)) as extracted_forest_size_job_pb: + Parallel(n_jobs=-1)(delayed(extracted_forest_size_job)(extracted_forest_size_job_pb, parameters['extracted_forest_size'][i], + models_dir, seed, parameters, dataset, hyperparameters, experiment_id, trainer) + for i in range(len(parameters['extracted_forest_size']))) + else: + forest_size = hyperparameters['n_estimators'] + logger.info('Base forest training with fixed forest size of {}'.format(forest_size)) + sub_models_dir = models_dir + os.sep + 'forest_size' + os.sep + str(forest_size) + # Check if the result file already exists + already_exists = False + if os.path.isdir(sub_models_dir): + sub_models_dir_files = os.listdir(sub_models_dir) + for file_name in sub_models_dir_files: + if '.pickle' != os.path.splitext(file_name)[1]: + continue + else: + already_exists = os.path.getsize(os.path.join(sub_models_dir, file_name)) > 0 + break + if already_exists: + logger.info('Base forest result already exists. 
Skipping...') + else: + pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True) model_parameters = ModelParameters( - extracted_forest_size=extracted_forest_size, + extracted_forest_size=forest_size, normalize_D=parameters['normalize_D'], subsets_used=parameters['subsets_used'], normalize_weights=parameters['normalize_weights'], @@ -76,29 +93,50 @@ def process_job(seed, parameters, experiment_id, hyperparameters): trainer.init(model, subsets_used=parameters['subsets_used']) trainer.train(model) trainer.compute_results(model, sub_models_dir) - else: - forest_size = hyperparameters['n_estimators'] - logger.info('Base forest training with fixed forest size of {}'.format(forest_size)) - sub_models_dir = models_dir + os.sep + 'forest_size' + os.sep + str(forest_size) - pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True) - - model_parameters = ModelParameters( - extracted_forest_size=forest_size, - normalize_D=parameters['normalize_D'], - subsets_used=parameters['subsets_used'], - normalize_weights=parameters['normalize_weights'], - seed=seed, - hyperparameters=hyperparameters, - extraction_strategy=parameters['extraction_strategy'] - ) - model_parameters.save(sub_models_dir, experiment_id) - - model = ModelFactory.build(dataset.task, model_parameters) - - trainer.init(model, subsets_used=parameters['subsets_used']) - trainer.train(model) - trainer.compute_results(model, sub_models_dir) - logger.info('Training done') + logger.info(f'Training done for seed {seed_str}') + seed_job_pb.update(1) + +def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_size, models_dir, + seed, parameters, dataset, hyperparameters, experiment_id, trainer): + + logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_extracted_forest_size{}_ti{}'.format( + seed, extracted_forest_size, threading.get_ident())) + logger.info('extracted_forest_size={}'.format(extracted_forest_size)) + + sub_models_dir = models_dir + os.sep + 'extracted_forest_sizes' + os.sep + str(extracted_forest_size) + + # Check if the result file already exists + already_exists = False + if os.path.isdir(sub_models_dir): + sub_models_dir_files = os.listdir(sub_models_dir) + for file_name in sub_models_dir_files: + if '.pickle' != os.path.splitext(file_name)[1]: + continue + else: + already_exists = os.path.getsize(os.path.join(sub_models_dir, file_name)) > 0 + break + if already_exists: + logger.info(f'Extracted forest {extracted_forest_size} result already exists. 
Skipping...') + return + + pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True) + + model_parameters = ModelParameters( + extracted_forest_size=extracted_forest_size, + normalize_D=parameters['normalize_D'], + subsets_used=parameters['subsets_used'], + normalize_weights=parameters['normalize_weights'], + seed=seed, + hyperparameters=hyperparameters, + extraction_strategy=parameters['extraction_strategy'] + ) + model_parameters.save(sub_models_dir, experiment_id) + + model = ModelFactory.build(dataset.task, model_parameters) + + trainer.init(model, subsets_used=parameters['subsets_used']) + trainer.train(model) + trainer.compute_results(model, sub_models_dir) """ Command lines example for stage 1: @@ -138,6 +176,7 @@ if __name__ == "__main__": DEFAULT_SKIP_BEST_HYPERPARAMS = False DEFAULT_JOB_NUMBER = -1 DEFAULT_EXTRACTION_STRATEGY = 'omp' + DEFAULT_OVERWRITE = False begin_random_seed_range = 1 end_random_seed_range = 2000 @@ -163,7 +202,8 @@ if __name__ == "__main__": parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters if there exist.') parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line in a file. Args: {{stage_num}} {{name}}') parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of job used during the parallelisation across seeds.') - parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none or similarity.') + parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. 
Either omp, random, none, similarity, kmeans.') + parser.add_argument('--overwrite', action='store_true', default=DEFAULT_OVERWRITE, help='Overwrite the experiment id') args = parser.parse_args() if args.experiment_configuration: @@ -173,7 +213,7 @@ if __name__ == "__main__": else: parameters = args.__dict__ - if parameters['extraction_strategy'] not in ['omp', 'random', 'none', 'similarity']: + if parameters['extraction_strategy'] not in ['omp', 'random', 'none', 'similarity', 'kmeans']: raise ValueError('Specified extraction strategy {} is not supported.'.format(parameters.extraction_strategy)) pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True) @@ -220,7 +260,8 @@ if __name__ == "__main__": if args.experiment_id: experiment_id = args.experiment_id - shutil.rmtree(os.path.join(parameters['models_dir'], str(experiment_id)), ignore_errors=True) + if args.overwrite: + shutil.rmtree(os.path.join(parameters['models_dir'], str(experiment_id)), ignore_errors=True) else: # Resolve the next experiment id number (last id + 1) experiment_id = resolve_experiment_id(parameters['models_dir']) @@ -255,6 +296,6 @@ if __name__ == "__main__": ) # Run as much job as there are seeds - with tqdm_joblib(tqdm(total=len(seeds), disable=not args.verbose)) as progress_bar: - Parallel(n_jobs=args.job_number)(delayed(process_job)(seeds[i], - parameters, experiment_id, hyperparameters) for i in range(len(seeds))) + with tqdm_joblib(tqdm(total=len(seeds), disable=not args.verbose)) as seed_job_pb: + Parallel(n_jobs=args.job_number)(delayed(seed_job)(seed_job_pb, seeds[i], + parameters, experiment_id, hyperparameters, args.verbose) for i in range(len(seeds)))