diff --git a/code/bolsonaro/models/kmeans_forest_regressor.py b/code/bolsonaro/models/kmeans_forest_regressor.py index dc82b3b03c02c23a05e0cb41e0adf8ae1d9f8416..a1a3dee940844a1e48a5fbd5df416bdea6eae903 100644 --- a/code/bolsonaro/models/kmeans_forest_regressor.py +++ b/code/bolsonaro/models/kmeans_forest_regressor.py @@ -16,75 +16,63 @@ class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta): On extreme pruning of random forest ensembles for ral-time predictive applications', by Khaled Fawagreh, Mohamed Medhat Gaber and Eyad Elyan. """ - def __init__(self, models_parameters): + def __init__(self, models_parameters, score_metric=mean_squared_error): self._models_parameters = models_parameters - self._regressor = RandomForestRegressor(n_estimators=self._models_parameters.hyperparameters['n_estimators'], - random_state=models_parameters.seed, n_jobs=-1) + self._estimator = RandomForestRegressor(**self._models_parameters.hyperparameters, + random_state=self._models_parameters.seed, n_jobs=-1) self._extracted_forest_size = self._models_parameters.extracted_forest_size + self._score_metric = score_metric @property def models_parameters(self): return self._models_parameters - def fit(self, X_train, y_train, X_val, y_val, score_metric=mean_squared_error): - self._regressor.fit(X_train, y_train) + def fit(self, X_train, y_train, X_val, y_val): + self._estimator.fit(X_train, y_train) predictions = list() - for tree in self._regressor.estimators_: + for tree in self._estimator.estimators_: predictions.append(tree.predict(X_train)) predictions = np.array(predictions) kmeans = KMeans(n_clusters=self._extracted_forest_size, random_state=self._models_parameters.seed).fit(predictions) labels = np.array(kmeans.labels_) - # for each cluster select the best tree on the validation set - - """ - pruned_forest = list() - for c in range(self._extracted_forest_size): - index = np.where(labels == c)[0] - cluster = list() - for i in index: - y_val_pred = 
self._regressor.estimators_[i].predict(X_val) - tree_pred = score_metric(y_val, y_val_pred) - cluster.append(tree_pred) - best_tree_index = np.argmax(cluster) - pruned_forest.append(self._regressor.estimators_[index[best_tree_index]])""" - + # For each cluster select the best tree on the validation set extracted_forest_sizes = list(range(self._extracted_forest_size)) - with tqdm_joblib(tqdm(total=self._extracted_forest_size, disable=False)) as prune_forest_job_pb: + with tqdm_joblib(tqdm(total=self._extracted_forest_size, disable=True)) as prune_forest_job_pb: pruned_forest = Parallel(n_jobs=-1)(delayed(self._prune_forest_job)(prune_forest_job_pb, - extracted_forest_sizes[i], labels, X_val, y_val, score_metric) + extracted_forest_sizes[i], labels, X_val, y_val, self._score_metric) for i in range(self._extracted_forest_size)) - self._regressor.estimators_ = pruned_forest + self._estimator.estimators_ = pruned_forest def _prune_forest_job(self, prune_forest_job_pb, c, labels, X_val, y_val, score_metric): index = np.where(labels == c)[0] - with tqdm_joblib(tqdm(total=len(index), disable=False)) as cluster_job_pb: + with tqdm_joblib(tqdm(total=len(index), disable=True)) as cluster_job_pb: cluster = Parallel(n_jobs=-1)(delayed(self._cluster_job)(cluster_job_pb, index[i], X_val, y_val, score_metric) for i in range(len(index))) best_tree_index = np.argmax(cluster) prune_forest_job_pb.update() - return self._regressor.estimators_[index[best_tree_index]] + return self._estimator.estimators_[index[best_tree_index]] def _cluster_job(self, cluster_job_pb, i, X_val, y_val, score_metric): - y_val_pred = self._regressor.estimators_[i].predict(X_val) + y_val_pred = self._estimator.estimators_[i].predict(X_val) tree_pred = score_metric(y_val, y_val_pred) cluster_job_pb.update() return tree_pred def predict(self, X): - return self._regressor.predict(X) + return self._estimator.predict(X) def score(self, X, y): predictions = list() - for tree in self._regressor.estimators_: + for 
tree in self._estimator.estimators_: predictions.append(tree.predict(X)) predictions = np.array(predictions) mean_predictions = np.mean(predictions, axis=0) - score = mean_squared_error(mean_predictions, y) + score = self._score_metric(mean_predictions, y) return score def predict_base_estimator(self, X): - return self._regressor.predict(X) + return self._estimator.predict(X) diff --git a/code/bolsonaro/models/similarity_forest_regressor.py b/code/bolsonaro/models/similarity_forest_regressor.py index 8d8b5a16ff996f2a79502d23b9b677ce923f64bb..bbdb147bb58945b3f8ffa22181cd6db1d31bb732 100644 --- a/code/bolsonaro/models/similarity_forest_regressor.py +++ b/code/bolsonaro/models/similarity_forest_regressor.py @@ -1,8 +1,12 @@ +from bolsonaro.utils import tqdm_joblib + from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import mean_squared_error from sklearn.base import BaseEstimator from abc import abstractmethod, ABCMeta import numpy as np +from joblib import Parallel, delayed +from tqdm import tqdm class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta): @@ -10,24 +14,25 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta): https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2822360/ """ - def __init__(self, models_parameters): + def __init__(self, models_parameters, score_metric=mean_squared_error): self._models_parameters = models_parameters - self._regressor = RandomForestRegressor(n_estimators=self._models_parameters.hyperparameters['n_estimators'], - random_state=models_parameters.seed) + self._estimator = RandomForestRegressor(**self._models_parameters.hyperparameters, + random_state=self._models_parameters.seed, n_jobs=-1) self._extracted_forest_size = self._models_parameters.extracted_forest_size + self._score_metric = score_metric @property def models_parameters(self): return self._models_parameters - def fit(self, X_train, y_train, X_val, y_val, score_metric=mean_squared_error): - self._regressor.fit(X_train, y_train) + 
def fit(self, X_train, y_train, X_val, y_val): + self._estimator.fit(X_train, y_train) - y_val_pred = self._regressor.predict(X_val) - forest_pred = score_metric(y_val, y_val_pred) - forest = self._regressor.estimators_ + y_val_pred = self._estimator.predict(X_val) + forest_pred = self._score_metric(y_val, y_val_pred) + forest = self._estimator.estimators_ selected_trees = list() - tree_list = list(self._regressor.estimators_) + tree_list = list(self._estimator.estimators_) for _ in range(self._extracted_forest_size): @@ -35,13 +40,14 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta): for i in range(len(tree_list)): lonely_tree = tree_list[i] del tree_list[i] - val_list = list() - for tree in tree_list: - val_pred = tree.predict(X_val) - val_list.append(val_pred) + # evaluate each remaining tree on the validation set in parallel + with tqdm_joblib(tqdm(total=len(tree_list), disable=True)) as job_pb: + val_list = Parallel(n_jobs=-1)(delayed(self._tree_predict_job)( + job_pb, tree_list[i], X_val) + for i in range(len(tree_list))) val_list = np.array(val_list) val_mean = np.mean(val_list, axis=0) - val_score = score_metric(val_mean, y_val) + val_score = self._score_metric(val_mean, y_val) temp_similarity = abs(forest_pred - val_score) if (temp_similarity < best_similarity): found_index = i @@ -51,17 +57,21 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta): del tree_list[found_index] pruned_forest = list(set(forest) - set(selected_trees)) - self._regressor.estimators_ = pruned_forest + self._estimator.estimators_ = pruned_forest + + def _tree_predict_job(self, job_pb, tree, X_val): + val_pred = tree.predict(X_val) + return val_pred def score(self, X, y): test_list = list() - for mod in self._regressor.estimators_: + for mod in self._estimator.estimators_: test_pred = mod.predict(X) test_list.append(test_pred) test_list = np.array(test_list) test_mean = np.mean(test_list, axis=0) - score = 
self._score_metric(test_mean, y) return score def predict_base_estimator(self, X): - return self._regressor.predict(X) + return self._estimator.predict(X) diff --git a/code/compute_results.py b/code/compute_results.py index 408a76a2b1922b1b272c48e1370d76b8aa40c8e6..01d710e0c3a05f792ced181be62e13c9091171ac 100644 --- a/code/compute_results.py +++ b/code/compute_results.py @@ -412,7 +412,7 @@ if __name__ == "__main__": raise ValueError('Score metrics of all experiments must be the same.') experiments_score_metric = base_with_params_experiment_score_metric - output_path = os.path.join(args.results_dir, args.dataset_name, 'stage4') + output_path = os.path.join(args.results_dir, args.dataset_name, 'stage5_kmeans') pathlib.Path(output_path).mkdir(parents=True, exist_ok=True) Plotter.plot_stage2_losses( diff --git a/code/train.py b/code/train.py index 7df811d20c1413004413a1a2d9c151fea8a1ad8d..66761abc0777eb868fa86ae5f1272db79f11a451 100644 --- a/code/train.py +++ b/code/train.py @@ -167,6 +167,7 @@ if __name__ == "__main__": DEFAULT_SKIP_BEST_HYPERPARAMS = False DEFAULT_JOB_NUMBER = -1 DEFAULT_EXTRACTION_STRATEGY = 'omp' + DEFAULT_OVERWRITE = False begin_random_seed_range = 1 end_random_seed_range = 2000 @@ -193,6 +194,7 @@ if __name__ == "__main__": parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line in a file. Args: {{stage_num}} {{name}}') parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of job used during the parallelisation across seeds.') parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. 
Either omp, random, none, similarity, kmeans.') + parser.add_argument('--overwrite', action='store_true', default=DEFAULT_OVERWRITE, help='Overwrite the experiment id') args = parser.parse_args() if args.experiment_configuration: @@ -249,7 +251,8 @@ if __name__ == "__main__": if args.experiment_id: experiment_id = args.experiment_id - #shutil.rmtree(os.path.join(parameters['models_dir'], str(experiment_id)), ignore_errors=True) + if args.overwrite: + shutil.rmtree(os.path.join(parameters['models_dir'], str(experiment_id)), ignore_errors=True) else: # Resolve the next experiment id number (last id + 1) experiment_id = resolve_experiment_id(parameters['models_dir'])