Skip to content
Snippets Groups Projects
Commit 0a97ff64 authored by Charly Lamothe's avatar Charly Lamothe
Browse files

Fix parallelization and estimator default hyperparams in kmeans and similarity...

Fix parallelization and estimator default hyperparams in kmeans and similarity methods. Fix on resume mode in train.py. Fix stage5 saving (tmp) in compute_results.py
parent be5bc24a
Branches
No related tags found
1 merge request!12Resolve "integration-sota"
This commit is part of merge request !12. Comments created here will be created in the context of that merge request.
...@@ -16,75 +16,63 @@ class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta): ...@@ -16,75 +16,63 @@ class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
On extreme pruning of random forest ensembles for real-time predictive applications', by Khaled Fawagreh, Mohamed Medhat Gaber and Eyad Elyan.
""" """
def __init__(self, models_parameters, score_metric=mean_squared_error):
    """Forest pruner based on k-means clustering of per-tree predictions.

    :param models_parameters: object exposing ``hyperparameters`` (dict of
        RandomForestRegressor keyword args), ``seed`` and
        ``extracted_forest_size`` — assumed from usage below, TODO confirm.
    :param score_metric: callable(y_true, y_pred) used to rank trees inside
        each cluster and in :meth:`score`; defaults to mean_squared_error.
    """
    self._models_parameters = models_parameters
    # Forward every configured hyperparameter; pin the seed for
    # reproducibility and use all cores for the base-forest fit.
    # NOTE(review): if ``hyperparameters`` itself contains ``random_state``
    # or ``n_jobs`` this raises a duplicate-keyword TypeError — confirm.
    self._estimator = RandomForestRegressor(**self._models_parameters.hyperparameters,
                                            random_state=self._models_parameters.seed, n_jobs=-1)
    self._extracted_forest_size = self._models_parameters.extracted_forest_size
    self._score_metric = score_metric
@property
def models_parameters(self):
    """Read-only access to the parameters this estimator was built from."""
    return self._models_parameters
def fit(self, X_train, y_train, X_val, y_val):
    """Fit the full forest, then prune it to ``extracted_forest_size`` trees.

    Each tree's training-set prediction vector is clustered with k-means
    (one cluster per tree to keep); within each cluster a single tree is
    retained based on its validation score.

    :param X_train: training inputs for the base forest and the clustering.
    :param y_train: training targets.
    :param X_val: validation inputs used to pick one tree per cluster.
    :param y_val: validation targets.
    """
    self._estimator.fit(X_train, y_train)
    # One prediction vector per tree: trees are clustered by how they predict.
    predictions = np.array([tree.predict(X_train)
                            for tree in self._estimator.estimators_])
    kmeans = KMeans(n_clusters=self._extracted_forest_size,
                    random_state=self._models_parameters.seed).fit(predictions)
    labels = np.array(kmeans.labels_)
    # For each cluster select the best tree on the validation set, one
    # joblib job per cluster (progress bar kept but disabled).
    extracted_forest_sizes = list(range(self._extracted_forest_size))
    with tqdm_joblib(tqdm(total=self._extracted_forest_size, disable=True)) as prune_forest_job_pb:
        pruned_forest = Parallel(n_jobs=-1)(delayed(self._prune_forest_job)(prune_forest_job_pb,
            extracted_forest_sizes[i], labels, X_val, y_val, self._score_metric)
            for i in range(self._extracted_forest_size))
    # Replace the fitted forest by the pruned subset in place.
    self._estimator.estimators_ = pruned_forest
def _prune_forest_job(self, prune_forest_job_pb, c, labels, X_val, y_val, score_metric):
    """Return the tree of cluster ``c`` with the highest validation metric.

    :param prune_forest_job_pb: tqdm progress bar shared across cluster jobs.
    :param c: cluster label to process.
    :param labels: k-means label of every tree of the forest.
    :param X_val: validation inputs.
    :param y_val: validation targets.
    :param score_metric: callable(y_true, y_pred) -> float.
    """
    index = np.where(labels == c)[0]
    with tqdm_joblib(tqdm(total=len(index), disable=True)) as cluster_job_pb:
        cluster = Parallel(n_jobs=-1)(delayed(self._cluster_job)(cluster_job_pb, index[i], X_val,
            y_val, score_metric) for i in range(len(index)))
    # NOTE(review): argmax keeps the tree with the LARGEST metric value; for
    # an error metric such as the default mean_squared_error this selects the
    # worst tree of the cluster — confirm argmin was not intended.
    best_tree_index = np.argmax(cluster)
    prune_forest_job_pb.update()
    return self._estimator.estimators_[index[best_tree_index]]
def _cluster_job(self, cluster_job_pb, i, X_val, y_val, score_metric): def _cluster_job(self, cluster_job_pb, i, X_val, y_val, score_metric):
y_val_pred = self._regressor.estimators_[i].predict(X_val) y_val_pred = self._estimator.estimators_[i].predict(X_val)
tree_pred = score_metric(y_val, y_val_pred) tree_pred = score_metric(y_val, y_val_pred)
cluster_job_pb.update() cluster_job_pb.update()
return tree_pred return tree_pred
def predict(self, X):
    """Predict with the underlying (possibly pruned) random forest."""
    return self._estimator.predict(X)
def score(self, X, y):
    """Score the pruned forest: metric of the mean of per-tree predictions.

    Averaging the retained trees by hand (instead of calling
    ``self._estimator.predict``) keeps the result consistent after
    ``estimators_`` has been replaced by the pruned subset.

    NOTE(review): the metric is called as ``metric(predictions, y)`` while
    the rest of the class uses ``metric(y_true, y_pred)`` — harmless for the
    symmetric default MSE, confirm for other metrics.

    :param X: inputs.
    :param y: targets.
    :return: ``self._score_metric`` applied to the mean prediction.
    """
    predictions = np.array([tree.predict(X)
                            for tree in self._estimator.estimators_])
    mean_predictions = np.mean(predictions, axis=0)
    return self._score_metric(mean_predictions, y)
def predict_base_estimator(self, X):
    """Predict with the underlying forest (same estimator as ``predict``)."""
    return self._estimator.predict(X)
from abc import abstractmethod, ABCMeta

import numpy as np
from joblib import Parallel, delayed
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from tqdm import tqdm

from bolsonaro.utils import tqdm_joblib
class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta): class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
...@@ -10,24 +14,25 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta): ...@@ -10,24 +14,25 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2822360/ https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2822360/
""" """
def __init__(self, models_parameters, score_metric=mean_squared_error):
    """Forest pruner based on prediction-similarity search.

    :param models_parameters: object exposing ``hyperparameters`` (dict of
        RandomForestRegressor keyword args), ``seed`` and
        ``extracted_forest_size`` — assumed from usage below, TODO confirm.
    :param score_metric: callable(y_true, y_pred) used to compare sub-forest
        predictions against the full forest; defaults to mean_squared_error.
    """
    self._models_parameters = models_parameters
    # Forward every configured hyperparameter; pin the seed for
    # reproducibility and use all cores for the base-forest fit.
    # NOTE(review): duplicate ``random_state``/``n_jobs`` keys inside
    # ``hyperparameters`` would raise a TypeError here — confirm.
    self._estimator = RandomForestRegressor(**self._models_parameters.hyperparameters,
                                            random_state=self._models_parameters.seed, n_jobs=-1)
    self._extracted_forest_size = self._models_parameters.extracted_forest_size
    self._score_metric = score_metric
@property
def models_parameters(self):
    """Read-only access to the parameters this estimator was built from."""
    return self._models_parameters
def fit(self, X_train, y_train, X_val, y_val, score_metric=mean_squared_error): def fit(self, X_train, y_train, X_val, y_val):
self._regressor.fit(X_train, y_train) self._estimator.fit(X_train, y_train)
y_val_pred = self._regressor.predict(X_val) y_val_pred = self._estimator.predict(X_val)
forest_pred = score_metric(y_val, y_val_pred) forest_pred = self._score_metric(y_val, y_val_pred)
forest = self._regressor.estimators_ forest = self._estimator.estimators_
selected_trees = list() selected_trees = list()
tree_list = list(self._regressor.estimators_) tree_list = list(self._estimator.estimators_)
for _ in range(self._extracted_forest_size): for _ in range(self._extracted_forest_size):
best_similarity = 100000 best_similarity = 100000
...@@ -35,13 +40,14 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta): ...@@ -35,13 +40,14 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
for i in range(len(tree_list)): for i in range(len(tree_list)):
lonely_tree = tree_list[i] lonely_tree = tree_list[i]
del tree_list[i] del tree_list[i]
val_list = list() begin_time = time.time()
for tree in tree_list: with tqdm_joblib(tqdm(total=len(tree_list), disable=True)) as job_pb:
val_pred = tree.predict(X_val) val_list = Parallel(n_jobs=-1)(delayed(self._tree_predict_job)(
val_list.append(val_pred) job_pb, tree_list[i], X_val)
for i in range(len(tree_list)))
val_list = np.array(val_list) val_list = np.array(val_list)
val_mean = np.mean(val_list, axis=0) val_mean = np.mean(val_list, axis=0)
val_score = score_metric(val_mean, y_val) val_score = self._score_metric(val_mean, y_val)
temp_similarity = abs(forest_pred - val_score) temp_similarity = abs(forest_pred - val_score)
if (temp_similarity < best_similarity): if (temp_similarity < best_similarity):
found_index = i found_index = i
...@@ -51,17 +57,21 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta): ...@@ -51,17 +57,21 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
del tree_list[found_index] del tree_list[found_index]
pruned_forest = list(set(forest) - set(selected_trees)) pruned_forest = list(set(forest) - set(selected_trees))
self._regressor.estimators_ = pruned_forest self._estimator.estimators_ = pruned_forest
def _tree_predict_job(self, job_pb, tree, X_val):
val_pred = tree.predict(X_val)
return val_pred
def score(self, X, y):
    """Score the pruned forest: metric of the mean of per-tree predictions.

    NOTE(review): the metric is called as ``metric(mean_prediction, y)``,
    argument order reversed w.r.t. the ``metric(y_true, y_pred)`` convention
    used in ``fit`` — harmless for the symmetric default MSE, confirm for
    other metrics.

    :param X: inputs.
    :param y: targets.
    :return: ``self._score_metric`` applied to the mean prediction.
    """
    per_tree = np.array([member.predict(X)
                         for member in self._estimator.estimators_])
    averaged = np.mean(per_tree, axis=0)
    return self._score_metric(averaged, y)
def predict_base_estimator(self, X):
    """Predict with the underlying forest (same estimator as the pruned one)."""
    return self._estimator.predict(X)
...@@ -412,7 +412,7 @@ if __name__ == "__main__": ...@@ -412,7 +412,7 @@ if __name__ == "__main__":
raise ValueError('Score metrics of all experiments must be the same.') raise ValueError('Score metrics of all experiments must be the same.')
experiments_score_metric = base_with_params_experiment_score_metric experiments_score_metric = base_with_params_experiment_score_metric
output_path = os.path.join(args.results_dir, args.dataset_name, 'stage4') output_path = os.path.join(args.results_dir, args.dataset_name, 'stage5_kmeans')
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True) pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
Plotter.plot_stage2_losses( Plotter.plot_stage2_losses(
......
...@@ -167,6 +167,7 @@ if __name__ == "__main__": ...@@ -167,6 +167,7 @@ if __name__ == "__main__":
DEFAULT_SKIP_BEST_HYPERPARAMS = False DEFAULT_SKIP_BEST_HYPERPARAMS = False
DEFAULT_JOB_NUMBER = -1 DEFAULT_JOB_NUMBER = -1
DEFAULT_EXTRACTION_STRATEGY = 'omp' DEFAULT_EXTRACTION_STRATEGY = 'omp'
DEFAULT_OVERWRITE = False
begin_random_seed_range = 1 begin_random_seed_range = 1
end_random_seed_range = 2000 end_random_seed_range = 2000
...@@ -193,6 +194,7 @@ if __name__ == "__main__": ...@@ -193,6 +194,7 @@ if __name__ == "__main__":
parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line in a file. Args: {{stage_num}} {{name}}') parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line in a file. Args: {{stage_num}} {{name}}')
parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of job used during the parallelisation across seeds.') parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of job used during the parallelisation across seeds.')
parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none, similarity, kmeans.') parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none, similarity, kmeans.')
parser.add_argument('--overwrite', action='store_true', default=DEFAULT_OVERWRITE, help='Overwrite the experiment id')
args = parser.parse_args() args = parser.parse_args()
if args.experiment_configuration: if args.experiment_configuration:
...@@ -249,7 +251,8 @@ if __name__ == "__main__": ...@@ -249,7 +251,8 @@ if __name__ == "__main__":
if args.experiment_id: if args.experiment_id:
experiment_id = args.experiment_id experiment_id = args.experiment_id
#shutil.rmtree(os.path.join(parameters['models_dir'], str(experiment_id)), ignore_errors=True) if args.overwrite:
shutil.rmtree(os.path.join(parameters['models_dir'], str(experiment_id)), ignore_errors=True)
else: else:
# Resolve the next experiment id number (last id + 1) # Resolve the next experiment id number (last id + 1)
experiment_id = resolve_experiment_id(parameters['models_dir']) experiment_id = resolve_experiment_id(parameters['models_dir'])
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment