Commit 0a97ff64 authored by Charly Lamothe's avatar Charly Lamothe
Browse files

Fix parallelization and estimator default hyperparams in kmeans and similarity...

Fix parallelization and estimator default hyperparams in kmeans and similarity methods. Fix on resume mode in train.py. Fix stage5 saving (tmp) in compute_results.py
parent be5bc24a
......@@ -16,75 +16,63 @@ class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
On extreme pruning of random forest ensembles for ral-time predictive applications', by Khaled Fawagreh, Mohamed Medhat Gaber and Eyad Elyan.
"""
def __init__(self, models_parameters):
def __init__(self, models_parameters, score_metric=mean_squared_error):
self._models_parameters = models_parameters
self._regressor = RandomForestRegressor(n_estimators=self._models_parameters.hyperparameters['n_estimators'],
random_state=models_parameters.seed, n_jobs=-1)
self._estimator = RandomForestRegressor(**self._models_parameters.hyperparameters,
random_state=self._models_parameters.seed, n_jobs=-1)
self._extracted_forest_size = self._models_parameters.extracted_forest_size
self._score_metric = score_metric
@property
def models_parameters(self):
return self._models_parameters
def fit(self, X_train, y_train, X_val, y_val, score_metric=mean_squared_error):
self._regressor.fit(X_train, y_train)
def fit(self, X_train, y_train, X_val, y_val):
self._estimator.fit(X_train, y_train)
predictions = list()
for tree in self._regressor.estimators_:
for tree in self._estimator.estimators_:
predictions.append(tree.predict(X_train))
predictions = np.array(predictions)
kmeans = KMeans(n_clusters=self._extracted_forest_size, random_state=self._models_parameters.seed).fit(predictions)
labels = np.array(kmeans.labels_)
# for each cluster select the best tree on the validation set
"""
pruned_forest = list()
for c in range(self._extracted_forest_size):
index = np.where(labels == c)[0]
cluster = list()
for i in index:
y_val_pred = self._regressor.estimators_[i].predict(X_val)
tree_pred = score_metric(y_val, y_val_pred)
cluster.append(tree_pred)
best_tree_index = np.argmax(cluster)
pruned_forest.append(self._regressor.estimators_[index[best_tree_index]])"""
# For each cluster select the best tree on the validation set
extracted_forest_sizes = list(range(self._extracted_forest_size))
with tqdm_joblib(tqdm(total=self._extracted_forest_size, disable=False)) as prune_forest_job_pb:
with tqdm_joblib(tqdm(total=self._extracted_forest_size, disable=True)) as prune_forest_job_pb:
pruned_forest = Parallel(n_jobs=-1)(delayed(self._prune_forest_job)(prune_forest_job_pb,
extracted_forest_sizes[i], labels, X_val, y_val, score_metric)
extracted_forest_sizes[i], labels, X_val, y_val, self._score_metric)
for i in range(self._extracted_forest_size))
self._regressor.estimators_ = pruned_forest
self._estimator.estimators_ = pruned_forest
def _prune_forest_job(self, prune_forest_job_pb, c, labels, X_val, y_val, score_metric):
index = np.where(labels == c)[0]
with tqdm_joblib(tqdm(total=len(index), disable=False)) as cluster_job_pb:
with tqdm_joblib(tqdm(total=len(index), disable=True)) as cluster_job_pb:
cluster = Parallel(n_jobs=-1)(delayed(self._cluster_job)(cluster_job_pb, index[i], X_val,
y_val, score_metric) for i in range(len(index)))
best_tree_index = np.argmax(cluster)
prune_forest_job_pb.update()
return self._regressor.estimators_[index[best_tree_index]]
return self._estimator.estimators_[index[best_tree_index]]
def _cluster_job(self, cluster_job_pb, i, X_val, y_val, score_metric):
y_val_pred = self._regressor.estimators_[i].predict(X_val)
y_val_pred = self._estimator.estimators_[i].predict(X_val)
tree_pred = score_metric(y_val, y_val_pred)
cluster_job_pb.update()
return tree_pred
def predict(self, X):
return self._regressor.predict(X)
return self._estimator.predict(X)
def score(self, X, y):
predictions = list()
for tree in self._regressor.estimators_:
for tree in self._estimator.estimators_:
predictions.append(tree.predict(X))
predictions = np.array(predictions)
mean_predictions = np.mean(predictions, axis=0)
score = mean_squared_error(mean_predictions, y)
score = self._score_metric(mean_predictions, y)
return score
def predict_base_estimator(self, X):
return self._regressor.predict(X)
return self._estimator.predict(X)
from bolsonaro.utils import tqdm_joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator
from abc import abstractmethod, ABCMeta
import numpy as np
from joblib import Parallel, delayed
from tqdm import tqdm
class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
......@@ -10,24 +14,25 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2822360/
"""
def __init__(self, models_parameters):
def __init__(self, models_parameters, score_metric=mean_squared_error):
self._models_parameters = models_parameters
self._regressor = RandomForestRegressor(n_estimators=self._models_parameters.hyperparameters['n_estimators'],
random_state=models_parameters.seed)
self._estimator = RandomForestRegressor(**self._models_parameters.hyperparameters,
random_state=self._models_parameters.seed, n_jobs=-1)
self._extracted_forest_size = self._models_parameters.extracted_forest_size
self._score_metric = score_metric
@property
def models_parameters(self):
return self._models_parameters
def fit(self, X_train, y_train, X_val, y_val, score_metric=mean_squared_error):
self._regressor.fit(X_train, y_train)
def fit(self, X_train, y_train, X_val, y_val):
self._estimator.fit(X_train, y_train)
y_val_pred = self._regressor.predict(X_val)
forest_pred = score_metric(y_val, y_val_pred)
forest = self._regressor.estimators_
y_val_pred = self._estimator.predict(X_val)
forest_pred = self._score_metric(y_val, y_val_pred)
forest = self._estimator.estimators_
selected_trees = list()
tree_list = list(self._regressor.estimators_)
tree_list = list(self._estimator.estimators_)
for _ in range(self._extracted_forest_size):
best_similarity = 100000
......@@ -35,13 +40,14 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
for i in range(len(tree_list)):
lonely_tree = tree_list[i]
del tree_list[i]
val_list = list()
for tree in tree_list:
val_pred = tree.predict(X_val)
val_list.append(val_pred)
begin_time = time.time()
with tqdm_joblib(tqdm(total=len(tree_list), disable=True)) as job_pb:
val_list = Parallel(n_jobs=-1)(delayed(self._tree_predict_job)(
job_pb, tree_list[i], X_val)
for i in range(len(tree_list)))
val_list = np.array(val_list)
val_mean = np.mean(val_list, axis=0)
val_score = score_metric(val_mean, y_val)
val_score = self._score_metric(val_mean, y_val)
temp_similarity = abs(forest_pred - val_score)
if (temp_similarity < best_similarity):
found_index = i
......@@ -51,17 +57,21 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
del tree_list[found_index]
pruned_forest = list(set(forest) - set(selected_trees))
self._regressor.estimators_ = pruned_forest
self._estimator.estimators_ = pruned_forest
def _tree_predict_job(self, job_pb, tree, X_val):
val_pred = tree.predict(X_val)
return val_pred
def score(self, X, y):
test_list = list()
for mod in self._regressor.estimators_:
for mod in self._estimator.estimators_:
test_pred = mod.predict(X)
test_list.append(test_pred)
test_list = np.array(test_list)
test_mean = np.mean(test_list, axis=0)
score = mean_squared_error(test_mean, y)
score = self._score_metric(test_mean, y)
return score
def predict_base_estimator(self, X):
return self._regressor.predict(X)
return self._estimator.predict(X)
......@@ -412,7 +412,7 @@ if __name__ == "__main__":
raise ValueError('Score metrics of all experiments must be the same.')
experiments_score_metric = base_with_params_experiment_score_metric
output_path = os.path.join(args.results_dir, args.dataset_name, 'stage4')
output_path = os.path.join(args.results_dir, args.dataset_name, 'stage5_kmeans')
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
Plotter.plot_stage2_losses(
......
......@@ -167,6 +167,7 @@ if __name__ == "__main__":
DEFAULT_SKIP_BEST_HYPERPARAMS = False
DEFAULT_JOB_NUMBER = -1
DEFAULT_EXTRACTION_STRATEGY = 'omp'
DEFAULT_OVERWRITE = False
begin_random_seed_range = 1
end_random_seed_range = 2000
......@@ -193,6 +194,7 @@ if __name__ == "__main__":
parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line in a file. Args: {{stage_num}} {{name}}')
parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of job used during the parallelisation across seeds.')
parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none, similarity, kmeans.')
parser.add_argument('--overwrite', action='store_true', default=DEFAULT_OVERWRITE, help='Overwrite the experiment id')
args = parser.parse_args()
if args.experiment_configuration:
......@@ -249,7 +251,8 @@ if __name__ == "__main__":
if args.experiment_id:
experiment_id = args.experiment_id
#shutil.rmtree(os.path.join(parameters['models_dir'], str(experiment_id)), ignore_errors=True)
if args.overwrite:
shutil.rmtree(os.path.join(parameters['models_dir'], str(experiment_id)), ignore_errors=True)
else:
# Resolve the next experiment id number (last id + 1)
experiment_id = resolve_experiment_id(parameters['models_dir'])
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment