Skip to content
Snippets Groups Projects
Commit 59e65276 authored by Charly Lamothe's avatar Charly Lamothe
Browse files

Finish adding the similarity method to the pipeline. Add the k-means pruning method

parent 28c3d874
No related branches found
No related tags found
1 merge request!12Resolve "integration-sota"
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator
from sklearn.cluster import KMeans
from abc import abstractmethod, ABCMeta
import numpy as np
from scipy.stats import mode
class KMeansForestRegressor(BaseEstimator, metaclass=ABCMeta):
    """
    Forest pruning by k-means clustering of the trees' predictions, keeping
    the single best tree of each cluster.

    Reference: 'On extreme pruning of random forest ensembles for real-time
    predictive applications', by Khaled Fawagreh, Mohamed Medhat Gaber and
    Eyad Elyan.
    """

    def __init__(self, models_parameters):
        # models_parameters is a project ModelParameters object exposing
        # hyperparameters (dict), seed and extracted_forest_size.
        self._models_parameters = models_parameters
        self._regressor = RandomForestRegressor(
            n_estimators=self._models_parameters.hyperparameters['n_estimators'],
            random_state=models_parameters.seed)
        self._extracted_forest_size = self._models_parameters.extracted_forest_size

    @property
    def models_parameters(self):
        return self._models_parameters

    def fit(self, X_train, y_train, X_val, y_val, score_metric=mean_squared_error):
        """
        Fit the full forest, then prune it down to extracted_forest_size trees.

        Trees are clustered (k-means) on their train-set prediction vectors;
        within each cluster, the tree with the lowest validation error is kept.

        NOTE(review): score_metric is assumed to be a loss (lower is better),
        matching the mean_squared_error default.
        """
        self._regressor.fit(X_train, y_train)

        # One row per tree: its prediction vector on the training set.
        predictions = np.array([tree.predict(X_train)
                                for tree in self._regressor.estimators_])

        kmeans = KMeans(n_clusters=self._extracted_forest_size,
                        random_state=self._models_parameters.seed).fit(predictions)
        labels = np.array(kmeans.labels_)

        # For each cluster, select the best tree on the validation set.
        pruned_forest = list()
        for c in range(self._extracted_forest_size):
            index = np.where(labels == c)[0]
            cluster = [score_metric(y_val, self._regressor.estimators_[i].predict(X_val))
                       for i in index]
            # BUGFIX: the scores are losses, so the best tree is the one that
            # minimizes them; the original np.argmax kept the WORST tree of
            # each cluster.
            best_tree_index = np.argmin(cluster)
            pruned_forest.append(self._regressor.estimators_[index[best_tree_index]])
        self._regressor.estimators_ = pruned_forest

    def predict(self, X):
        """Predict with the pruned forest (call after fit)."""
        return self._regressor.predict(X)

    def score(self, X, y):
        """Return the MSE of the pruned forest's averaged tree predictions."""
        predictions = np.array([tree.predict(X)
                                for tree in self._regressor.estimators_])
        mean_predictions = np.mean(predictions, axis=0)
        score = mean_squared_error(mean_predictions, y)
        return score

    def predict_base_estimator(self, X):
        """Predict with the underlying sklearn forest directly."""
        return self._regressor.predict(X)
......@@ -2,6 +2,7 @@ from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, Om
from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
from bolsonaro.models.model_parameters import ModelParameters
from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor
from bolsonaro.data.task import Task
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
......@@ -22,9 +23,11 @@ class ModelFactory(object):
elif model_parameters.extraction_strategy == 'random':
return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size,
random_state=model_parameters.seed)
else:
elif model_parameters.extraction_strategy == 'none':
return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'],
random_state=model_parameters.seed)
else:
raise ValueError('Invalid extraction strategy')
elif task == Task.REGRESSION:
if model_parameters.extraction_strategy == 'omp':
return OmpForestRegressor(model_parameters)
......@@ -33,15 +36,21 @@ class ModelFactory(object):
random_state=model_parameters.seed)
elif model_parameters.extraction_strategy == 'similarity':
return SimilarityForestRegressor(model_parameters)
else:
elif model_parameters.extraction_strategy == 'kmeans':
return KMeansForestRegressor(model_parameters)
elif model_parameters.extraction_strategy == 'none':
return RandomForestRegressor(n_estimators=model_parameters.hyperparameters['n_estimators'],
random_state=model_parameters.seed)
else:
raise ValueError('Invalid extraction strategy')
elif task == Task.MULTICLASSIFICATION:
if model_parameters.extraction_strategy == 'omp':
return OmpForestMulticlassClassifier(model_parameters)
elif model_parameters.extraction_strategy == 'random':
return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size,
random_state=model_parameters.seed)
else:
elif model_parameters.extraction_strategy == 'none':
return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'],
random_state=model_parameters.seed)
else:
raise ValueError('Invalid extraction strategy')
......@@ -21,7 +21,6 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
return self._models_parameters
def fit(self, X_train, y_train, X_val, y_val, score_metric=mean_squared_error):
self._regressor.fit(X_train, y_train)
y_val_pred = self._regressor.predict(X_val)
......@@ -63,3 +62,6 @@ class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
test_mean = np.mean(test_list, axis=0)
score = mean_squared_error(test_mean, y)
return score
def predict_base_estimator(self, X):
    """Return the predictions of the underlying, unpruned base forest."""
    base_predictions = self._regressor.predict(X)
    return base_predictions
......@@ -2,6 +2,7 @@ from bolsonaro.models.model_raw_results import ModelRawResults
from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor
from bolsonaro.error_handling.logger_factory import LoggerFactory
from bolsonaro.data.task import Task
from . import LOG_PATH
......@@ -96,7 +97,7 @@ class Trainer(object):
self._end_time = time.time()
def __score_func(self, model, X, y_true):
if type(model) in [OmpForestRegressor, RandomForestRegressor, SimilarityForestRegressor]:
if type(model) in [OmpForestRegressor, RandomForestRegressor]:
y_pred = model.predict(X)
result = self._regression_score_metric(y_true, y_pred)
elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier, RandomForestClassifier]:
......@@ -104,10 +105,12 @@ class Trainer(object):
if type(model) is OmpForestBinaryClassifier:
y_pred = y_pred.round()
result = self._classification_score_metric(y_true, y_pred)
elif type(model) in [SimilarityForestRegressor, KMeansForestRegressor]:
result = model.score(X, y_true)
return result
def __score_func_base(self, model, X, y_true):
if type(model) == OmpForestRegressor:
if type(model) in [OmpForestRegressor, SimilarityForestRegressor, KMeansForestRegressor]:
y_pred = model.predict_base_estimator(X)
result = self._base_regression_score_metric(y_true, y_pred)
elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier]:
......@@ -116,7 +119,7 @@ class Trainer(object):
elif type(model) == RandomForestClassifier:
y_pred = model.predict(X)
result = self._base_classification_score_metric(y_true, y_pred)
elif type(model) in [RandomForestRegressor, SimilarityForestRegressor]:
elif type(model) is RandomForestRegressor:
y_pred = model.predict(X)
result = self._base_regression_score_metric(y_true, y_pred)
return result
......
......@@ -380,20 +380,51 @@ if __name__ == "__main__":
xlabel='Number of trees extracted',
ylabel=experiments_score_metric,
title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name))
elif args.stage == 5:
# Retrieve the number of extracted forest sizes used, so the base-forest axis is as long as necessary
extracted_forest_sizes_number = retreive_extracted_forest_sizes_number(args.models_dir, args.experiment_ids[1])
# base_with_params
logger.info('Loading base_with_params experiment scores...')
base_with_params_train_scores, base_with_params_dev_scores, base_with_params_test_scores, \
base_with_params_experiment_score_metric = \
extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, args.experiment_ids[0],
extracted_forest_sizes_number)
# random_with_params
logger.info('Loading random_with_params experiment scores...')
random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores, \
with_params_extracted_forest_sizes, random_with_params_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[1])
# omp_with_params
logger.info('Loading omp_with_params experiment scores...')
omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \
omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
args.models_dir, args.results_dir, args.experiment_ids[2])
# kmeans_with_params
logger.info('Loading kmeans_with_params experiment scores...')
kmeans_with_params_train_scores, kmeans_with_params_dev_scores, kmeans_with_params_test_scores, _, \
kmeans_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
args.models_dir, args.results_dir, args.experiment_ids[3])
# Sanity check on the metrics retrieved
if not (base_with_params_experiment_score_metric == random_with_params_experiment_score_metric
== omp_with_params_experiment_score_metric == kmeans_with_params_experiment_score_metric):
raise ValueError('Score metrics of all experiments must be the same.')
experiments_score_metric = base_with_params_experiment_score_metric
output_path = os.path.join(args.results_dir, args.dataset_name, 'stage4')
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
Plotter.plot_stage2_losses(
file_path=output_path + os.sep + 'losses.png',
all_experiment_scores=[base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores,
kmeans_with_params_test_scores],
all_labels=['base', 'random', 'omp', 'kmeans'],
x_value=with_params_extracted_forest_sizes,
xlabel='Number of trees extracted',
ylabel=experiments_score_metric,
title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name))
else:
raise ValueError('This stage number is not supported yet, but it will be!')
logger.info('Done.')
"""
TODO:
For each dataset:
Stage 1) [DONE for california_housing] A figure for the selection of the best base forest model hyperparameters (best vs default/random hyperparams)
Stage 2) [DONE for california_housing] A figure for the selection of the best combination of normalization: D normalization vs weights normalization (4 combinations)
Stage 3) [DONE for california_housing] A figure for the selection of the most relevant subsets combination: train,dev vs train+dev,train+dev vs train,train+dev
Stage 4) A figure to finally compare the perf of our approach using the previous selected
parameters vs the baseline vs other papers using different extracted forest size
(percentage of the tree size found previously in best hyperparams search) on the abscissa.
IMPORTANT: Compare experiments that used the same seeds among them (except for stage 1).
"""
......@@ -163,7 +163,7 @@ if __name__ == "__main__":
parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters if there exist.')
parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line in a file. Args: {{stage_num}} {{name}}')
parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of job used during the parallelisation across seeds.')
parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none or similarity.')
parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none, similarity, kmeans.')
args = parser.parse_args()
if args.experiment_configuration:
......@@ -173,7 +173,7 @@ if __name__ == "__main__":
else:
parameters = args.__dict__
if parameters['extraction_strategy'] not in ['omp', 'random', 'none', 'similarity']:
if parameters['extraction_strategy'] not in ['omp', 'random', 'none', 'similarity', 'kmeans']:
raise ValueError('Specified extraction strategy {} is not supported.'.format(parameters.extraction_strategy))
pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment