from abc import abstractmethod, ABCMeta

import numpy as np
from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
    """Random forest regressor pruned by greedy similarity-based elimination.

    A full ``RandomForestRegressor`` is trained, then trees are dropped one
    at a time: at each step the tree whose removal changes the validation
    score of the remaining ensemble the least (compared with the score of the
    full forest) is discarded.

    Reference: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2822360/
    """

    def __init__(self, models_parameters):
        """Build the underlying forest from ``models_parameters``.

        Reads ``hyperparameters['n_estimators']``, ``seed`` and
        ``extracted_forest_size`` from ``models_parameters``.
        """
        self._models_parameters = models_parameters
        self._regressor = RandomForestRegressor(
            n_estimators=self._models_parameters.hyperparameters['n_estimators'],
            random_state=models_parameters.seed)
        self._extracted_forest_size = self._models_parameters.extracted_forest_size

    @property
    def models_parameters(self):
        """The parameter object this model was constructed with."""
        return self._models_parameters

    def fit(self, X_train, y_train, X_val, y_val, score_metric=mean_squared_error):
        """Train the forest on the training set, then prune it on the validation set.

        ``extracted_forest_size`` trees are removed, so the pruned forest
        keeps ``n_estimators - extracted_forest_size`` trees.
        NOTE(review): the attribute name suggests it might be meant as the
        size of the *resulting* forest -- confirm the intended meaning.

        :param X_train: training samples used to fit the forest.
        :param y_train: training targets.
        :param X_val: validation samples used to drive the pruning.
        :param y_val: validation targets.
        :param score_metric: callable invoked as ``score_metric(y_true, y_pred)``.
        :raises ValueError: if ``extracted_forest_size`` is not strictly
            smaller than the number of fitted trees (at least one tree must
            remain to evaluate the leave-one-out ensemble).
        """
        self._regressor.fit(X_train, y_train)

        forest = list(self._regressor.estimators_)
        if self._extracted_forest_size >= len(forest):
            raise ValueError(
                'extracted_forest_size ({}) must be smaller than the number '
                'of trees in the forest ({}).'.format(
                    self._extracted_forest_size, len(forest)))

        forest_score = score_metric(y_val, self._regressor.predict(X_val))

        # Each tree's validation prediction never changes during pruning, so
        # compute it once instead of once per candidate per round.
        tree_predictions = np.array([tree.predict(X_val) for tree in forest])

        remaining = list(range(len(forest)))  # indices of trees still in the forest
        removed = set()

        for _ in range(self._extracted_forest_size):
            # inf (not a finite magic number) so that the first candidate is
            # always accepted, whatever the scale of the metric.
            best_gap = np.inf
            best_position = 0
            for position in range(len(remaining)):
                others = remaining[:position] + remaining[position + 1:]
                ensemble_prediction = np.mean(tree_predictions[others], axis=0)
                ensemble_score = score_metric(y_val, ensemble_prediction)
                gap = abs(forest_score - ensemble_score)
                if gap < best_gap:
                    best_gap = gap
                    best_position = position
            removed.add(remaining.pop(best_position))

        # Keep the survivors in their original order so results are reproducible.
        self._regressor.estimators_ = [
            tree for index, tree in enumerate(forest) if index not in removed
        ]

    def predict(self, X):
        """Return the mean of the predictions of the trees currently in the forest."""
        return np.mean(
            [tree.predict(X) for tree in self._regressor.estimators_], axis=0)

    def score(self, X, y):
        """Return the mean squared error of ``predict(X)`` against ``y``.

        Unlike the usual scikit-learn ``score`` convention, this is an error:
        lower is better.
        """
        return mean_squared_error(y, self.predict(X))
bolsonaro.models.omp_forest_regressor import OmpForestRegressor from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier +from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor from bolsonaro.error_handling.logger_factory import LoggerFactory from bolsonaro.data.task import Task from . import LOG_PATH @@ -87,15 +88,15 @@ class Trainer(object): ) else: model.fit( - X_forest=self._X_forest, - y_forest=self._y_forest, - X_omp=self._X_omp, - y_omp=self._y_omp + self._X_forest, + self._y_forest, + self._X_omp, + self._y_omp ) self._end_time = time.time() def __score_func(self, model, X, y_true): - if type(model) in [OmpForestRegressor, RandomForestRegressor]: + if type(model) in [OmpForestRegressor, RandomForestRegressor, SimilarityForestRegressor]: y_pred = model.predict(X) result = self._regression_score_metric(y_true, y_pred) elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier, RandomForestClassifier]: @@ -115,7 +116,7 @@ class Trainer(object): elif type(model) == RandomForestClassifier: y_pred = model.predict(X) result = self._base_classification_score_metric(y_true, y_pred) - elif type(model) == RandomForestRegressor: + elif type(model) in [RandomForestRegressor, SimilarityForestRegressor]: y_pred = model.predict(X) result = self._base_regression_score_metric(y_true, y_pred) return result diff --git a/code/train.py b/code/train.py index 1131f2bf390f545385654ae59aea65a54e3f9977..e51514cc254ee564993243a676b05d07e3aa7597 100644 --- a/code/train.py +++ b/code/train.py @@ -163,7 +163,7 @@ if __name__ == "__main__": parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters if there exist.') parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line in a file. 
Args: {{stage_num}} {{name}}') parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of job used during the parallelisation across seeds.') - parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random or none.') + parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none or similarity.') args = parser.parse_args() if args.experiment_configuration: @@ -173,7 +173,7 @@ if __name__ == "__main__": else: parameters = args.__dict__ - if parameters['extraction_strategy'] not in ['omp', 'random', 'none']: + if parameters['extraction_strategy'] not in ['omp', 'random', 'none', 'similarity']: raise ValueError('Specified extraction strategy {} is not supported.'.format(parameters.extraction_strategy)) pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True) diff --git a/results/california_housing/stage4/losses_2.png b/results/california_housing/stage4/losses_2.png new file mode 100644 index 0000000000000000000000000000000000000000..5562fd0076c01cf38e93936a22d69c9e36c53fc5 Binary files /dev/null and b/results/california_housing/stage4/losses_2.png differ diff --git a/results/california_housing/stage4_backup/losses_2.png b/results/california_housing/stage4_backup/losses_2.png new file mode 100644 index 0000000000000000000000000000000000000000..5562fd0076c01cf38e93936a22d69c9e36c53fc5 Binary files /dev/null and b/results/california_housing/stage4_backup/losses_2.png differ