Skip to content
Snippets Groups Projects
Commit c80ddd61 authored by Charly Lamothe's avatar Charly Lamothe
Browse files

Add Paolo's first implementation of this paper:...

Add Paolo's first implementation of this paper: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2822360/
parent 21ccc627
No related branches found
No related tags found
1 merge request!9Resolve "Experiment pipeline"
from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
from bolsonaro.models.omp_forest_regressor import OmpForestRegressor from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
from bolsonaro.data.task import Task
from bolsonaro.models.model_parameters import ModelParameters from bolsonaro.models.model_parameters import ModelParameters
from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
from bolsonaro.data.task import Task
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import os import os
...@@ -30,6 +31,8 @@ class ModelFactory(object): ...@@ -30,6 +31,8 @@ class ModelFactory(object):
elif model_parameters.extraction_strategy == 'random': elif model_parameters.extraction_strategy == 'random':
return RandomForestRegressor(n_estimators=model_parameters.extracted_forest_size, return RandomForestRegressor(n_estimators=model_parameters.extracted_forest_size,
random_state=model_parameters.seed) random_state=model_parameters.seed)
elif model_parameters.extraction_strategy == 'similarity':
return SimilarityForestRegressor(model_parameters)
else: else:
return RandomForestRegressor(n_estimators=model_parameters.hyperparameters['n_estimators'], return RandomForestRegressor(n_estimators=model_parameters.hyperparameters['n_estimators'],
random_state=model_parameters.seed) random_state=model_parameters.seed)
......
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator
from abc import abstractmethod, ABCMeta
import numpy as np
class SimilarityForestRegressor(BaseEstimator, metaclass=ABCMeta):
    """
    Random forest regressor pruned by a similarity criterion.

    A full ``RandomForestRegressor`` is trained, then trees are removed one
    at a time: at each round the tree whose removal changes the validation
    score of the (averaged) forest the least is dropped.

    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2822360/
    """

    def __init__(self, models_parameters):
        """
        :param models_parameters: object exposing ``hyperparameters``
            (a mapping with an ``'n_estimators'`` key), ``seed`` and
            ``extracted_forest_size``.
        """
        self._models_parameters = models_parameters
        self._regressor = RandomForestRegressor(
            n_estimators=self._models_parameters.hyperparameters['n_estimators'],
            random_state=models_parameters.seed)
        # Number of pruning rounds performed by fit(), i.e. the number of
        # trees removed from the trained forest.
        self._extracted_forest_size = self._models_parameters.extracted_forest_size

    @property
    def models_parameters(self):
        return self._models_parameters

    def fit(self, X_train, y_train, X_val, y_val, score_metric=mean_squared_error):
        """
        Train the full forest on the training set, then prune it using the
        validation set.

        :param X_train: training inputs used to fit the underlying forest.
        :param y_train: training targets.
        :param X_val: validation inputs used to evaluate candidate removals.
        :param y_val: validation targets.
        :param score_metric: callable ``(y_true, y_pred) -> float`` used to
            compare the full forest with each reduced forest.
        :raises ValueError: if the number of trees to remove is not smaller
            than the number of trained trees (the forest would end up empty).
        """
        self._regressor.fit(X_train, y_train)

        # Reference score of the complete forest on the validation set.
        forest_score = score_metric(y_val, self._regressor.predict(X_val))

        remaining_trees = list(self._regressor.estimators_)
        if self._extracted_forest_size >= len(remaining_trees):
            raise ValueError(
                'Cannot remove {} trees from a forest of {} trees.'.format(
                    self._extracted_forest_size, len(remaining_trees)))

        # Each tree's validation prediction never changes during pruning, so
        # compute it once; row i belongs to remaining_trees[i].
        val_predictions = np.array([tree.predict(X_val) for tree in remaining_trees])

        for _ in range(self._extracted_forest_size):
            best_similarity = float('inf')
            found_index = 0
            for i in range(len(remaining_trees)):
                # Average prediction of the forest without tree i.
                val_mean = np.mean(np.delete(val_predictions, i, axis=0), axis=0)
                val_score = score_metric(y_val, val_mean)
                temp_similarity = abs(forest_score - val_score)
                if temp_similarity < best_similarity:
                    found_index = i
                    best_similarity = temp_similarity
            # Drop the tree whose absence disturbs the score the least.
            del remaining_trees[found_index]
            val_predictions = np.delete(val_predictions, found_index, axis=0)

        # The survivors keep their original relative order, which makes the
        # pruned forest reproducible from one run to the next.
        self._regressor.estimators_ = remaining_trees

    def predict(self, X):
        """
        Predict with the current forest by averaging the predictions of its
        individual trees.

        :param X: inputs to predict on.
        :return: numpy array holding the mean of the per-tree predictions.
        """
        tree_predictions = np.array(
            [tree.predict(X) for tree in self._regressor.estimators_])
        return np.mean(tree_predictions, axis=0)

    def score(self, X, y):
        """
        Return the mean squared error of ``predict(X)`` against ``y``.

        Note that, being an error, lower is better.
        """
        return mean_squared_error(y, self.predict(X))
from bolsonaro.models.model_raw_results import ModelRawResults from bolsonaro.models.model_raw_results import ModelRawResults
from bolsonaro.models.omp_forest_regressor import OmpForestRegressor from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
from bolsonaro.error_handling.logger_factory import LoggerFactory from bolsonaro.error_handling.logger_factory import LoggerFactory
from bolsonaro.data.task import Task from bolsonaro.data.task import Task
from . import LOG_PATH from . import LOG_PATH
...@@ -87,15 +88,15 @@ class Trainer(object): ...@@ -87,15 +88,15 @@ class Trainer(object):
) )
else: else:
model.fit( model.fit(
X_forest=self._X_forest, self._X_forest,
y_forest=self._y_forest, self._y_forest,
X_omp=self._X_omp, self._X_omp,
y_omp=self._y_omp self._y_omp
) )
self._end_time = time.time() self._end_time = time.time()
def __score_func(self, model, X, y_true): def __score_func(self, model, X, y_true):
if type(model) in [OmpForestRegressor, RandomForestRegressor]: if type(model) in [OmpForestRegressor, RandomForestRegressor, SimilarityForestRegressor]:
y_pred = model.predict(X) y_pred = model.predict(X)
result = self._regression_score_metric(y_true, y_pred) result = self._regression_score_metric(y_true, y_pred)
elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier, RandomForestClassifier]: elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier, RandomForestClassifier]:
...@@ -115,7 +116,7 @@ class Trainer(object): ...@@ -115,7 +116,7 @@ class Trainer(object):
elif type(model) == RandomForestClassifier: elif type(model) == RandomForestClassifier:
y_pred = model.predict(X) y_pred = model.predict(X)
result = self._base_classification_score_metric(y_true, y_pred) result = self._base_classification_score_metric(y_true, y_pred)
elif type(model) == RandomForestRegressor: elif type(model) in [RandomForestRegressor, SimilarityForestRegressor]:
y_pred = model.predict(X) y_pred = model.predict(X)
result = self._base_regression_score_metric(y_true, y_pred) result = self._base_regression_score_metric(y_true, y_pred)
return result return result
......
...@@ -163,7 +163,7 @@ if __name__ == "__main__": ...@@ -163,7 +163,7 @@ if __name__ == "__main__":
parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters if there exist.') parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters if there exist.')
parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line in a file. Args: {{stage_num}} {{name}}') parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line in a file. Args: {{stage_num}} {{name}}')
parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of job used during the parallelisation across seeds.') parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of job used during the parallelisation across seeds.')
parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random or none.') parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none or similarity.')
args = parser.parse_args() args = parser.parse_args()
if args.experiment_configuration: if args.experiment_configuration:
...@@ -173,7 +173,7 @@ if __name__ == "__main__": ...@@ -173,7 +173,7 @@ if __name__ == "__main__":
else: else:
parameters = args.__dict__ parameters = args.__dict__
if parameters['extraction_strategy'] not in ['omp', 'random', 'none']: if parameters['extraction_strategy'] not in ['omp', 'random', 'none', 'similarity']:
raise ValueError('Specified extraction strategy {} is not supported.'.format(parameters.extraction_strategy)) raise ValueError('Specified extraction strategy {} is not supported.'.format(parameters.extraction_strategy))
pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True) pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True)
......
results/california_housing/stage4/losses_2.png

44.7 KiB

results/california_housing/stage4_backup/losses_2.png

44.7 KiB

0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment