Commit 2a265135 authored by Charly Lamothe's avatar Charly Lamothe
Browse files

Fix hyperparams bugs in base and random. Fix extracted forest size used in random. Factorize random fitting

Fix hyperparams bugs in base and random. Fix extracted forest size used in random. Factorize random fitting
parent 86c4cf10
...@@ -53,14 +53,14 @@ class EnsembleSelectionForestRegressor(BaseEstimator, metaclass=ABCMeta): ...@@ -53,14 +53,14 @@ class EnsembleSelectionForestRegressor(BaseEstimator, metaclass=ABCMeta):
def score(self, X, y): def score(self, X, y):
predictions = self._predict_base_estimator(X) predictions = self._predict_base_estimator(X)
mean_predictions = np.mean(predictions, axis=0) return self._score_metric(predictions, y)
return self._score_metric(mean_predictions, y)
def predict_base_estimator(self, X): def predict_base_estimator(self, X):
predictions = list() predictions = list()
for tree in self._ensemble_selected: for tree in self._ensemble_selected:
predictions.append(tree.predict(X)) predictions.append(tree.predict(X))
return np.array(predictions) mean_predictions = np.mean(np.array(predictions), axis=0)
return mean_predictions
@staticmethod @staticmethod
def generate_library(X_train, y_train, random_state=None): def generate_library(X_train, y_train, random_state=None):
......
...@@ -22,10 +22,10 @@ class ModelFactory(object): ...@@ -22,10 +22,10 @@ class ModelFactory(object):
if model_parameters.extraction_strategy == 'omp': if model_parameters.extraction_strategy == 'omp':
return OmpForestBinaryClassifier(model_parameters) return OmpForestBinaryClassifier(model_parameters)
elif model_parameters.extraction_strategy == 'random': elif model_parameters.extraction_strategy == 'random':
return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size, return RandomForestClassifier(**model_parameters.hyperparameters,
random_state=model_parameters.seed) random_state=model_parameters.seed)
elif model_parameters.extraction_strategy == 'none': elif model_parameters.extraction_strategy == 'none':
return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'], return RandomForestClassifier(**model_parameters.hyperparameters,
random_state=model_parameters.seed) random_state=model_parameters.seed)
else: else:
raise ValueError('Invalid extraction strategy') raise ValueError('Invalid extraction strategy')
...@@ -33,7 +33,7 @@ class ModelFactory(object): ...@@ -33,7 +33,7 @@ class ModelFactory(object):
if model_parameters.extraction_strategy == 'omp': if model_parameters.extraction_strategy == 'omp':
return OmpForestRegressor(model_parameters) return OmpForestRegressor(model_parameters)
elif model_parameters.extraction_strategy == 'random': elif model_parameters.extraction_strategy == 'random':
return RandomForestRegressor(n_estimators=model_parameters.extracted_forest_size, return RandomForestRegressor(**model_parameters.hyperparameters,
random_state=model_parameters.seed) random_state=model_parameters.seed)
elif model_parameters.extraction_strategy == 'similarity': elif model_parameters.extraction_strategy == 'similarity':
return SimilarityForestRegressor(model_parameters) return SimilarityForestRegressor(model_parameters)
...@@ -42,7 +42,7 @@ class ModelFactory(object): ...@@ -42,7 +42,7 @@ class ModelFactory(object):
elif model_parameters.extraction_strategy == 'ensemble': elif model_parameters.extraction_strategy == 'ensemble':
return EnsembleSelectionForestRegressor(model_parameters, library=library) return EnsembleSelectionForestRegressor(model_parameters, library=library)
elif model_parameters.extraction_strategy == 'none': elif model_parameters.extraction_strategy == 'none':
return RandomForestRegressor(n_estimators=model_parameters.hyperparameters['n_estimators'], return RandomForestRegressor(**model_parameters.hyperparameters,
random_state=model_parameters.seed) random_state=model_parameters.seed)
else: else:
raise ValueError('Invalid extraction strategy') raise ValueError('Invalid extraction strategy')
...@@ -50,10 +50,10 @@ class ModelFactory(object): ...@@ -50,10 +50,10 @@ class ModelFactory(object):
if model_parameters.extraction_strategy == 'omp': if model_parameters.extraction_strategy == 'omp':
return OmpForestMulticlassClassifier(model_parameters) return OmpForestMulticlassClassifier(model_parameters)
elif model_parameters.extraction_strategy == 'random': elif model_parameters.extraction_strategy == 'random':
return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size, return RandomForestClassifier(**model_parameters.hyperparameters,
random_state=model_parameters.seed) random_state=model_parameters.seed)
elif model_parameters.extraction_strategy == 'none': elif model_parameters.extraction_strategy == 'none':
return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'], return RandomForestClassifier(**model_parameters.hyperparameters,
random_state=model_parameters.seed) random_state=model_parameters.seed)
else: else:
raise ValueError('Invalid extraction strategy') raise ValueError('Invalid extraction strategy')
...@@ -145,5 +145,4 @@ class SingleOmpForest(OmpForest): ...@@ -145,5 +145,4 @@ class SingleOmpForest(OmpForest):
omp_trees_indices = np.nonzero(weights)[0] omp_trees_indices = np.nonzero(weights)[0]
select_trees = np.mean(forest_predictions[omp_trees_indices], axis=0) select_trees = np.mean(forest_predictions[omp_trees_indices], axis=0)
print(len(omp_trees_indices))
return select_trees return select_trees
...@@ -3,6 +3,7 @@ from bolsonaro.models.omp_forest_regressor import OmpForestRegressor ...@@ -3,6 +3,7 @@ from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor
from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor
from bolsonaro.error_handling.logger_factory import LoggerFactory from bolsonaro.error_handling.logger_factory import LoggerFactory
from bolsonaro.data.task import Task from bolsonaro.data.task import Task
from . import LOG_PATH from . import LOG_PATH
...@@ -73,7 +74,7 @@ class Trainer(object): ...@@ -73,7 +74,7 @@ class Trainer(object):
else: else:
raise ValueError("Unknown specified subsets_used parameter '{}'".format(model.models_parameters.subsets_used)) raise ValueError("Unknown specified subsets_used parameter '{}'".format(model.models_parameters.subsets_used))
def train(self, model): def train(self, model, extracted_forest_size=None):
""" """
:param model: An instance of either RandomForestRegressor, RandomForestClassifier, OmpForestRegressor, :param model: An instance of either RandomForestRegressor, RandomForestClassifier, OmpForestRegressor,
OmpForestBinaryClassifier, OmpForestMulticlassClassifier. OmpForestBinaryClassifier, OmpForestMulticlassClassifier.
...@@ -83,10 +84,13 @@ class Trainer(object): ...@@ -83,10 +84,13 @@ class Trainer(object):
self._logger.debug('Training model using train set...') self._logger.debug('Training model using train set...')
self._begin_time = time.time() self._begin_time = time.time()
if type(model) in [RandomForestRegressor, RandomForestClassifier]: if type(model) in [RandomForestRegressor, RandomForestClassifier]:
model.fit( if extracted_forest_size is not None:
X=self._X_forest, model.estimators_ = np.random.choice(model.estimators_, extracted_forest_size)
y=self._y_forest else:
) model.fit(
X=self._X_forest,
y=self._y_forest
)
else: else:
model.fit( model.fit(
self._X_forest, self._X_forest,
...@@ -112,12 +116,12 @@ class Trainer(object): ...@@ -112,12 +116,12 @@ class Trainer(object):
y_pred = np.sign(y_pred) y_pred = np.sign(y_pred)
y_pred = np.where(y_pred == 0, 1, y_pred) y_pred = np.where(y_pred == 0, 1, y_pred)
result = self._classification_score_metric(y_true, y_pred) result = self._classification_score_metric(y_true, y_pred)
elif type(model) in [SimilarityForestRegressor, KMeansForestRegressor]: elif type(model) in [SimilarityForestRegressor, KMeansForestRegressor, EnsembleSelectionForestRegressor]:
result = model.score(X, y_true) result = model.score(X, y_true)
return result return result
def __score_func_base(self, model, X, y_true): def __score_func_base(self, model, X, y_true):
if type(model) in [OmpForestRegressor, SimilarityForestRegressor, KMeansForestRegressor]: if type(model) in [OmpForestRegressor, SimilarityForestRegressor, KMeansForestRegressor, EnsembleSelectionForestRegressor]:
y_pred = model.predict_base_estimator(X) y_pred = model.predict_base_estimator(X)
result = self._base_regression_score_metric(y_true, y_pred) result = self._base_regression_score_metric(y_true, y_pred)
elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier]: elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier]:
......
...@@ -59,10 +59,26 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb ...@@ -59,10 +59,26 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb
else: else:
library = None library = None
if parameters['extraction_strategy'] == 'random':
pretrained_model_parameters = ModelParameters(
extracted_forest_size=parameters['forest_size'],
normalize_D=parameters['normalize_D'],
subsets_used=parameters['subsets_used'],
normalize_weights=parameters['normalize_weights'],
seed=seed,
hyperparameters=hyperparameters,
extraction_strategy=parameters['extraction_strategy']
)
pretrained_estimator = ModelFactory.build(dataset.task, pretrained_model_parameters, library=library)
else:
pretrained_estimator = None
pretrained_model_parameters = None
if parameters['extraction_strategy'] != 'none': if parameters['extraction_strategy'] != 'none':
with tqdm_joblib(tqdm(total=len(parameters['extracted_forest_size']), disable=not verbose)) as extracted_forest_size_job_pb: with tqdm_joblib(tqdm(total=len(parameters['extracted_forest_size']), disable=not verbose)) as extracted_forest_size_job_pb:
Parallel(n_jobs=-1)(delayed(extracted_forest_size_job)(extracted_forest_size_job_pb, parameters['extracted_forest_size'][i], Parallel(n_jobs=-1)(delayed(extracted_forest_size_job)(extracted_forest_size_job_pb, parameters['extracted_forest_size'][i],
models_dir, seed, parameters, dataset, hyperparameters, experiment_id, trainer, library) models_dir, seed, parameters, dataset, hyperparameters, experiment_id, trainer, library,
pretrained_estimator=pretrained_estimator, pretrained_model_parameters=pretrained_model_parameters)
for i in range(len(parameters['extracted_forest_size']))) for i in range(len(parameters['extracted_forest_size'])))
else: else:
forest_size = hyperparameters['n_estimators'] forest_size = hyperparameters['n_estimators']
...@@ -103,7 +119,8 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb ...@@ -103,7 +119,8 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb
seed_job_pb.update(1) seed_job_pb.update(1)
def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_size, models_dir, def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_size, models_dir,
seed, parameters, dataset, hyperparameters, experiment_id, trainer, library): seed, parameters, dataset, hyperparameters, experiment_id, trainer, library,
pretrained_estimator=None, pretrained_model_parameters=None):
logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_extracted_forest_size{}_ti{}'.format( logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_extracted_forest_size{}_ti{}'.format(
seed, extracted_forest_size, threading.get_ident())) seed, extracted_forest_size, threading.get_ident()))
...@@ -127,21 +144,24 @@ def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_siz ...@@ -127,21 +144,24 @@ def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_siz
pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True) pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
model_parameters = ModelParameters( if not pretrained_estimator:
extracted_forest_size=extracted_forest_size, model_parameters = ModelParameters(
normalize_D=parameters['normalize_D'], extracted_forest_size=extracted_forest_size,
subsets_used=parameters['subsets_used'], normalize_D=parameters['normalize_D'],
normalize_weights=parameters['normalize_weights'], subsets_used=parameters['subsets_used'],
seed=seed, normalize_weights=parameters['normalize_weights'],
hyperparameters=hyperparameters, seed=seed,
extraction_strategy=parameters['extraction_strategy'] hyperparameters=hyperparameters,
) extraction_strategy=parameters['extraction_strategy']
model_parameters.save(sub_models_dir, experiment_id) )
model_parameters.save(sub_models_dir, experiment_id)
model = ModelFactory.build(dataset.task, model_parameters, library=library) model = ModelFactory.build(dataset.task, model_parameters, library=library)
else:
model = pretrained_estimator
pretrained_model_parameters.save(sub_models_dir, experiment_id)
trainer.init(model, subsets_used=parameters['subsets_used']) trainer.init(model, subsets_used=parameters['subsets_used'])
trainer.train(model) trainer.train(model, extracted_forest_size=extracted_forest_size)
trainer.compute_results(model, sub_models_dir) trainer.compute_results(model, sub_models_dir)
""" """
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment