diff --git a/code/bolsonaro/models/ensemble_selection_forest_regressor.py b/code/bolsonaro/models/ensemble_selection_forest_regressor.py index 1e63c8cf3c34424e277d5a79d6f2349aee2aa5a5..b82e131d296392963e31d85f5c1444fc5cb7fd09 100644 --- a/code/bolsonaro/models/ensemble_selection_forest_regressor.py +++ b/code/bolsonaro/models/ensemble_selection_forest_regressor.py @@ -53,14 +53,14 @@ class EnsembleSelectionForestRegressor(BaseEstimator, metaclass=ABCMeta): def score(self, X, y): predictions = self._predict_base_estimator(X) - mean_predictions = np.mean(predictions, axis=0) - return self._score_metric(mean_predictions, y) + return self._score_metric(predictions, y) def predict_base_estimator(self, X): predictions = list() for tree in self._ensemble_selected: predictions.append(tree.predict(X)) - return np.array(predictions) + mean_predictions = np.mean(np.array(predictions), axis=0) + return mean_predictions @staticmethod def generate_library(X_train, y_train, random_state=None): diff --git a/code/bolsonaro/models/model_factory.py b/code/bolsonaro/models/model_factory.py index eb3e8b50d7411a2beee8e79bf7da46f6561558a2..335816b1dd33d28175f4865da2fddbbf73b8027d 100644 --- a/code/bolsonaro/models/model_factory.py +++ b/code/bolsonaro/models/model_factory.py @@ -22,10 +22,10 @@ class ModelFactory(object): if model_parameters.extraction_strategy == 'omp': return OmpForestBinaryClassifier(model_parameters) elif model_parameters.extraction_strategy == 'random': - return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size, + return RandomForestClassifier(**model_parameters.hyperparameters, random_state=model_parameters.seed) elif model_parameters.extraction_strategy == 'none': - return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'], + return RandomForestClassifier(**model_parameters.hyperparameters, random_state=model_parameters.seed) else: raise ValueError('Invalid extraction strategy') @@ -33,7 +33,7 @@ class ModelFactory(object): if model_parameters.extraction_strategy == 'omp': return OmpForestRegressor(model_parameters) elif model_parameters.extraction_strategy == 'random': - return RandomForestRegressor(n_estimators=model_parameters.extracted_forest_size, + return RandomForestRegressor(**model_parameters.hyperparameters, random_state=model_parameters.seed) elif model_parameters.extraction_strategy == 'similarity': return SimilarityForestRegressor(model_parameters) @@ -42,7 +42,7 @@ class ModelFactory(object): elif model_parameters.extraction_strategy == 'ensemble': return EnsembleSelectionForestRegressor(model_parameters, library=library) elif model_parameters.extraction_strategy == 'none': - return RandomForestRegressor(n_estimators=model_parameters.hyperparameters['n_estimators'], + return RandomForestRegressor(**model_parameters.hyperparameters, random_state=model_parameters.seed) else: raise ValueError('Invalid extraction strategy') @@ -50,10 +50,10 @@ class ModelFactory(object): if model_parameters.extraction_strategy == 'omp': return OmpForestMulticlassClassifier(model_parameters) elif model_parameters.extraction_strategy == 'random': - return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size, + return RandomForestClassifier(**model_parameters.hyperparameters, random_state=model_parameters.seed) elif model_parameters.extraction_strategy == 'none': - return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'], + return RandomForestClassifier(**model_parameters.hyperparameters, random_state=model_parameters.seed) else: raise ValueError('Invalid extraction strategy') diff --git a/code/bolsonaro/models/omp_forest.py b/code/bolsonaro/models/omp_forest.py index 5b947d327693020b51c7da778d4855274454de93..e4a5667b0d759fe1344fe14bb89dcf601c14f610 100644 --- a/code/bolsonaro/models/omp_forest.py +++ b/code/bolsonaro/models/omp_forest.py @@ -145,5 +145,4 @@ class SingleOmpForest(OmpForest): omp_trees_indices = np.nonzero(weights)[0] select_trees = np.mean(forest_predictions[omp_trees_indices], axis=0) - print(len(omp_trees_indices)) return select_trees diff --git a/code/bolsonaro/trainer.py b/code/bolsonaro/trainer.py index 6ec39094fccdac553a09adbddebda27d621cd4e8..ebcfe80440b97510c02e948ffc1e72ca752ca626 100644 --- a/code/bolsonaro/trainer.py +++ b/code/bolsonaro/trainer.py @@ -3,6 +3,7 @@ from bolsonaro.models.omp_forest_regressor import OmpForestRegressor from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor +from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor from bolsonaro.error_handling.logger_factory import LoggerFactory from bolsonaro.data.task import Task from . import LOG_PATH @@ -73,7 +74,7 @@ class Trainer(object): else: raise ValueError("Unknown specified subsets_used parameter '{}'".format(model.models_parameters.subsets_used)) - def train(self, model): + def train(self, model, extracted_forest_size=None): """ :param model: An instance of either RandomForestRegressor, RandomForestClassifier, OmpForestRegressor, OmpForestBinaryClassifier, OmpForestMulticlassClassifier. @@ -83,10 +84,13 @@ class Trainer(object): self._logger.debug('Training model using train set...') self._begin_time = time.time() if type(model) in [RandomForestRegressor, RandomForestClassifier]: - model.fit( - X=self._X_forest, - y=self._y_forest - ) + if extracted_forest_size is not None: + model.estimators_ = np.random.choice(model.estimators_, extracted_forest_size) + else: + model.fit( + X=self._X_forest, + y=self._y_forest + ) else: model.fit( self._X_forest, @@ -112,12 +116,12 @@ class Trainer(object): y_pred = np.sign(y_pred) y_pred = np.where(y_pred == 0, 1, y_pred) result = self._classification_score_metric(y_true, y_pred) - elif type(model) in [SimilarityForestRegressor, KMeansForestRegressor]: + elif type(model) in [SimilarityForestRegressor, KMeansForestRegressor, EnsembleSelectionForestRegressor]: result = model.score(X, y_true) return result def __score_func_base(self, model, X, y_true): - if type(model) in [OmpForestRegressor, SimilarityForestRegressor, KMeansForestRegressor]: + if type(model) in [OmpForestRegressor, SimilarityForestRegressor, KMeansForestRegressor, EnsembleSelectionForestRegressor]: y_pred = model.predict_base_estimator(X) result = self._base_regression_score_metric(y_true, y_pred) elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier]: diff --git a/code/train.py b/code/train.py index b7ea2a77ab08cac3e544c12e2d0c3036ae83f379..72c91d8b004bdf0205b5c89856cae4e388c6a225 100644 --- a/code/train.py +++ b/code/train.py @@ -59,10 +59,26 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb else: library = None + if parameters['extraction_strategy'] == 'random': + pretrained_model_parameters = ModelParameters( + extracted_forest_size=parameters['forest_size'], + normalize_D=parameters['normalize_D'], + subsets_used=parameters['subsets_used'], + normalize_weights=parameters['normalize_weights'], + seed=seed, + hyperparameters=hyperparameters, + extraction_strategy=parameters['extraction_strategy'] + ) + pretrained_estimator = ModelFactory.build(dataset.task, pretrained_model_parameters, library=library) + else: + pretrained_estimator = None + pretrained_model_parameters = None + if parameters['extraction_strategy'] != 'none': with tqdm_joblib(tqdm(total=len(parameters['extracted_forest_size']), disable=not verbose)) as extracted_forest_size_job_pb: Parallel(n_jobs=-1)(delayed(extracted_forest_size_job)(extracted_forest_size_job_pb, parameters['extracted_forest_size'][i], - models_dir, seed, parameters, dataset, hyperparameters, experiment_id, trainer, library) + models_dir, seed, parameters, dataset, hyperparameters, experiment_id, trainer, library, + pretrained_estimator=pretrained_estimator, pretrained_model_parameters=pretrained_model_parameters) for i in range(len(parameters['extracted_forest_size']))) else: forest_size = hyperparameters['n_estimators'] @@ -103,7 +119,8 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb seed_job_pb.update(1) def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_size, models_dir, - seed, parameters, dataset, hyperparameters, experiment_id, trainer, library): + seed, parameters, dataset, hyperparameters, experiment_id, trainer, library, + pretrained_estimator=None, pretrained_model_parameters=None): logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_extracted_forest_size{}_ti{}'.format( seed, extracted_forest_size, threading.get_ident())) @@ -127,21 +144,24 @@ def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_siz pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True) - model_parameters = ModelParameters( - extracted_forest_size=extracted_forest_size, - normalize_D=parameters['normalize_D'], - subsets_used=parameters['subsets_used'], - normalize_weights=parameters['normalize_weights'], - seed=seed, - hyperparameters=hyperparameters, - extraction_strategy=parameters['extraction_strategy'] - ) - model_parameters.save(sub_models_dir, experiment_id) - - model = ModelFactory.build(dataset.task, model_parameters, library=library) + if not pretrained_estimator: + model_parameters = ModelParameters( + extracted_forest_size=extracted_forest_size, + normalize_D=parameters['normalize_D'], + subsets_used=parameters['subsets_used'], + normalize_weights=parameters['normalize_weights'], + seed=seed, + hyperparameters=hyperparameters, + extraction_strategy=parameters['extraction_strategy'] + ) + model_parameters.save(sub_models_dir, experiment_id) + model = ModelFactory.build(dataset.task, model_parameters, library=library) + else: + model = pretrained_estimator + pretrained_model_parameters.save(sub_models_dir, experiment_id) trainer.init(model, subsets_used=parameters['subsets_used']) - trainer.train(model) + trainer.train(model, extracted_forest_size=extracted_forest_size) trainer.compute_results(model, sub_models_dir) """