diff --git a/code/bolsonaro/models/model_raw_results.py b/code/bolsonaro/models/model_raw_results.py
index e5037423be3c8f62e3c1b690c1bb26c7e12424c7..fcb4220896e89e3a000f1058e34316dd9073a883 100644
--- a/code/bolsonaro/models/model_raw_results.py
+++ b/code/bolsonaro/models/model_raw_results.py
@@ -6,12 +6,12 @@
 import datetime
 
 class ModelRawResults(object):
 
-    def __init__(self, model_object, training_time,
+    def __init__(self, model_weights, training_time,
         datetime, train_score, dev_score, test_score,
         train_score_base, dev_score_base,
         test_score_base, score_metric, base_score_metric):
-        self._model_object = model_object
+        self._model_weights = model_weights
         self._training_time = training_time
         self._datetime = datetime
         self._train_score = train_score
@@ -24,8 +24,8 @@ class ModelRawResults(object):
         self._base_score_metric = base_score_metric
 
     @property
-    def model_object(self):
-        return self.model_object
+    def model_weights(self):
+        return self._model_weights
 
     @property
     def training_time(self):
@@ -68,10 +68,12 @@ class ModelRawResults(object):
         return self._base_score_metric
 
     def save(self, models_dir):
+        if not os.path.exists(models_dir):
+            os.makedirs(models_dir)
         save_obj_to_pickle(models_dir + os.sep + 'model_raw_results.pickle',
             self.__dict__)
 
     @staticmethod
     def load(models_dir):
         return load_obj_from_pickle(models_dir + os.sep + 'model_raw_results.pickle',
             ModelRawResults)
diff --git a/code/bolsonaro/models/omp_forest.py b/code/bolsonaro/models/omp_forest.py
index b5339f8b471cddbd4a653e42c3b6604757c95ed6..d0f726825e0b12055ff617e5f5af37e987e3a35a 100644
--- a/code/bolsonaro/models/omp_forest.py
+++ b/code/bolsonaro/models/omp_forest.py
@@ -8,6 +8,7 @@
 from sklearn.base import BaseEstimator
 
 class OmpForest(BaseEstimator, metaclass=ABCMeta):
+
     def __init__(self, models_parameters, base_forest_estimator):
         self._base_forest_estimator = base_forest_estimator
         self._models_parameters = models_parameters
@@ -24,7 +25,6 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta):
         return self._base_forest_estimator.score(X, y)
 
     def _base_estimator_predictions(self, X):
-        # We need to use predict_proba to get the probabilities of each class
         return np.array([tree.predict(X) for tree in self._base_forest_estimator.estimators_]).T
 
     @property
@@ -96,6 +96,7 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta):
         pass
 
 class SingleOmpForest(OmpForest):
+
     def __init__(self, models_parameters, base_forest_estimator):
         # fit_intercept shouldn't be set to False as the data isn't necessarily centered here
         # normalization is handled outsite OMP
@@ -123,3 +124,30 @@ class SingleOmpForest(OmpForest):
             forest_predictions /= self._forest_norms
 
         return self._make_omp_weighted_prediction(forest_predictions, self._omp, self._models_parameters.normalize_weights)
+
+    def predict_no_weights(self, X):
+        """
+        Apply the SingleOmpForest to X without using the weights.
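+
+        The returned prediction is the unweighted mean of the predictions of
+        the trees to which OMP assigned a non-zero coefficient.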
+
+        Make all the base tree predictions
+
+        :param X: numpy array of input samples
+        :return: a np.array of the mean predictions of the selected trees
+        """
+        forest_predictions = self._base_estimator_predictions(X)
+
+        if self._models_parameters.normalize_D:
+            forest_predictions /= self._forest_norms
+
+        # Normalize before transposing (as in predict()) so the division stays aligned with the tree axis.
+        forest_predictions = forest_predictions.T
+
+        weights = self._omp.coef_
+        omp_trees_indices = np.nonzero(weights)
+
+        select_trees = np.mean(forest_predictions[omp_trees_indices], axis=0)
+
+        return select_trees
diff --git a/code/bolsonaro/models/omp_forest_classifier.py b/code/bolsonaro/models/omp_forest_classifier.py
index 270f115df362351e2b038ed2226c617c0544dd4a..36d12be6727c25fcc029c13b1a13490f24be1295 100644
--- a/code/bolsonaro/models/omp_forest_classifier.py
+++ b/code/bolsonaro/models/omp_forest_classifier.py
@@ -106,6 +106,36 @@ class OmpForestMulticlassClassifier(OmpForest):
 
         max_preds = np.argmax(preds, axis=1)
         return np.array(label_names)[max_preds]
 
+    def predict_no_weights(self, X):
+        """
+        Apply the OmpForestMulticlassClassifier to X without using the weights.
+
+        Make all the base tree predictions
+
+        :param X: numpy array of input samples
+        :return: a np.array of the predicted class labels
+        """
+
+        forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_]).T
+
+        if self._models_parameters.normalize_D:
+            forest_predictions /= self._forest_norms
+
+        label_names = []
+        preds = []
+        num_class = 0
+        for class_label, omp_class in self._dct_class_omp.items():
+            weights = omp_class.coef_
+            omp_trees_indices = np.nonzero(weights)
+            label_names.append(class_label)
+            atoms_binary = (forest_predictions[num_class].T - 0.5) * 2  # rescale the 0/1 probabilities to -1/1
+            preds.append(np.sum(atoms_binary[omp_trees_indices], axis=0))
+            num_class += 1
+
+        preds = np.array(preds).T
+        max_preds = np.argmax(preds, axis=1)
+        return np.array(label_names)[max_preds]
+
     def score(self, X, y, metric=DEFAULT_SCORE_METRIC):
         predictions = self.predict(X)
diff --git a/code/bolsonaro/trainer.py b/code/bolsonaro/trainer.py
index 7c436d2e44fbc3e7d188028db3354016a057107d..f87b827abf7ed08a83bd207443a12d0b1217a9d9 100644
--- a/code/bolsonaro/trainer.py
+++ b/code/bolsonaro/trainer.py
@@ -1,153 +1,191 @@
-from bolsonaro.models.model_raw_results import ModelRawResults
-from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
-from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
-from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
-from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor
-from bolsonaro.error_handling.logger_factory import LoggerFactory
-from bolsonaro.data.task import Task
-from . import LOG_PATH
-
-from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
-from sklearn.metrics import mean_squared_error, accuracy_score
-import time
-import datetime
-import numpy as np
-
-
-class Trainer(object):
-    """
-    Class capable of fitting any model object to some prepared data then evaluate and save results through the `train` method.
- """ - - def __init__(self, dataset, regression_score_metric=mean_squared_error, classification_score_metric=accuracy_score, - base_regression_score_metric=mean_squared_error, base_classification_score_metric=accuracy_score): - """ - - :param dataset: Object with X_train, y_train, X_dev, y_dev, X_test and Y_test attributes - """ - self._dataset = dataset - self._logger = LoggerFactory.create(LOG_PATH, __name__) - self._regression_score_metric = regression_score_metric - self._classification_score_metric = classification_score_metric - self._base_regression_score_metric = base_regression_score_metric - self._base_classification_score_metric = base_classification_score_metric - self._score_metric_name = regression_score_metric.__name__ if dataset.task == Task.REGRESSION \ - else classification_score_metric.__name__ - self._base_score_metric_name = base_regression_score_metric.__name__ if dataset.task == Task.REGRESSION \ - else base_classification_score_metric.__name__ - - @property - def score_metric_name(self): - return self._score_metric_name - - @property - def base_score_metric_name(self): - return self._base_score_metric_name - - def init(self, model, subsets_used='train,dev'): - if type(model) in [RandomForestRegressor, RandomForestClassifier]: - if subsets_used == 'train,dev': - self._X_forest = self._dataset.X_train - self._y_forest = self._dataset.y_train - else: - self._X_forest = np.concatenate([self._dataset.X_train, self._dataset.X_dev]) - self._y_forest = np.concatenate([self._dataset.y_train, self._dataset.y_dev]) - self._logger.debug('Fitting the forest on train subset') - elif model.models_parameters.subsets_used == 'train,dev': - self._X_forest = self._dataset.X_train - self._y_forest = self._dataset.y_train - self._X_omp = self._dataset.X_dev - self._y_omp = self._dataset.y_dev - self._logger.debug('Fitting the forest on train subset and OMP on dev subset.') - elif model.models_parameters.subsets_used == 'train+dev,train+dev': - self._X_forest = np.concatenate([self._dataset.X_train, self._dataset.X_dev]) - self._X_omp = self._X_forest - self._y_forest = np.concatenate([self._dataset.y_train, self._dataset.y_dev]) - self._y_omp = self._y_forest - self._logger.debug('Fitting both the forest and OMP on train+dev subsets.') - elif model.models_parameters.subsets_used == 'train,train+dev': - self._X_forest = self._dataset.X_train - self._y_forest = self._dataset.y_train - self._X_omp = np.concatenate([self._dataset.X_train, self._dataset.X_dev]) - self._y_omp = np.concatenate([self._dataset.y_train, self._dataset.y_dev]) - else: - raise ValueError("Unknown specified subsets_used parameter '{}'".format(model.models_parameters.subsets_used)) - - def train(self, model): - """ - :param model: An instance of either RandomForestRegressor, RandomForestClassifier, OmpForestRegressor, - OmpForestBinaryClassifier, OmpForestMulticlassClassifier. 
- :return: - """ - - self._logger.debug('Training model using train set...') - self._begin_time = time.time() - if type(model) in [RandomForestRegressor, RandomForestClassifier]: - model.fit( - X=self._X_forest, - y=self._y_forest - ) - else: - model.fit( - self._X_forest, - self._y_forest, - self._X_omp, - self._y_omp - ) - self._end_time = time.time() - - def __score_func(self, model, X, y_true): - if type(model) in [OmpForestRegressor, RandomForestRegressor]: - y_pred = model.predict(X) - result = self._regression_score_metric(y_true, y_pred) - elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier, RandomForestClassifier]: - y_pred = model.predict(X) - if type(model) is OmpForestBinaryClassifier: - y_pred = y_pred.round() - result = self._classification_score_metric(y_true, y_pred) - elif type(model) in [SimilarityForestRegressor, KMeansForestRegressor]: - result = model.score(X, y_true) - return result - - def __score_func_base(self, model, X, y_true): - if type(model) in [OmpForestRegressor, SimilarityForestRegressor, KMeansForestRegressor]: - y_pred = model.predict_base_estimator(X) - result = self._base_regression_score_metric(y_true, y_pred) - elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier]: - y_pred = model.predict_base_estimator(X) - result = self._base_classification_score_metric(y_true, y_pred) - elif type(model) == RandomForestClassifier: - y_pred = model.predict(X) - result = self._base_classification_score_metric(y_true, y_pred) - elif type(model) is RandomForestRegressor: - y_pred = model.predict(X) - result = self._base_regression_score_metric(y_true, y_pred) - return result - - def compute_results(self, model, models_dir): - """ - :param model: Object with - :param models_dir: Where the results will be saved - """ - results = ModelRawResults( - model_object='', - training_time=self._end_time - self._begin_time, - datetime=datetime.datetime.now(), - train_score=self.__score_func(model, self._dataset.X_train, self._dataset.y_train), - dev_score=self.__score_func(model, self._dataset.X_dev, self._dataset.y_dev), - test_score=self.__score_func(model, self._dataset.X_test, self._dataset.y_test), - train_score_base=self.__score_func_base(model, self._dataset.X_train, self._dataset.y_train), - dev_score_base=self.__score_func_base(model, self._dataset.X_dev, self._dataset.y_dev), - test_score_base=self.__score_func_base(model, self._dataset.X_test, self._dataset.y_test), - score_metric=self._score_metric_name, - base_score_metric=self._base_score_metric_name - ) - results.save(models_dir) - self._logger.info("Base performance on test: {}".format(results.test_score_base)) - self._logger.info("Performance on test: {}".format(results.test_score)) - - self._logger.info("Base performance on train: {}".format(results.train_score_base)) - self._logger.info("Performance on train: {}".format(results.train_score)) - - self._logger.info("Base performance on dev: {}".format(results.dev_score_base)) - self._logger.info("Performance on dev: {}".format(results.dev_score)) +from bolsonaro.models.model_raw_results import ModelRawResults +from bolsonaro.models.omp_forest_regressor import OmpForestRegressor +from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier +from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor +from bolsonaro.error_handling.logger_factory import LoggerFactory +from bolsonaro.data.task import Task +from . 
import LOG_PATH
+
+from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
+from sklearn.metrics import mean_squared_error, accuracy_score
+import time
+import datetime
+import numpy as np
+
+
+class Trainer(object):
+    """
+    Class capable of fitting any model object to some prepared data, then evaluating and saving the results through the `train` method.
+    """
+
+    def __init__(self, dataset, regression_score_metric=mean_squared_error, classification_score_metric=accuracy_score,
+        base_regression_score_metric=mean_squared_error, base_classification_score_metric=accuracy_score):
+        """
+
+        :param dataset: Object with X_train, y_train, X_dev, y_dev, X_test and y_test attributes
+        """
+        self._dataset = dataset
+        self._logger = LoggerFactory.create(LOG_PATH, __name__)
+        self._regression_score_metric = regression_score_metric
+        self._classification_score_metric = classification_score_metric
+        self._base_regression_score_metric = base_regression_score_metric
+        self._base_classification_score_metric = base_classification_score_metric
+        self._score_metric_name = regression_score_metric.__name__ if dataset.task == Task.REGRESSION \
+            else classification_score_metric.__name__
+        self._base_score_metric_name = base_regression_score_metric.__name__ if dataset.task == Task.REGRESSION \
+            else base_classification_score_metric.__name__
+
+    @property
+    def score_metric_name(self):
+        return self._score_metric_name
+
+    @property
+    def base_score_metric_name(self):
+        return self._base_score_metric_name
+
+    def init(self, model, subsets_used='train,dev'):
+        if type(model) in [RandomForestRegressor, RandomForestClassifier]:
+            if subsets_used == 'train,dev':
+                self._X_forest = self._dataset.X_train
+                self._y_forest = self._dataset.y_train
+                self._logger.debug('Fitting the forest on the train subset.')
+            else:
+                self._X_forest = np.concatenate([self._dataset.X_train, self._dataset.X_dev])
+                self._y_forest = np.concatenate([self._dataset.y_train, self._dataset.y_dev])
+                self._logger.debug('Fitting the forest on the train+dev subsets.')
+        elif model.models_parameters.subsets_used == 'train,dev':
+            self._X_forest = self._dataset.X_train
+            self._y_forest = self._dataset.y_train
+            self._X_omp = self._dataset.X_dev
+            self._y_omp = self._dataset.y_dev
+            self._logger.debug('Fitting the forest on train subset and OMP on dev subset.')
+        elif model.models_parameters.subsets_used == 'train+dev,train+dev':
+            self._X_forest = np.concatenate([self._dataset.X_train, self._dataset.X_dev])
+            self._X_omp = self._X_forest
+            self._y_forest = np.concatenate([self._dataset.y_train, self._dataset.y_dev])
+            self._y_omp = self._y_forest
+            self._logger.debug('Fitting both the forest and OMP on train+dev subsets.')
+        elif model.models_parameters.subsets_used == 'train,train+dev':
+            self._X_forest = self._dataset.X_train
+            self._y_forest = self._dataset.y_train
+            self._X_omp = np.concatenate([self._dataset.X_train, self._dataset.X_dev])
+            self._y_omp = np.concatenate([self._dataset.y_train, self._dataset.y_dev])
+            self._logger.debug('Fitting the forest on train subset and OMP on train+dev subsets.')
+        else:
+            raise ValueError("Unknown specified subsets_used parameter '{}'".format(model.models_parameters.subsets_used))
+
+    def train(self, model):
+        """
+        :param model: An instance of either RandomForestRegressor, RandomForestClassifier, OmpForestRegressor,
+        OmpForestBinaryClassifier, OmpForestMulticlassClassifier.
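+        RandomForest models are fitted on the forest subset only; the OMP-based
+        models are additionally fitted on the OMP subset selected in `init`.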
+        :return:
+        """
+
+        self._logger.debug('Training model using train set...')
+        self._begin_time = time.time()
+        if type(model) in [RandomForestRegressor, RandomForestClassifier]:
+            model.fit(
+                X=self._X_forest,
+                y=self._y_forest
+            )
+        else:
+            model.fit(
+                self._X_forest,
+                self._y_forest,
+                self._X_omp,
+                self._y_omp
+            )
+        self._end_time = time.time()
+
+    def __score_func(self, model, X, y_true, weights=True):
+        if type(model) in [OmpForestRegressor, RandomForestRegressor, SimilarityForestRegressor]:
+            if weights:
+                y_pred = model.predict(X)
+            else:
+                y_pred = model.predict_no_weights(X)
+            result = self._regression_score_metric(y_true, y_pred)
+        elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier, RandomForestClassifier]:
+            if weights:
+                y_pred = model.predict(X)
+            else:
+                y_pred = model.predict_no_weights(X)
+            if type(model) is OmpForestBinaryClassifier:
+                y_pred = y_pred.round()
+            result = self._classification_score_metric(y_true, y_pred)
+        return result
+
+    def __score_func_base(self, model, X, y_true):
+        if type(model) == OmpForestRegressor:
+            y_pred = model.predict_base_estimator(X)
+            result = self._base_regression_score_metric(y_true, y_pred)
+        elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier]:
+            y_pred = model.predict_base_estimator(X)
+            result = self._base_classification_score_metric(y_true, y_pred)
+        elif type(model) == RandomForestClassifier:
+            y_pred = model.predict(X)
+            result = self._base_classification_score_metric(y_true, y_pred)
+        elif type(model) in [RandomForestRegressor, SimilarityForestRegressor]:
+            y_pred = model.predict(X)
+            result = self._base_regression_score_metric(y_true, y_pred)
+        return result
+
+    def compute_results(self, model, models_dir):
+        """
+        :param model: The trained model whose results are computed and saved
+        :param models_dir: Where the results will be saved
+        """
+
+        model_weights = ''
+        if type(model) in [OmpForestRegressor, OmpForestBinaryClassifier]:
+            model_weights = model._omp.coef_
+        elif type(model) == OmpForestMulticlassClassifier:
+            model_weights = model._dct_class_omp
+
+        results = ModelRawResults(
+            model_weights=model_weights,
+            training_time=self._end_time - self._begin_time,
+            datetime=datetime.datetime.now(),
+            train_score=self.__score_func(model, self._dataset.X_train, self._dataset.y_train),
+            dev_score=self.__score_func(model, self._dataset.X_dev, self._dataset.y_dev),
+            test_score=self.__score_func(model, self._dataset.X_test, self._dataset.y_test),
+            train_score_base=self.__score_func_base(model, self._dataset.X_train, self._dataset.y_train),
+            dev_score_base=self.__score_func_base(model, self._dataset.X_dev, self._dataset.y_dev),
+            test_score_base=self.__score_func_base(model, self._dataset.X_test, self._dataset.y_test),
+            score_metric=self._score_metric_name,
+            base_score_metric=self._base_score_metric_name
+        )
+        results.save(models_dir)
+        self._logger.info("Base performance on test: {}".format(results.test_score_base))
+        self._logger.info("Performance on test: {}".format(results.test_score))
+
+        self._logger.info("Base performance on train: {}".format(results.train_score_base))
+        self._logger.info("Performance on train: {}".format(results.train_score))
+
+        self._logger.info("Base performance on dev: {}".format(results.dev_score_base))
+        self._logger.info("Performance on dev: {}".format(results.dev_score))
+
+        if type(model) not in [RandomForestRegressor, RandomForestClassifier]:
+            results = ModelRawResults(
+                model_weights='',
+                training_time=self._end_time - self._begin_time,
+                datetime=datetime.datetime.now(),
+                train_score=self.__score_func(model, self._dataset.X_train, self._dataset.y_train, False),
+                dev_score=self.__score_func(model, self._dataset.X_dev, self._dataset.y_dev, False),
+                test_score=self.__score_func(model, self._dataset.X_test, self._dataset.y_test, False),
+                train_score_base=self.__score_func_base(model, self._dataset.X_train, self._dataset.y_train),
+                dev_score_base=self.__score_func_base(model, self._dataset.X_dev, self._dataset.y_dev),
+                test_score_base=self.__score_func_base(model, self._dataset.X_test, self._dataset.y_test),
+                score_metric=self._score_metric_name,
+                base_score_metric=self._base_score_metric_name
+            )
+            results.save(models_dir + '_no_weights')
+            self._logger.info("Base performance on test: {}".format(results.test_score_base))
+            self._logger.info("Performance on test without weights: {}".format(results.test_score))
+
+            self._logger.info("Base performance on train: {}".format(results.train_score_base))
+            self._logger.info("Performance on train without weights: {}".format(results.train_score))
+
+            self._logger.info("Base performance on dev: {}".format(results.dev_score_base))
+            self._logger.info("Performance on dev without weights: {}".format(results.dev_score))
diff --git a/code/bolsonaro/visualization/plotter.py b/code/bolsonaro/visualization/plotter.py
index 7d2cde23d24df4fb3f41cf5413b3769fc8d9e959..5a5f72ad9fade836dcfed3c2ef6f452653dcf3d1 100644
--- a/code/bolsonaro/visualization/plotter.py
+++ b/code/bolsonaro/visualization/plotter.py
@@ -109,16 +109,16 @@ class Plotter(object):
 
         fig, ax = plt.subplots()
 
-        n = len(all_experiment_scores)
+        nb_experiments = len(all_experiment_scores)
 
         """
        Get as many different colors from the specified cmap (here nipy_spectral) as there are curve to plot.
""" - colors = Plotter.get_colors_from_cmap(n) + colors = Plotter.get_colors_from_cmap(nb_experiments) - # For each curve to plot - for i in range(n): + # For each curve to plot + for i in range(nb_experiments): # Retreive the scores in a list for each seed experiment_scores = list(all_experiment_scores[i].values()) # Compute the mean and the std for the CI diff --git a/code/compute_results.py b/code/compute_results.py index 01d710e0c3a05f792ced181be62e13c9091171ac..5f7fac2c7718cf887d3d83a5b3a7eb9cdebfb9d9 100644 --- a/code/compute_results.py +++ b/code/compute_results.py @@ -17,7 +17,7 @@ def retreive_extracted_forest_sizes_number(models_dir, experiment_id): extracted_forest_sizes_root_path = experiment_seed_path + os.sep + 'extracted_forest_sizes' return len(os.listdir(extracted_forest_sizes_root_path)) -def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_dir, experiment_id): +def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_dir, experiment_id, weights=True): experiment_id_path = models_dir + os.sep + str(experiment_id) # models/{experiment_id} experiment_seed_root_path = experiment_id_path + os.sep + 'seeds' # models/{experiment_id}/seeds @@ -28,6 +28,7 @@ def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_d experiment_train_scores = dict() experiment_dev_scores = dict() experiment_test_scores = dict() + experiment_weights = dict() all_extracted_forest_sizes = list() # Used to check if all losses were computed using the same metric (it should be the case) @@ -44,14 +45,19 @@ def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_d experiment_train_scores[seed] = list() experiment_dev_scores[seed] = list() experiment_test_scores[seed] = list() + experiment_weights[seed] = list() # List the forest sizes in models/{experiment_id}/seeds/{seed}/extracted_forest_sizes extracted_forest_sizes = os.listdir(extracted_forest_sizes_root_path) + extracted_forest_sizes = [nb_tree for nb_tree in extracted_forest_sizes if not 'no_weights' in nb_tree ] extracted_forest_sizes.sort(key=int) all_extracted_forest_sizes.append(list(map(int, extracted_forest_sizes))) for extracted_forest_size in extracted_forest_sizes: # models/{experiment_id}/seeds/{seed}/extracted_forest_sizes/{extracted_forest_size} - extracted_forest_size_path = extracted_forest_sizes_root_path + os.sep + extracted_forest_size + if weights: + extracted_forest_size_path = extracted_forest_sizes_root_path + os.sep + extracted_forest_size + else: + extracted_forest_size_path = extracted_forest_sizes_root_path + os.sep + extracted_forest_size + '_no_weights' # Load models/{experiment_id}/seeds/{seed}/extracted_forest_sizes/{extracted_forest_size}/model_raw_results.pickle file model_raw_results = ModelRawResults.load(extracted_forest_size_path) # Save the scores @@ -60,6 +66,8 @@ def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_d experiment_test_scores[seed].append(model_raw_results.test_score) # Save the metric experiment_score_metrics.append(model_raw_results.score_metric) + # Save the weights + #experiment_weights[seed].append(model_raw_results.model_weights) # Sanity checks if len(set(experiment_score_metrics)) > 1: @@ -67,7 +75,8 @@ def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_d if len(set([sum(extracted_forest_sizes) for extracted_forest_sizes in all_extracted_forest_sizes])) != 1: raise ValueError("The extracted forest sizes aren't the sames across seeds.") - return 
+    return experiment_train_scores, experiment_dev_scores, experiment_test_scores, \
+        all_extracted_forest_sizes[0], experiment_score_metrics[0]#, experiment_weights
 
 def extract_scores_across_seeds_and_forest_size(models_dir, results_dir, experiment_id, extracted_forest_sizes_number):
     experiment_id_path = models_dir + os.sep + str(experiment_id) # models/{experiment_id}
@@ -120,6 +129,7 @@ if __name__ == "__main__":
 
     DEFAULT_RESULTS_DIR = os.environ["project_dir"] + os.sep + 'results'
     DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models'
+    DEFAULT_PLOT_WEIGHT_DENSITY = False
 
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument('--stage', nargs='?', type=int, required=True, help='Specify the stage number among [1, 5].')
@@ -130,6 +140,7 @@ if __name__ == "__main__":
     parser.add_argument('--dataset_name', nargs='?', type=str, required=True, help='Specify the dataset name. TODO: read it from models dir directly.')
     parser.add_argument('--results_dir', nargs='?', type=str, default=DEFAULT_RESULTS_DIR, help='The output directory of the results.')
     parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
+    parser.add_argument('--plot_weight_density', action='store_true', default=DEFAULT_PLOT_WEIGHT_DENSITY, help='Plot the weight density. Only working for regressor models for now.')
     args = parser.parse_args()
 
     if args.stage not in list(range(1, 6)):
@@ -347,9 +358,17 @@ if __name__ == "__main__":
             extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[1])
         # omp_with_params
         logger.info('Loading omp_with_params experiment scores...')
+        """omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \
+            omp_with_params_experiment_score_metric, experiment_weights = extract_scores_across_seeds_and_extracted_forest_sizes(
+            args.models_dir, args.results_dir, args.experiment_ids[2])"""
         omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \
             omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
                 args.models_dir, args.results_dir, args.experiment_ids[2])
+        # omp_with_params without weights
+        logger.info('Loading omp_with_params (no weights) experiment scores...')
+        omp_with_params_without_weights_train_scores, omp_with_params_without_weights_dev_scores, omp_with_params_without_weights_test_scores, _, \
+            omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
+                args.models_dir, args.results_dir, args.experiment_ids[2], weights=False)
 
         """# base_with_params
         logger.info('Loading base_with_params experiment scores 2...')
@@ -369,13 +388,14 @@ if __name__ == "__main__":
             raise ValueError('Score metrics of all experiments must be the same.')
         experiments_score_metric = base_with_params_experiment_score_metric
 
-        output_path = os.path.join(args.results_dir, args.dataset_name, 'stage4')
+        output_path = os.path.join(args.results_dir, args.dataset_name, 'stage4_fix')
         pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
 
         Plotter.plot_stage2_losses(
             file_path=output_path + os.sep + 'losses.png',
-            all_experiment_scores=[base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores],
-            all_labels=['base', 'random', 'omp'],
+            all_experiment_scores=[base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores,
+                omp_with_params_without_weights_test_scores],
+            all_labels=['base', 'random', 'omp', 'omp_without_weights'],
             x_value=with_params_extracted_forest_sizes,
             xlabel='Number of trees extracted',
             ylabel=experiments_score_metric,
diff --git a/code/ensemble_selection.py b/code/ensemble_selection.py
new file mode 100644
index 0000000000000000000000000000000000000000..a09a85e9f726e1dbe1df962da53416bbbb0237e3
--- /dev/null
+++ b/code/ensemble_selection.py
@@ -0,0 +1,97 @@
+# Implementation of the paper 'Ensemble selection from libraries of models' by Rich Caruana et al.
+# A library of trees is trained; the ensemble is then grown greedily, at each step adding
+# the tree that most improves the r2 score of the averaged predictions on the dev set.
+
+from sklearn.datasets import fetch_california_housing
+from sklearn.model_selection import train_test_split
+from sklearn.tree import DecisionTreeRegressor
+import numpy as np
+from sklearn.metrics import r2_score
+import matplotlib.pyplot as plt
+
+(data, target) = fetch_california_housing(return_X_y=True)
+X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=10000, random_state=2019)
+X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=3000, random_state=2019)
+
+criterion_arr = ["mse"]#, "friedman_mse", "mae"]
+splitter_arr = ["best"]#, "random"]
+depth_arr = [i for i in range(5, 20, 1)]
+min_samples_split_arr = [i for i in range(2, 20, 1)]
+min_samples_leaf_arr = [i for i in range(2, 20, 1)]
+max_features_arr = ["sqrt"]#["auto", "sqrt", "log2"]
+
+library = list()
+
+# Build the library: one tree per hyper-parameter combination.
+for criterion in criterion_arr:
+    for splitter in splitter_arr:
+        for depth in depth_arr:
+            for min_samples_split in min_samples_split_arr:
+                for min_samples_leaf in min_samples_leaf_arr:
+                    for max_features in max_features_arr:
+                        t = DecisionTreeRegressor(criterion=criterion, splitter=splitter, max_depth=depth, min_samples_split=min_samples_split,
+                            min_samples_leaf=min_samples_leaf, max_features=max_features, random_state=2017)
+                        t.fit(X_train, y_train)
+                        library.append(t)
+
+print("library size", len(library))
+
+# r2 score of every tree of the library on the dev set.
+scores_list = list()
+for classif in library:
+    r2 = classif.score(X_val, y_val)
+    scores_list.append(r2)
+
+print("scores", len(scores_list))
+
+np_scores_list = np.array(scores_list)
models",ens_sel) +#print("selected_scores", scores_sel) + +trees_in_forest = list() +perf_prun_forest = list() + +for num_sel_tree in [2, 4, 6, 8, 10, 15, 20, 30, 40, 50]: + class_list = list(library) + print("class list", len(class_list)) + m = np.argmax(np_scores_list) + ens_sel = [class_list[m]] + #scores_sel = [scores_list[m]] + #del scores_list[m] + temp_pred = class_list[m].predict(X_val) + del class_list[m] + #print("prima di entrare nel for", len(class_list)) + for k in range(num_sel_tree-1): + cand_index = 0 + r2_best = -10000 + #print("ad ogni loop", len(class_list)) + for j in range(len(class_list)): + temp_pred = np.vstack((temp_pred, class_list[j].predict(X_val))) + temp_mean = np.mean(temp_pred, axis=0) + #print("temp pred and temp mean shapes", temp_pred.shape, temp_mean.shape) + r2_temp = r2_score(y_val, temp_mean) + if (r2_temp > r2_best): + r2_best = r2_temp + cand_index = j + temp_pred = np.delete(temp_pred, -1, 0) + #print(temp_pred.shape) + ens_sel.append(class_list[cand_index]) + #scores_sel.append(scores_list[cand_index]) + temp_pred = np.vstack((temp_pred, class_list[cand_index].predict(X_val))) + #del scores_list[cand_index] + del class_list[cand_index] + + #print("ens_sel", len(ens_sel)) + test_list = list() + for mod in ens_sel: + test_pred = mod.predict(X_test) + test_list.append(test_pred) + #print("scores sep", mod.score(X_test, y_test)) + + test_list = np.array(test_list) + #print("test list shape", test_list.shape) + test_mean = np.mean(test_list, axis=0) + #print("test list shape", test_mean.shape) + r2_test = r2_score(test_mean, y_test) + #print(r2_test) + #print(ens_sel[0].score(X_test, y_test), ens_sel[1].score(X_test, y_test)) + print(num_sel_tree, r2_test) + trees_in_forest.append(num_sel_tree) + perf_prun_forest.append(r2_test) + + +print(trees_in_forest) +print(perf_prun_forest) +ax = plt.gca() +ax.plot(trees_in_forest, perf_prun_forest, label='ensemble selection') +ax.legend() +#plt.title('fashion mnist') +plt.xlabel('num trees') +plt.ylabel('r2 score') +plt.savefig("ensemble_selection.pdf") +plt.show() diff --git a/code/forest_similarity.py b/code/forest_similarity.py new file mode 100644 index 0000000000000000000000000000000000000000..2f772a93109f26b23c889f4e2eb7b021ae85b3d0 --- /dev/null +++ b/code/forest_similarity.py @@ -0,0 +1,85 @@ +from sklearn.datasets import fetch_california_housing +from sklearn.model_selection import train_test_split +from sklearn.tree import DecisionTreeRegressor +from sklearn.externals import joblib +import numpy as np +from sklearn.metrics import r2_score +from sklearn.ensemble import RandomForestRegressor +import matplotlib.pyplot as plt + +(data, target) = fetch_california_housing(return_X_y=True) +X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=10000, random_state=2019) +X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=3000, random_state=2019) + +num_trees = 100 +prun_for_size=[2, 4, 6, 8, 10, 12, 15, 20] + +randfor = RandomForestRegressor(num_trees, max_depth=7, random_state=2019) +randfor.fit(X_train, y_train) +randfor_pred = randfor.score(X_val, y_val) + +trees_forest = randfor.estimators_ + +trees_in_forest = list() +perf_prun_forest = list() + +for k in range(len(prun_for_size)): + ens_sel = list() + trees_list = list(randfor.estimators_) + #print("dovrebbe essere la taglia iniziale", len(trees_list)) + for j in range(num_trees - prun_for_size[k]): + best_simil = 100000 + cand_ind = 0 + for i in range(len(trees_list)): + lonely_tree = trees_list[i] + del 
+            del trees_list[i]
+            val_list = list()
+            for tree in trees_list:
+                val_pred = tree.predict(X_val)
+                val_list.append(val_pred)
+            val_list = np.array(val_list)
+            val_mean = np.mean(val_list, axis=0)
+            r2_val = r2_score(y_val, val_mean)
+            temp_simil = abs(randfor_score - r2_val)
+            if (temp_simil < best_simil):
+                cand_ind = i
+                best_simil = temp_simil
+            trees_list.insert(i, lonely_tree)
+        ens_sel.append(trees_list[cand_ind])
+        del trees_list[cand_ind]
+
+    prun_for = list(set(trees_forest) - set(ens_sel))
+    print("prun_for", len(prun_for))
+    print("trees forest", len(trees_forest))
+    print("ens_sel", len(ens_sel))
+
+    # Evaluate the pruned forest on the test set.
+    test_list = list()
+    for mod in prun_for:
+        test_pred = mod.predict(X_test)
+        test_list.append(test_pred)
+
+    test_list = np.array(test_list)
+    test_mean = np.mean(test_list, axis=0)
+    r2_test = r2_score(y_test, test_mean)
+    print(len(prun_for), r2_test)
+    trees_in_forest.append(len(prun_for))
+    perf_prun_forest.append(r2_test)
+
+print(trees_in_forest)
+print(perf_prun_forest)
+ax = plt.gca()
+ax.plot(trees_in_forest, perf_prun_forest, label='pruned forest')
+ax.legend()
+plt.xlabel('num trees')
+plt.ylabel('r2 score')
+plt.savefig("pruned_forest.pdf")
+plt.show()
diff --git a/experiments/iris/stage1/none_with_params.json b/experiments/iris/stage1/none_with_params.json
index b26a467d9ad76e6643b39bc952f1a02e956004dc..c6915e3989c24dcee31b74c67415d86a50e50b0f 100644
--- a/experiments/iris/stage1/none_with_params.json
+++ b/experiments/iris/stage1/none_with_params.json
@@ -13,9 +13,11 @@
     "test_size": 0.2,
     "random_seed_number": 1,
     "seeds": [
-        58,
-        43535,
-        234234
+        1,
+        2,
+        3,
+        4,
+        5
     ],
     "subsets_used": "train,dev",
     "normalize_weights": false,
diff --git a/experiments/iris/stage1/omp_with_params.json b/experiments/iris/stage1/omp_with_params.json
index 35cbb39d2a7d53f87401b9d2ddba05287beeeef9..941788592683f9ffad87edbce1a3924cd7d14895 100644
--- a/experiments/iris/stage1/omp_with_params.json
+++ b/experiments/iris/stage1/omp_with_params.json
@@ -13,9 +13,11 @@
     "test_size": 0.2,
     "random_seed_number": 1,
     "seeds": [
-        58,
-        43535,
-        234234
+        1,
+        2,
+        3,
+        4,
+        5
     ],
     "subsets_used": "train,dev",
     "normalize_weights": false,
diff --git a/results/boston/stage4/losses.png b/results/boston/stage4/losses.png
index c5d57ce0b386934e9bd2cadcce5b44f8fb8a40d4..0762b7c1057045bb08a9d698e82446baf3558e22 100644
Binary files a/results/boston/stage4/losses.png and b/results/boston/stage4/losses.png differ
diff --git a/results/iris/stage1/losses.png b/results/iris/stage1/losses.png
index 2a120da925eef72954d16ce98f3b1bb72cdb43e9..2e8d2608b74f13894c5cc006e70d38ee031653a2 100644
Binary files a/results/iris/stage1/losses.png and b/results/iris/stage1/losses.png differ
diff --git a/scripts/run_compute_results.sh b/scripts/run_compute_results.sh
index f9f130e19c4d467e9d0416a051b8353f071b42dd..d67571d78a9499b75a2c4558a517b01035025beb 100644
--- a/scripts/run_compute_results.sh
+++ b/scripts/run_compute_results.sh
@@ -1,7 +1,5 @@
-for dataset in diamonds california_housing boston iris diabetes digits linnerud wine breast_cancer olivetti_faces 20newsgroups_vectorized lfw_pairs
+seeds='1 2 3'
+for dataset in boston iris diabetes digits linnerud wine breast_cancer olivetti_faces 
20newsgroups_vectorized lfw_pairs california_housing diamonds do - python code/compute_results.py --stage=1 --experiment_ids 1 2 3 4 5 6 --dataset_name=$dataset --models_dir=models/$dataset/stage1 - python code/compute_results.py --stage=2 --experiment_ids 1 2 3 4 --dataset_name=$dataset --models_dir=models/$dataset/stage2 - python code/compute_results.py --stage=3 --experiment_ids 1 2 3 --dataset_name=$dataset --models_dir=models/$dataset/stage3 python code/compute_results.py --stage=4 --experiment_ids 1 2 3 --dataset_name=$dataset --models_dir=models/$dataset/stage4 done