from bolsonaro.models.model_raw_results import ModelRawResults
from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
from bolsonaro.error_handling.logger_factory import LoggerFactory
from bolsonaro.data.task import Task
from . import LOG_PATH

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
import time
import datetime
import numpy as np


class Trainer(object):
    """
    Class capable of fitting a model object to some prepared data (`init` then `train`), then of
    evaluating it and saving the results (`compute_results`).

    Model dispatch is done on the exact type of the model (`type(model)`), not with `isinstance`:
    subclasses of the supported models are deliberately not matched.
    """

    # Plain sklearn forests: fitted with (X, y) only, no OMP step and no "no weights" prediction.
    _PLAIN_FOREST_TYPES = (RandomForestRegressor, RandomForestClassifier)
    # Models scored with the regression metrics.
    _REGRESSOR_TYPES = (OmpForestRegressor, RandomForestRegressor, SimilarityForestRegressor)
    # Models scored with the classification metrics.
    _CLASSIFIER_TYPES = (OmpForestBinaryClassifier, OmpForestMulticlassClassifier, RandomForestClassifier)
    # Models whose base (unpruned) prediction comes from `predict_base_estimator`.
    _OMP_FOREST_TYPES = (OmpForestRegressor, OmpForestBinaryClassifier, OmpForestMulticlassClassifier)

    def __init__(self, dataset, regression_score_metric=mean_squared_error, classification_score_metric=accuracy_score,
                 base_regression_score_metric=mean_squared_error, base_classification_score_metric=accuracy_score):
        """
        :param dataset: Object with X_train, y_train, X_dev, y_dev, X_test and y_test attributes, and a
            `task` attribute that is compared against `Task.REGRESSION`.
        :param regression_score_metric: Callable `(y_true, y_pred)` used to score regressors.
        :param classification_score_metric: Callable `(y_true, y_pred)` used to score classifiers.
        :param base_regression_score_metric: Callable `(y_true, y_pred)` used to score the base
            prediction of regressors.
        :param base_classification_score_metric: Callable `(y_true, y_pred)` used to score the base
            prediction of classifiers.
        """
        self._dataset = dataset
        self._logger = LoggerFactory.create(LOG_PATH, __name__)
        self._regression_score_metric = regression_score_metric
        self._classification_score_metric = classification_score_metric
        self._base_regression_score_metric = base_regression_score_metric
        self._base_classification_score_metric = base_classification_score_metric

        # The metric names stored with the results are chosen from the dataset's task.
        is_regression = dataset.task == Task.REGRESSION
        self._score_metric_name = regression_score_metric.__name__ if is_regression \
            else classification_score_metric.__name__
        self._base_score_metric_name = base_regression_score_metric.__name__ if is_regression \
            else base_classification_score_metric.__name__

    @property
    def score_metric_name(self):
        """Name (`__name__`) of the metric used to score the model for the dataset's task."""
        return self._score_metric_name

    @property
    def base_score_metric_name(self):
        """Name (`__name__`) of the metric used to score the base prediction for the dataset's task."""
        return self._base_score_metric_name

    def init(self, model, subsets_used='train,dev'):
        """
        Select which subsets of the dataset `train` will use for the forest and for the OMP step.

        :param model: The model that will later be given to `train`.
        :param subsets_used: Only read for plain random forests: 'train,dev' fits the forest on the
            train subset, any other value fits it on train+dev. For every other model the value of
            `model.models_parameters.subsets_used` is used instead.
        :raises ValueError: If `model.models_parameters.subsets_used` is not one of 'train,dev',
            'train+dev,train+dev' or 'train,train+dev'.
        """
        dataset = self._dataset

        if type(model) in self._PLAIN_FOREST_TYPES:
            if subsets_used == 'train,dev':
                self._X_forest = dataset.X_train
                self._y_forest = dataset.y_train
                self._logger.debug('Fitting the forest on train subset')
            else:
                self._X_forest = np.concatenate([dataset.X_train, dataset.X_dev])
                self._y_forest = np.concatenate([dataset.y_train, dataset.y_dev])
                self._logger.debug('Fitting the forest on train+dev subsets')
            return

        model_subsets = model.models_parameters.subsets_used
        if model_subsets == 'train,dev':
            self._X_forest = dataset.X_train
            self._y_forest = dataset.y_train
            self._X_omp = dataset.X_dev
            self._y_omp = dataset.y_dev
            self._logger.debug('Fitting the forest on train subset and OMP on dev subset.')
        elif model_subsets == 'train+dev,train+dev':
            self._X_forest = np.concatenate([dataset.X_train, dataset.X_dev])
            self._X_omp = self._X_forest
            self._y_forest = np.concatenate([dataset.y_train, dataset.y_dev])
            self._y_omp = self._y_forest
            self._logger.debug('Fitting both the forest and OMP on train+dev subsets.')
        elif model_subsets == 'train,train+dev':
            self._X_forest = dataset.X_train
            self._y_forest = dataset.y_train
            self._X_omp = np.concatenate([dataset.X_train, dataset.X_dev])
            self._y_omp = np.concatenate([dataset.y_train, dataset.y_dev])
            self._logger.debug('Fitting the forest on train subset and OMP on train+dev subsets.')
        else:
            raise ValueError("Unknown specified subsets_used parameter '{}'".format(model_subsets))

    def train(self, model):
        """
        Fit the model on the subsets selected by `init` and record the wall-clock fitting time.

        `init` must have been called with the same model beforehand, since it sets the data
        attributes read here.

        :param model: An instance of either RandomForestRegressor, RandomForestClassifier, OmpForestRegressor,
            OmpForestBinaryClassifier, OmpForestMulticlassClassifier.
        :return: None
        """
        self._logger.debug('Training model using train set...')
        self._begin_time = time.time()
        if type(model) in self._PLAIN_FOREST_TYPES:
            model.fit(
                X=self._X_forest,
                y=self._y_forest
            )
        else:
            # Every other model receives the forest data followed by the OMP data.
            model.fit(
                self._X_forest,
                self._y_forest,
                self._X_omp,
                self._y_omp
            )
        self._end_time = time.time()

    def __score_func(self, model, X, y_true, weights=True):
        """
        Score the model's prediction on X against y_true.

        :param model: A model of one of the supported types.
        :param X: Data to predict on.
        :param y_true: Expected targets for X.
        :param weights: If True use `model.predict`, otherwise `model.predict_no_weights`.
        :return: Value returned by the regression or classification score metric.
        :raises ValueError: If the type of `model` is not supported.
        """
        model_type = type(model)
        if model_type in self._REGRESSOR_TYPES:
            metric = self._regression_score_metric
        elif model_type in self._CLASSIFIER_TYPES:
            metric = self._classification_score_metric
        else:
            raise ValueError("Unsupported model type '{}'".format(model_type.__name__))

        y_pred = model.predict(X) if weights else model.predict_no_weights(X)
        if model_type is OmpForestBinaryClassifier:
            # Turn the raw output into a sign; a sign of exactly 0 is counted as 1.
            # NOTE(review): presumably the binary labels are -1/1 -- confirm against the dataset loader.
            y_pred = np.sign(y_pred)
            y_pred = np.where(y_pred == 0, 1, y_pred)
        return metric(y_true, y_pred)

    def __score_func_base(self, model, X, y_true):
        """
        Score the base prediction of the model on X against y_true.

        OMP forests are scored on `predict_base_estimator`; the other supported models have no
        separate base prediction and are scored on `predict`.

        :param model: A model of one of the supported types.
        :param X: Data to predict on.
        :param y_true: Expected targets for X.
        :return: Value returned by the base regression or base classification score metric.
        :raises ValueError: If the type of `model` is not supported.
        """
        model_type = type(model)
        if model_type in self._REGRESSOR_TYPES:
            metric = self._base_regression_score_metric
        elif model_type in self._CLASSIFIER_TYPES:
            metric = self._base_classification_score_metric
        else:
            raise ValueError("Unsupported model type '{}'".format(model_type.__name__))

        if model_type in self._OMP_FOREST_TYPES:
            y_pred = model.predict_base_estimator(X)
        else:
            y_pred = model.predict(X)
        return metric(y_true, y_pred)

    def __build_results(self, model, model_weights, weights):
        """Score the model on the train, dev and test subsets and wrap everything in a ModelRawResults."""
        dataset = self._dataset
        return ModelRawResults(
            model_weights=model_weights,
            training_time=self._end_time - self._begin_time,
            datetime=datetime.datetime.now(),
            train_score=self.__score_func(model, dataset.X_train, dataset.y_train, weights),
            dev_score=self.__score_func(model, dataset.X_dev, dataset.y_dev, weights),
            test_score=self.__score_func(model, dataset.X_test, dataset.y_test, weights),
            train_score_base=self.__score_func_base(model, dataset.X_train, dataset.y_train),
            dev_score_base=self.__score_func_base(model, dataset.X_dev, dataset.y_dev),
            test_score_base=self.__score_func_base(model, dataset.X_test, dataset.y_test),
            score_metric=self._score_metric_name,
            base_score_metric=self._base_score_metric_name
        )

    def __log_results(self, results, score_qualifier=''):
        """Log the base score and the score of each subset; `score_qualifier` is appended to the score label."""
        for subset in ('test', 'train', 'dev'):
            self._logger.info("Base performance on {}: {}".format(
                subset, getattr(results, subset + '_score_base')))
            self._logger.info("Performance on {}{}: {}".format(
                subset, score_qualifier, getattr(results, subset + '_score')))

    def compute_results(self, model, models_dir):
        """
        Score the trained model, save the results and log them.

        `train` must have been called beforehand, since the training time is computed from the
        timestamps it records. For every model that is not a plain random forest, a second set of
        results computed with `predict_no_weights` is saved under `models_dir + '_no_weights'`.

        :param model: The trained model, of one of the supported types.
        :param models_dir: Where the results will be saved
        :raises ValueError: If the type of `model` is not supported.
        """
        model_type = type(model)
        if model_type in (OmpForestRegressor, OmpForestBinaryClassifier):
            model_weights = model._omp.coef_
        elif model_type is OmpForestMulticlassClassifier:
            model_weights = model._dct_class_omp
        else:
            model_weights = ''

        results = self.__build_results(model, model_weights, True)
        results.save(models_dir)
        self.__log_results(results)

        if model_type not in self._PLAIN_FOREST_TYPES:
            results = self.__build_results(model, '', False)
            results.save(models_dir + '_no_weights')
            # Only the non-base score depends on the weights, so it is the one labelled as such.
            self.__log_results(results, ' without weights')