Skip to content
Snippets Groups Projects
Select Git revision
  • 881106aee61f6e431ae764d01f064bb4418427fa
  • master default protected
  • correlation
  • 24-non-negative-omp
  • 15-integration-sota
  • 20-coherence-des-arbres-de-predictions
  • 19-add-some-tests
  • 13-visualization
  • 17-adding-new-datasets
  • 12-experiment-pipeline
  • 14-correction-of-multiclass-classif
  • archive/10-gridsearching-of-the-base-forest
  • archive/farah_notation_and_related_work
  • archive/wip_clean_scripts
  • archive/4-implement-omp_forest_classifier
  • archive/5-add-plots-2
  • archive/Leo_Add_first_notebook
17 results

trainer.py

Blame
  • user avatar
    Charly Lamothe authored
    881106ae
    History
    trainer.py 7.95 KiB
    from bolsonaro.models.model_raw_results import ModelRawResults
    from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
    from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
    from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
    from bolsonaro.error_handling.logger_factory import LoggerFactory
    from bolsonaro.data.task import Task
    from . import LOG_PATH
    
    from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
    from sklearn.metrics import mean_squared_error, accuracy_score
    import time
    import datetime
    import numpy as np
    
    
    class Trainer(object):
        """
        Fit any supported model object to prepared data, then evaluate it and save
        the results through the `train` / `compute_results` methods.
        """

        def __init__(self, dataset, regression_score_metric=mean_squared_error, classification_score_metric=accuracy_score,
            base_regression_score_metric=mean_squared_error, base_classification_score_metric=accuracy_score):
            """
            :param dataset: Object with X_train, y_train, X_dev, y_dev, X_test and y_test attributes
            :param regression_score_metric: Metric used to score the (pruned) model on regression tasks
            :param classification_score_metric: Metric used to score the (pruned) model on classification tasks
            :param base_regression_score_metric: Metric used to score the base (unpruned) forest on regression tasks
            :param base_classification_score_metric: Metric used to score the base (unpruned) forest on classification tasks
            """
            self._dataset = dataset
            self._logger = LoggerFactory.create(LOG_PATH, __name__)
            self._regression_score_metric = regression_score_metric
            self._classification_score_metric = classification_score_metric
            self._base_regression_score_metric = base_regression_score_metric
            self._base_classification_score_metric = base_classification_score_metric
            # Resolve the human-readable metric names once, based on the task kind.
            self._score_metric_name = regression_score_metric.__name__ if dataset.task == Task.REGRESSION \
                else classification_score_metric.__name__
            self._base_score_metric_name = base_regression_score_metric.__name__ if dataset.task == Task.REGRESSION \
                else base_classification_score_metric.__name__

        @property
        def score_metric_name(self):
            """Name of the metric used to score the (pruned) model."""
            return self._score_metric_name

        @property
        def base_score_metric_name(self):
            """Name of the metric used to score the base (unpruned) forest."""
            return self._base_score_metric_name

        def init(self, model, subsets_used='train,dev'):
            """
            Select which data subsets the forest (and, when applicable, OMP) will be fitted on.

            :param model: The model about to be trained; plain sklearn forests only use the
                forest subsets, while OMP-based models also need an OMP subset.
            :param subsets_used: Only consulted for plain sklearn forests; OMP-based models
                carry their own `models_parameters.subsets_used` setting.
            :raises ValueError: If the model's `subsets_used` value is not recognized.
            """
            if type(model) in [RandomForestRegressor, RandomForestClassifier]:
                # Plain sklearn forest: no OMP stage, only the forest subsets matter.
                if subsets_used == 'train,dev':
                    self._X_forest = self._dataset.X_train
                    self._y_forest = self._dataset.y_train
                else:
                    self._X_forest = np.concatenate([self._dataset.X_train, self._dataset.X_dev])
                    self._y_forest = np.concatenate([self._dataset.y_train, self._dataset.y_dev])
                self._logger.debug('Fitting the forest on train subset')
            elif model.models_parameters.subsets_used == 'train,dev':
                self._X_forest = self._dataset.X_train
                self._y_forest = self._dataset.y_train
                self._X_omp = self._dataset.X_dev
                self._y_omp = self._dataset.y_dev
                self._logger.debug('Fitting the forest on train subset and OMP on dev subset.')
            elif model.models_parameters.subsets_used == 'train+dev,train+dev':
                # Forest and OMP share the same concatenated train+dev data.
                self._X_forest = np.concatenate([self._dataset.X_train, self._dataset.X_dev])
                self._X_omp = self._X_forest
                self._y_forest = np.concatenate([self._dataset.y_train, self._dataset.y_dev])
                self._y_omp = self._y_forest
                self._logger.debug('Fitting both the forest and OMP on train+dev subsets.')
            elif model.models_parameters.subsets_used == 'train,train+dev':
                self._X_forest = self._dataset.X_train
                self._y_forest = self._dataset.y_train
                self._X_omp = np.concatenate([self._dataset.X_train, self._dataset.X_dev])
                self._y_omp = np.concatenate([self._dataset.y_train, self._dataset.y_dev])
            else:
                raise ValueError("Unknown specified subsets_used parameter '{}'".format(model.models_parameters.subsets_used))

        def train(self, model):
            """
            Fit the model on the subsets selected by a prior call to `init`, timing the fit.

            :param model: An instance of either RandomForestRegressor, RandomForestClassifier, OmpForestRegressor,
                OmpForestBinaryClassifier, OmpForestMulticlassClassifier.
            :return: None; training duration is recorded in `_begin_time`/`_end_time`.
            """
            self._logger.debug('Training model using train set...')
            self._begin_time = time.time()
            if type(model) in [RandomForestRegressor, RandomForestClassifier]:
                model.fit(
                    X=self._X_forest,
                    y=self._y_forest
                )
            else:
                # OMP-based models take both the forest data and the OMP weighting data.
                model.fit(
                    self._X_forest,
                    self._y_forest,
                    self._X_omp,
                    self._y_omp
                )
            self._end_time = time.time()

        def __score_func(self, model, X, y_true):
            """
            Score the model's predictions on (X, y_true) with the task-appropriate metric.

            :raises ValueError: If the model type is not one of the supported classes
                (previously this fell through to an UnboundLocalError).
            """
            if type(model) in [OmpForestRegressor, RandomForestRegressor, SimilarityForestRegressor]:
                y_pred = model.predict(X)
                result = self._regression_score_metric(y_true, y_pred)
            elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier, RandomForestClassifier]:
                y_pred = model.predict(X)
                if type(model) is OmpForestBinaryClassifier:
                    # Binary OMP outputs continuous scores; round to hard labels for the metric.
                    y_pred = y_pred.round()
                result = self._classification_score_metric(y_true, y_pred)
            else:
                raise ValueError("Unsupported model type '{}'".format(type(model).__name__))
            return result

        def __score_func_base(self, model, X, y_true):
            """
            Score the base (unpruned) forest on (X, y_true) with the base metric.

            For OMP models this uses `predict_base_estimator` (the underlying forest);
            for plain forests the model itself is the base estimator.

            :raises ValueError: If the model type is not one of the supported classes
                (previously this fell through to an UnboundLocalError).
            """
            if type(model) == OmpForestRegressor:
                y_pred = model.predict_base_estimator(X)
                result = self._base_regression_score_metric(y_true, y_pred)
            elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier]:
                y_pred = model.predict_base_estimator(X)
                result = self._base_classification_score_metric(y_true, y_pred)
            elif type(model) == RandomForestClassifier:
                y_pred = model.predict(X)
                result = self._base_classification_score_metric(y_true, y_pred)
            elif type(model) in [RandomForestRegressor, SimilarityForestRegressor]:
                y_pred = model.predict(X)
                result = self._base_regression_score_metric(y_true, y_pred)
            else:
                raise ValueError("Unsupported model type '{}'".format(type(model).__name__))
            return result

        def compute_results(self, model, models_dir):
            """
            Evaluate the trained model on train/dev/test, persist the results and log them.

            :param model: The trained model to evaluate
            :param models_dir: Where the results will be saved
            """
            model_weights = ''
            if type(model) in [OmpForestRegressor, OmpForestBinaryClassifier]:
                model_weights = model._omp.coef_
            elif type(model) == OmpForestMulticlassClassifier:
                model_weights = model._dct_class_omp
            # NOTE: a former `elif type(model) == OmpForestBinaryClassifier` branch was
            # unreachable dead code (that type is already caught by the first branch above)
            # and has been removed.

            results = ModelRawResults(
                model_weights=model_weights,
                training_time=self._end_time - self._begin_time,
                datetime=datetime.datetime.now(),
                train_score=self.__score_func(model, self._dataset.X_train, self._dataset.y_train),
                dev_score=self.__score_func(model, self._dataset.X_dev, self._dataset.y_dev),
                test_score=self.__score_func(model, self._dataset.X_test, self._dataset.y_test),
                train_score_base=self.__score_func_base(model, self._dataset.X_train, self._dataset.y_train),
                dev_score_base=self.__score_func_base(model, self._dataset.X_dev, self._dataset.y_dev),
                test_score_base=self.__score_func_base(model, self._dataset.X_test, self._dataset.y_test),
                score_metric=self._score_metric_name,
                base_score_metric=self._base_score_metric_name
            )
            results.save(models_dir)
            self._logger.info("Base performance on test: {}".format(results.test_score_base))
            self._logger.info("Performance on test: {}".format(results.test_score))

            self._logger.info("Base performance on train: {}".format(results.train_score_base))
            self._logger.info("Performance on train: {}".format(results.train_score))

            self._logger.info("Base performance on dev: {}".format(results.dev_score_base))
            self._logger.info("Performance on dev: {}".format(results.dev_score))