Skip to content
Snippets Groups Projects

Resolve "Adding new datasets"

Merged Leo Bouscarrat requested to merge 17-adding-new-datasets into master
Compare and Show latest version
1 file
+ 190
192
Compare changes
  • Side-by-side
  • Inline
+ 190
192
from bolsonaro.models.model_raw_results import ModelRawResults
from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
from bolsonaro.error_handling.logger_factory import LoggerFactory
from bolsonaro.data.task import Task
from . import LOG_PATH
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
import time
import datetime
import numpy as np
class Trainer(object):
    """
    Class capable of fitting any supported model object to some prepared data (`init` then `train`),
    then evaluating it and saving the results through the `compute_results` method.
    """

    def __init__(self, dataset, regression_score_metric=mean_squared_error, classification_score_metric=accuracy_score,
                 base_regression_score_metric=mean_squared_error, base_classification_score_metric=accuracy_score):
        """
        :param dataset: Object with X_train, y_train, X_dev, y_dev, X_test and y_test attributes, and a
            `task` attribute that is compared with `Task.REGRESSION` to pick the reported metric names.
        :param regression_score_metric: Callable `(y_true, y_pred)` scoring the predictions of regressors.
        :param classification_score_metric: Callable `(y_true, y_pred)` scoring the predictions of classifiers.
        :param base_regression_score_metric: Same as `regression_score_metric`, for the base scores.
        :param base_classification_score_metric: Same as `classification_score_metric`, for the base scores.
        """
        self._dataset = dataset
        self._logger = LoggerFactory.create(LOG_PATH, __name__)
        self._regression_score_metric = regression_score_metric
        self._classification_score_metric = classification_score_metric
        self._base_regression_score_metric = base_regression_score_metric
        self._base_classification_score_metric = base_classification_score_metric

        # Only the *names* stored in the results depend on the dataset task; the metric that is
        # actually applied is selected from the model type in the scoring methods.
        is_regression = dataset.task == Task.REGRESSION
        self._score_metric_name = regression_score_metric.__name__ if is_regression \
            else classification_score_metric.__name__
        self._base_score_metric_name = base_regression_score_metric.__name__ if is_regression \
            else base_classification_score_metric.__name__

    @property
    def score_metric_name(self):
        """Name of the metric reported as `score_metric` in the saved results."""
        return self._score_metric_name

    @property
    def base_score_metric_name(self):
        """Name of the metric reported as `base_score_metric` in the saved results."""
        return self._base_score_metric_name

    def _concatenate_train_dev(self):
        """Return the `(X, y)` pair made of the train subset followed by the dev subset."""
        X = np.concatenate([self._dataset.X_train, self._dataset.X_dev])
        y = np.concatenate([self._dataset.y_train, self._dataset.y_dev])
        return X, y

    def init(self, model, subsets_used='train,dev'):
        """
        Select the data subsets that `train` will fit the model on.

        :param model: The model about to be trained. Anything that is not exactly a RandomForestRegressor
            or RandomForestClassifier must expose `models_parameters.subsets_used`.
        :param subsets_used: Only read for plain random forests: 'train,dev' fits the forest on the train
            subset, any other value fits it on train+dev. For the other models the value comes from
            `model.models_parameters.subsets_used` and must be one of 'train,dev', 'train+dev,train+dev'
            or 'train,train+dev' (forest subsets, then OMP subsets).
        :raises ValueError: If `model.models_parameters.subsets_used` is not one of the supported values.
        """
        if type(model) in [RandomForestRegressor, RandomForestClassifier]:
            # A plain forest has no OMP step, so only the forest subsets are set.
            if subsets_used == 'train,dev':
                self._X_forest = self._dataset.X_train
                self._y_forest = self._dataset.y_train
                self._logger.debug('Fitting the forest on train subset')
            else:
                self._X_forest, self._y_forest = self._concatenate_train_dev()
                self._logger.debug('Fitting the forest on train+dev subsets')
            return

        model_subsets_used = model.models_parameters.subsets_used
        if model_subsets_used == 'train,dev':
            self._X_forest = self._dataset.X_train
            self._y_forest = self._dataset.y_train
            self._X_omp = self._dataset.X_dev
            self._y_omp = self._dataset.y_dev
            self._logger.debug('Fitting the forest on train subset and OMP on dev subset.')
        elif model_subsets_used == 'train+dev,train+dev':
            self._X_forest, self._y_forest = self._concatenate_train_dev()
            # Both steps share the very same arrays.
            self._X_omp = self._X_forest
            self._y_omp = self._y_forest
            self._logger.debug('Fitting both the forest and OMP on train+dev subsets.')
        elif model_subsets_used == 'train,train+dev':
            self._X_forest = self._dataset.X_train
            self._y_forest = self._dataset.y_train
            self._X_omp, self._y_omp = self._concatenate_train_dev()
            self._logger.debug('Fitting the forest on train subset and OMP on train+dev subsets.')
        else:
            raise ValueError("Unknown specified subsets_used parameter '{}'".format(model_subsets_used))

    def train(self, model):
        """
        Fit the model on the subsets selected by `init` and record the wall-clock training time.

        `init` must have been called beforehand, since it sets the subsets read here.

        :param model: An instance of either RandomForestRegressor, RandomForestClassifier, OmpForestRegressor,
            OmpForestBinaryClassifier, OmpForestMulticlassClassifier.
        :return: None
        """
        self._logger.debug('Training model using train set...')
        self._begin_time = time.time()
        if type(model) in [RandomForestRegressor, RandomForestClassifier]:
            model.fit(X=self._X_forest, y=self._y_forest)
        else:
            # Every other model is fitted with the forest subsets followed by the OMP subsets.
            model.fit(self._X_forest, self._y_forest, self._X_omp, self._y_omp)
        self._end_time = time.time()

    def __score_func(self, model, X, y_true, weights=True):
        """
        Score the predictions of the model on `(X, y_true)`.

        :param model: A fitted model; the metric is picked from its exact type (subclasses are not matched).
        :param X: Samples to predict.
        :param y_true: Expected targets for `X`.
        :param weights: When True use `model.predict`, otherwise `model.predict_no_weights`.
        :return: The value returned by the regression or classification score metric.
        :raises ValueError: If the model type is not supported.
        """
        model_type = type(model)
        if model_type in [OmpForestRegressor, RandomForestRegressor, SimilarityForestRegressor]:
            score_metric = self._regression_score_metric
        elif model_type in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier, RandomForestClassifier]:
            score_metric = self._classification_score_metric
        else:
            # Fail with an explicit message instead of an UnboundLocalError on the return value.
            raise ValueError("Unsupported model type '{}'".format(model_type.__name__))

        y_pred = model.predict(X) if weights else model.predict_no_weights(X)
        if model_type is OmpForestBinaryClassifier:
            # Turn the raw output into -1/+1 labels; an output of exactly 0 is mapped to +1.
            y_pred = np.sign(y_pred)
            y_pred = np.where(y_pred == 0, 1, y_pred)
        return score_metric(y_true, y_pred)

    def __score_func_base(self, model, X, y_true):
        """
        Score the base predictions of the model on `(X, y_true)`.

        OMP forests are scored through `predict_base_estimator`, the other supported models through
        `predict`.

        :param model: A fitted model; dispatch is on its exact type (subclasses are not matched).
        :param X: Samples to predict.
        :param y_true: Expected targets for `X`.
        :return: The value returned by the base regression or base classification score metric.
        :raises ValueError: If the model type is not supported.
        """
        model_type = type(model)
        if model_type is OmpForestRegressor:
            y_pred = model.predict_base_estimator(X)
            base_score_metric = self._base_regression_score_metric
        elif model_type in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier]:
            y_pred = model.predict_base_estimator(X)
            base_score_metric = self._base_classification_score_metric
        elif model_type is RandomForestClassifier:
            y_pred = model.predict(X)
            base_score_metric = self._base_classification_score_metric
        elif model_type in [RandomForestRegressor, SimilarityForestRegressor]:
            y_pred = model.predict(X)
            base_score_metric = self._base_regression_score_metric
        else:
            # Fail with an explicit message instead of an UnboundLocalError on the return value.
            raise ValueError("Unsupported model type '{}'".format(model_type.__name__))
        return base_score_metric(y_true, y_pred)

    def _build_raw_results(self, model, model_weights, weights):
        """Score the model on the train, dev and test subsets and wrap everything in a ModelRawResults."""
        return ModelRawResults(
            model_weights=model_weights,
            training_time=self._end_time - self._begin_time,
            datetime=datetime.datetime.now(),
            train_score=self.__score_func(model, self._dataset.X_train, self._dataset.y_train, weights),
            dev_score=self.__score_func(model, self._dataset.X_dev, self._dataset.y_dev, weights),
            test_score=self.__score_func(model, self._dataset.X_test, self._dataset.y_test, weights),
            train_score_base=self.__score_func_base(model, self._dataset.X_train, self._dataset.y_train),
            dev_score_base=self.__score_func_base(model, self._dataset.X_dev, self._dataset.y_dev),
            test_score_base=self.__score_func_base(model, self._dataset.X_test, self._dataset.y_test),
            score_metric=self._score_metric_name,
            base_score_metric=self._base_score_metric_name
        )

    def _log_raw_results(self, results, suffix=''):
        """Log the base score and the score of each subset; `suffix` qualifies the non-base score."""
        scores_per_subset = (
            ('test', results.test_score_base, results.test_score),
            ('train', results.train_score_base, results.train_score),
            ('dev', results.dev_score_base, results.dev_score),
        )
        for subset_name, base_score, score in scores_per_subset:
            self._logger.info("Base performance on {}: {}".format(subset_name, base_score))
            self._logger.info("Performance on {}{}: {}".format(subset_name, suffix, score))

    def compute_results(self, model, models_dir):
        """
        Evaluate a trained model on the train, dev and test subsets, then save and log the scores.

        For every model that is not exactly a RandomForestRegressor or RandomForestClassifier, a second
        set of results scored through `predict_no_weights` is saved under `models_dir + '_no_weights'`.
        `train` must have been called beforehand, since the training time is read from it.

        :param model: The fitted model to evaluate.
        :param models_dir: Where the results will be saved
        """
        model_weights = ''
        if type(model) in [OmpForestRegressor, OmpForestBinaryClassifier]:
            model_weights = model._omp.coef_
        elif type(model) == OmpForestMulticlassClassifier:
            model_weights = model._dct_class_omp

        results = self._build_raw_results(model, model_weights, True)
        results.save(models_dir)
        self._log_raw_results(results)

        if type(model) not in [RandomForestRegressor, RandomForestClassifier]:
            # NOTE(review): a SimilarityForestRegressor also reaches this branch, which presumes it
            # implements `predict_no_weights` - confirm against that class.
            results = self._build_raw_results(model, '', False)
            results.save(models_dir+'_no_weights')
            # The base scores are the same as above; only the non-base scores ignore the weights.
            self._log_raw_results(results, ' without weights')
from bolsonaro.models.model_raw_results import ModelRawResults
from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier
from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
from bolsonaro.error_handling.logger_factory import LoggerFactory
from bolsonaro.data.task import Task
from . import LOG_PATH
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score
import time
import datetime
import numpy as np
class Trainer(object):
    """
    Class capable of fitting any supported model object to some prepared data (`init` then `train`),
    then evaluating it and saving the results through the `compute_results` method.
    """

    def __init__(self, dataset, regression_score_metric=mean_squared_error, classification_score_metric=accuracy_score,
                 base_regression_score_metric=mean_squared_error, base_classification_score_metric=accuracy_score):
        """
        :param dataset: Object with X_train, y_train, X_dev, y_dev, X_test and y_test attributes, and a
            `task` attribute that is compared with `Task.REGRESSION` to pick the reported metric names.
        :param regression_score_metric: Callable `(y_true, y_pred)` scoring the predictions of regressors.
        :param classification_score_metric: Callable `(y_true, y_pred)` scoring the predictions of classifiers.
        :param base_regression_score_metric: Same as `regression_score_metric`, for the base scores.
        :param base_classification_score_metric: Same as `classification_score_metric`, for the base scores.
        """
        self._dataset = dataset
        self._logger = LoggerFactory.create(LOG_PATH, __name__)
        self._regression_score_metric = regression_score_metric
        self._classification_score_metric = classification_score_metric
        self._base_regression_score_metric = base_regression_score_metric
        self._base_classification_score_metric = base_classification_score_metric

        # Only the *names* stored in the results depend on the dataset task; the metric that is
        # actually applied is selected from the model type in the scoring methods.
        is_regression = dataset.task == Task.REGRESSION
        self._score_metric_name = regression_score_metric.__name__ if is_regression \
            else classification_score_metric.__name__
        self._base_score_metric_name = base_regression_score_metric.__name__ if is_regression \
            else base_classification_score_metric.__name__

    @property
    def score_metric_name(self):
        """Name of the metric reported as `score_metric` in the saved results."""
        return self._score_metric_name

    @property
    def base_score_metric_name(self):
        """Name of the metric reported as `base_score_metric` in the saved results."""
        return self._base_score_metric_name

    def _concatenate_train_dev(self):
        """Return the `(X, y)` pair made of the train subset followed by the dev subset."""
        X = np.concatenate([self._dataset.X_train, self._dataset.X_dev])
        y = np.concatenate([self._dataset.y_train, self._dataset.y_dev])
        return X, y

    def init(self, model, subsets_used='train,dev'):
        """
        Select the data subsets that `train` will fit the model on.

        :param model: The model about to be trained. Anything that is not exactly a RandomForestRegressor
            or RandomForestClassifier must expose `models_parameters.subsets_used`.
        :param subsets_used: Only read for plain random forests: 'train,dev' fits the forest on the train
            subset, any other value fits it on train+dev. For the other models the value comes from
            `model.models_parameters.subsets_used` and must be one of 'train,dev', 'train+dev,train+dev'
            or 'train,train+dev' (forest subsets, then OMP subsets).
        :raises ValueError: If `model.models_parameters.subsets_used` is not one of the supported values.
        """
        if type(model) in [RandomForestRegressor, RandomForestClassifier]:
            # A plain forest has no OMP step, so only the forest subsets are set.
            if subsets_used == 'train,dev':
                self._X_forest = self._dataset.X_train
                self._y_forest = self._dataset.y_train
                self._logger.debug('Fitting the forest on train subset')
            else:
                self._X_forest, self._y_forest = self._concatenate_train_dev()
                self._logger.debug('Fitting the forest on train+dev subsets')
            return

        model_subsets_used = model.models_parameters.subsets_used
        if model_subsets_used == 'train,dev':
            self._X_forest = self._dataset.X_train
            self._y_forest = self._dataset.y_train
            self._X_omp = self._dataset.X_dev
            self._y_omp = self._dataset.y_dev
            self._logger.debug('Fitting the forest on train subset and OMP on dev subset.')
        elif model_subsets_used == 'train+dev,train+dev':
            self._X_forest, self._y_forest = self._concatenate_train_dev()
            # Both steps share the very same arrays.
            self._X_omp = self._X_forest
            self._y_omp = self._y_forest
            self._logger.debug('Fitting both the forest and OMP on train+dev subsets.')
        elif model_subsets_used == 'train,train+dev':
            self._X_forest = self._dataset.X_train
            self._y_forest = self._dataset.y_train
            self._X_omp, self._y_omp = self._concatenate_train_dev()
            self._logger.debug('Fitting the forest on train subset and OMP on train+dev subsets.')
        else:
            raise ValueError("Unknown specified subsets_used parameter '{}'".format(model_subsets_used))

    def train(self, model):
        """
        Fit the model on the subsets selected by `init` and record the wall-clock training time.

        `init` must have been called beforehand, since it sets the subsets read here.

        :param model: An instance of either RandomForestRegressor, RandomForestClassifier, OmpForestRegressor,
            OmpForestBinaryClassifier, OmpForestMulticlassClassifier.
        :return: None
        """
        self._logger.debug('Training model using train set...')
        self._begin_time = time.time()
        if type(model) in [RandomForestRegressor, RandomForestClassifier]:
            model.fit(X=self._X_forest, y=self._y_forest)
        else:
            # Every other model is fitted with the forest subsets followed by the OMP subsets.
            model.fit(self._X_forest, self._y_forest, self._X_omp, self._y_omp)
        self._end_time = time.time()

    def __score_func(self, model, X, y_true, weights=True):
        """
        Score the predictions of the model on `(X, y_true)`.

        :param model: A fitted model; the metric is picked from its exact type (subclasses are not matched).
        :param X: Samples to predict.
        :param y_true: Expected targets for `X`.
        :param weights: When True use `model.predict`, otherwise `model.predict_no_weights`.
        :return: The value returned by the regression or classification score metric.
        :raises ValueError: If the model type is not supported.
        """
        model_type = type(model)
        if model_type in [OmpForestRegressor, RandomForestRegressor, SimilarityForestRegressor]:
            score_metric = self._regression_score_metric
        elif model_type in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier, RandomForestClassifier]:
            score_metric = self._classification_score_metric
        else:
            # Fail with an explicit message instead of an UnboundLocalError on the return value.
            raise ValueError("Unsupported model type '{}'".format(model_type.__name__))

        y_pred = model.predict(X) if weights else model.predict_no_weights(X)
        if model_type is OmpForestBinaryClassifier:
            # Turn the raw output into -1/+1 labels; an output of exactly 0 is mapped to +1.
            y_pred = np.sign(y_pred)
            y_pred = np.where(y_pred == 0, 1, y_pred)
        return score_metric(y_true, y_pred)

    def __score_func_base(self, model, X, y_true):
        """
        Score the base predictions of the model on `(X, y_true)`.

        OMP forests are scored through `predict_base_estimator`, the other supported models through
        `predict`.

        :param model: A fitted model; dispatch is on its exact type (subclasses are not matched).
        :param X: Samples to predict.
        :param y_true: Expected targets for `X`.
        :return: The value returned by the base regression or base classification score metric.
        :raises ValueError: If the model type is not supported.
        """
        model_type = type(model)
        if model_type is OmpForestRegressor:
            y_pred = model.predict_base_estimator(X)
            base_score_metric = self._base_regression_score_metric
        elif model_type in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier]:
            y_pred = model.predict_base_estimator(X)
            base_score_metric = self._base_classification_score_metric
        elif model_type is RandomForestClassifier:
            y_pred = model.predict(X)
            base_score_metric = self._base_classification_score_metric
        elif model_type in [RandomForestRegressor, SimilarityForestRegressor]:
            y_pred = model.predict(X)
            base_score_metric = self._base_regression_score_metric
        else:
            # Fail with an explicit message instead of an UnboundLocalError on the return value.
            raise ValueError("Unsupported model type '{}'".format(model_type.__name__))
        return base_score_metric(y_true, y_pred)

    def _build_raw_results(self, model, model_weights, weights):
        """Score the model on the train, dev and test subsets and wrap everything in a ModelRawResults."""
        return ModelRawResults(
            model_weights=model_weights,
            training_time=self._end_time - self._begin_time,
            datetime=datetime.datetime.now(),
            train_score=self.__score_func(model, self._dataset.X_train, self._dataset.y_train, weights),
            dev_score=self.__score_func(model, self._dataset.X_dev, self._dataset.y_dev, weights),
            test_score=self.__score_func(model, self._dataset.X_test, self._dataset.y_test, weights),
            train_score_base=self.__score_func_base(model, self._dataset.X_train, self._dataset.y_train),
            dev_score_base=self.__score_func_base(model, self._dataset.X_dev, self._dataset.y_dev),
            test_score_base=self.__score_func_base(model, self._dataset.X_test, self._dataset.y_test),
            score_metric=self._score_metric_name,
            base_score_metric=self._base_score_metric_name
        )

    def _log_raw_results(self, results, suffix=''):
        """Log the base score and the score of each subset; `suffix` qualifies the non-base score."""
        scores_per_subset = (
            ('test', results.test_score_base, results.test_score),
            ('train', results.train_score_base, results.train_score),
            ('dev', results.dev_score_base, results.dev_score),
        )
        for subset_name, base_score, score in scores_per_subset:
            self._logger.info("Base performance on {}: {}".format(subset_name, base_score))
            self._logger.info("Performance on {}{}: {}".format(subset_name, suffix, score))

    def compute_results(self, model, models_dir):
        """
        Evaluate a trained model on the train, dev and test subsets, then save and log the scores.

        For every model that is not exactly a RandomForestRegressor or RandomForestClassifier, a second
        set of results scored through `predict_no_weights` is saved under `models_dir + '_no_weights'`.
        `train` must have been called beforehand, since the training time is read from it.

        :param model: The fitted model to evaluate.
        :param models_dir: Where the results will be saved
        """
        model_weights = ''
        if type(model) in [OmpForestRegressor, OmpForestBinaryClassifier]:
            model_weights = model._omp.coef_
        elif type(model) == OmpForestMulticlassClassifier:
            model_weights = model._dct_class_omp

        results = self._build_raw_results(model, model_weights, True)
        results.save(models_dir)
        self._log_raw_results(results)

        if type(model) not in [RandomForestRegressor, RandomForestClassifier]:
            # NOTE(review): a SimilarityForestRegressor also reaches this branch, which presumes it
            # implements `predict_no_weights` - confirm against that class.
            results = self._build_raw_results(model, '', False)
            results.save(models_dir+'_no_weights')
            # The base scores are the same as above; only the non-base scores ignore the weights.
            self._log_raw_results(results, ' without weights')
Loading