Skip to content
Snippets Groups Projects
Commit 5e3368e9 authored by Luc Giffon's avatar Luc Giffon
Browse files

Merge branch '15-integration-sota' of gitlab.lis-lab.fr:luc.giffon/bolsonaro...

Merge branch '15-integration-sota' of gitlab.lis-lab.fr:luc.giffon/bolsonaro into 15-integration-sota
parents f0f48756 0ea777e7
No related branches found
No related tags found
1 merge request: !23 Resolve "integration-sota"
......@@ -10,7 +10,10 @@ class ModelRawResults(object):
datetime, train_score, dev_score, test_score,
train_score_base, dev_score_base,
test_score_base, score_metric, base_score_metric,
coherence='', correlation=''):
#coherence='', correlation=''):
train_coherence='', dev_coherence='', test_coherence='',
train_correlation='', dev_correlation='', test_correlation='',
train_strength='', dev_strength='', test_strength=''):
self._model_weights = model_weights
self._training_time = training_time
......@@ -23,8 +26,17 @@ class ModelRawResults(object):
self._test_score_base = test_score_base
self._score_metric = score_metric
self._base_score_metric = base_score_metric
self._coherence = coherence
self._correlation = correlation
"""self._coherence = coherence
self._correlation = correlation"""
self._train_coherence = train_coherence
self._dev_coherence = dev_coherence
self._test_coherence = test_coherence
self._train_correlation = train_correlation
self._dev_correlation = dev_correlation
self._test_correlation = test_correlation
self._train_strength = train_strength
self._dev_strength = dev_strength
self._test_strength = test_strength
@property
def model_weights(self):
......@@ -70,13 +82,49 @@ class ModelRawResults(object):
def base_score_metric(self):
    """Name of the metric used to score the base (unpruned) forest, as set at construction."""
    return self._base_score_metric
@property
"""@property
def coherence(self):
return self._coherence
@property
def correlation(self):
return self._correlation
return self._correlation"""
@property
def train_coherence(self):
    # Coherence (np.max-aggregated pairwise prediction similarity) on the forest/train subset.
    return self._train_coherence
@property
def dev_coherence(self):
    # Coherence (np.max-aggregated pairwise prediction similarity) on the omp/dev subset.
    return self._dev_coherence
@property
def test_coherence(self):
    # Coherence (np.max-aggregated pairwise prediction similarity) on the test set.
    return self._test_coherence
@property
def train_correlation(self):
    # Correlation (np.mean-aggregated pairwise prediction similarity) on the forest/train subset.
    return self._train_correlation
@property
def dev_correlation(self):
    # Correlation (np.mean-aggregated pairwise prediction similarity) on the omp/dev subset.
    return self._dev_correlation
@property
def test_correlation(self):
    # Correlation (np.mean-aggregated pairwise prediction similarity) on the test set.
    return self._test_correlation
@property
def train_strength(self):
    # Mean individual-tree score on the forest/train subset (see Trainer._compute_forest_strength).
    return self._train_strength
@property
def dev_strength(self):
    # Mean individual-tree score on the omp/dev subset (see Trainer._compute_forest_strength).
    return self._dev_strength
@property
def test_strength(self):
    # Mean individual-tree score on the test set (see Trainer._compute_forest_strength).
    return self._test_strength
def save(self, models_dir):
if not os.path.exists(models_dir):
......
......@@ -39,7 +39,6 @@ class Trainer(object):
else classification_score_metric.__name__
self._base_score_metric_name = base_regression_score_metric.__name__ if dataset.task == Task.REGRESSION \
else base_classification_score_metric.__name__
self._selected_trees = ''
@property
def score_metric_name(self):
......@@ -98,7 +97,6 @@ class Trainer(object):
X=self._X_forest,
y=self._y_forest
)
self._selected_trees = model.estimators_
else:
if type(model) in [OmpForestRegressor, OmpForestBinaryClassifier, OmpForestMulticlassClassifier] and \
use_distillation:
......@@ -154,14 +152,17 @@ class Trainer(object):
result = self._base_regression_score_metric(y_true, y_pred)
return result
def _evaluate_predictions(self, model, X, aggregation_function):
predictions = np.array([tree.predict(X) for tree in self._selected_trees])
def _evaluate_predictions(self, X, aggregation_function, selected_trees):
    """Aggregate the pairwise similarity of the selected trees' predictions on X.

    Each tree's prediction vector becomes one row of `predictions`; rows are
    normalized (presumably sklearn.preprocessing.normalize, row-wise L2 — TODO
    confirm which `normalize` is imported), then the absolute off-diagonal
    entries of the Gram matrix |P @ P.T - I| are reduced with
    aggregation_function. Callers use np.max for "coherence" and np.mean for
    "correlation".

    :param X: samples to predict on
    :param aggregation_function: reducer applied to the similarity matrix (e.g. np.max, np.mean)
    :param selected_trees: iterable of fitted estimators exposing .predict(X)
    :return: scalar similarity statistic
    """
    predictions = np.array([tree.predict(X) for tree in selected_trees])
    predictions = normalize(predictions)
    return aggregation_function(np.abs((predictions @ predictions.T - np.eye(len(predictions)))))
def compute_results(self, model, models_dir):
def _compute_forest_strength(self, X, y, metric_function, selected_trees):
    """Forest "strength": metric_function(y, tree.predict(X)) averaged over the selected trees.

    :param X: samples to predict on
    :param y: ground-truth targets passed to metric_function
    :param metric_function: score function with signature (y_true, y_pred)
    :param selected_trees: iterable of fitted estimators exposing .predict(X)
    :return: mean per-tree score (np.mean over the selected trees)
    """
    return np.mean([metric_function(y, tree.predict(X)) for tree in selected_trees])
def compute_results(self, model, models_dir, subsets_used='train+dev,train+dev'):
"""
:param model: Object with
:param models_dir: Where the results will be saved
......@@ -177,30 +178,70 @@ class Trainer(object):
if type(model) in [SimilarityForestRegressor, KMeansForestRegressor, EnsembleSelectionForestRegressor,
SimilarityForestClassifier, KMeansForestClassifier, EnsembleSelectionForestClassifier]:
self._selected_trees = model.selected_trees
selected_trees = model.selected_trees
elif type(model) in [OmpForestRegressor, OmpForestMulticlassClassifier, OmpForestBinaryClassifier]:
self._selected_trees = np.asarray(model.forest)[model._omp.coef_ != 0]
selected_trees = np.asarray(model.forest)[model._omp.coef_ != 0]
elif type(model) in [RandomForestRegressor, RandomForestClassifier]:
self._selected_trees = model.estimators_
selected_trees = model.estimators_
if len(self._selected_trees) > 0:
if len(selected_trees) > 0:
target_selected_tree = int(os.path.split(models_dir)[-1])
if target_selected_tree != len(selected_trees):
raise ValueError(f'Invalid selected tree number target_selected_tree:{target_selected_tree} - len(selected_trees):{len(selected_trees)}')
with open(os.path.join(models_dir, 'selected_trees.pickle'), 'wb') as output_file:
pickle.dump(self._selected_trees, output_file)
pickle.dump(selected_trees, output_file)
strength_metric = self._regression_score_metric if self._dataset.task == Task.REGRESSION else self._classification_score_metric
# Reeeally dirty to put that here but otherwise it's not thread safe...
if type(model) in [RandomForestRegressor, RandomForestClassifier]:
if subsets_used == 'train,dev':
X_forest = self._dataset.X_train
y_forest = self._dataset.y_train
else:
X_forest = np.concatenate([self._dataset.X_train, self._dataset.X_dev])
y_forest = np.concatenate([self._dataset.y_train, self._dataset.y_dev])
X_omp = self._dataset.X_dev
y_omp = self._dataset.y_dev
elif model.models_parameters.subsets_used == 'train,dev':
X_forest = self._dataset.X_train
y_forest = self._dataset.y_train
X_omp = self._dataset.X_dev
y_omp = self._dataset.y_dev
elif model.models_parameters.subsets_used == 'train+dev,train+dev':
X_forest = np.concatenate([self._dataset.X_train, self._dataset.X_dev])
X_omp = X_forest
y_forest = np.concatenate([self._dataset.y_train, self._dataset.y_dev])
y_omp = y_forest
elif model.models_parameters.subsets_used == 'train,train+dev':
X_forest = self._dataset.X_train
y_forest = self._dataset.y_train
X_omp = np.concatenate([self._dataset.X_train, self._dataset.X_dev])
y_omp = np.concatenate([self._dataset.y_train, self._dataset.y_dev])
else:
raise ValueError("Unknown specified subsets_used parameter '{}'".format(model.models_parameters.subsets_used))
results = ModelRawResults(
model_weights=model_weights,
training_time=self._end_time - self._begin_time,
datetime=datetime.datetime.now(),
train_score=self.__score_func(model, self._dataset.X_train, self._dataset.y_train),
dev_score=self.__score_func(model, self._dataset.X_dev, self._dataset.y_dev),
train_score=self.__score_func(model, X_forest, y_forest),
dev_score=self.__score_func(model, X_omp, y_omp),
test_score=self.__score_func(model, self._dataset.X_test, self._dataset.y_test),
train_score_base=self.__score_func_base(model, self._dataset.X_train, self._dataset.y_train),
dev_score_base=self.__score_func_base(model, self._dataset.X_dev, self._dataset.y_dev),
train_score_base=self.__score_func_base(model, X_forest, y_forest),
dev_score_base=self.__score_func_base(model, X_omp, y_omp),
test_score_base=self.__score_func_base(model, self._dataset.X_test, self._dataset.y_test),
score_metric=self._score_metric_name,
base_score_metric=self._base_score_metric_name,
coherence=self._evaluate_predictions(model, self._dataset.X_train, aggregation_function=np.max),
correlation=self._evaluate_predictions(model, self._dataset.X_train, aggregation_function=np.mean)
train_coherence=self._evaluate_predictions(X_forest, aggregation_function=np.max, selected_trees=selected_trees),
dev_coherence=self._evaluate_predictions(X_omp, aggregation_function=np.max, selected_trees=selected_trees),
test_coherence=self._evaluate_predictions(self._dataset.X_test, aggregation_function=np.max, selected_trees=selected_trees),
train_correlation=self._evaluate_predictions(X_forest, aggregation_function=np.mean, selected_trees=selected_trees),
dev_correlation=self._evaluate_predictions(X_omp, aggregation_function=np.mean, selected_trees=selected_trees),
test_correlation=self._evaluate_predictions(self._dataset.X_test, aggregation_function=np.mean, selected_trees=selected_trees),
train_strength=self._compute_forest_strength(X_forest, y_forest, strength_metric, selected_trees),
dev_strength=self._compute_forest_strength(X_omp, y_omp, strength_metric, selected_trees),
test_strength=self._compute_forest_strength(self._dataset.X_test, self._dataset.y_test, strength_metric, selected_trees)
)
results.save(models_dir)
self._logger.info("Base performance on test: {}".format(results.test_score_base))
......@@ -212,16 +253,20 @@ class Trainer(object):
self._logger.info("Base performance on dev: {}".format(results.dev_score_base))
self._logger.info("Performance on dev: {}".format(results.dev_score))
self._logger.info(f'test_coherence: {results.test_coherence}')
self._logger.info(f'test_correlation: {results.test_correlation}')
self._logger.info(f'test_strength: {results.test_strength}')
if type(model) not in [RandomForestRegressor, RandomForestClassifier]:
results = ModelRawResults(
model_weights='',
training_time=self._end_time - self._begin_time,
datetime=datetime.datetime.now(),
train_score=self.__score_func(model, self._dataset.X_train, self._dataset.y_train, False),
dev_score=self.__score_func(model, self._dataset.X_dev, self._dataset.y_dev, False),
train_score=self.__score_func(model, X_forest, y_forest, False),
dev_score=self.__score_func(model, X_omp, y_omp, False),
test_score=self.__score_func(model, self._dataset.X_test, self._dataset.y_test, False),
train_score_base=self.__score_func_base(model, self._dataset.X_train, self._dataset.y_train),
dev_score_base=self.__score_func_base(model, self._dataset.X_dev, self._dataset.y_dev),
train_score_base=self.__score_func_base(model, X_forest, y_forest),
dev_score_base=self.__score_func_base(model, X_omp, y_omp),
test_score_base=self.__score_func_base(model, self._dataset.X_test, self._dataset.y_test),
score_metric=self._score_metric_name,
base_score_metric=self._base_score_metric_name
......
This diff is collapsed.
......@@ -8,8 +8,9 @@ from tqdm import tqdm
if __name__ == "__main__":
models_source_path = 'models'
models_destination_path = 'bolsonaro_models_25-03-20'
datasets = ['boston', 'diabetes', 'linnerud', 'breast_cancer', 'california_housing', 'diamonds',
'steel-plates', 'kr-vs-kp', 'kin8nm', 'spambase', 'gamma', 'lfw_pairs']
#datasets = ['boston', 'diabetes', 'linnerud', 'breast_cancer', 'california_housing', 'diamonds',
# 'steel-plates', 'kr-vs-kp', 'kin8nm', 'spambase', 'gamma', 'lfw_pairs']
datasets = ['kin8nm']
pathlib.Path(models_destination_path).mkdir(parents=True, exist_ok=True)
......
......@@ -66,11 +66,11 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb
extraction_strategy=parameters['extraction_strategy']
)
pretrained_estimator = ModelFactory.build(dataset.task, pretrained_model_parameters)
pretraned_trainer = Trainer(dataset)
pretraned_trainer.init(pretrained_estimator, subsets_used=parameters['subsets_used'])
pretrained_trainer = Trainer(dataset)
pretrained_trainer.init(pretrained_estimator, subsets_used=parameters['subsets_used'])
pretrained_estimator.fit(
X=pretraned_trainer._X_forest,
y=pretraned_trainer._y_forest
X=pretrained_trainer._X_forest,
y=pretrained_trainer._y_forest
)
else:
pretrained_estimator = None
......
0% Loading or error — please retry.
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment