Compare revisions: luc.giffon/bolsonaro (changes shown as if the source revision were merged into the target; 10 commits on source)
@@ -6,12 +6,12 @@ import datetime
class ModelRawResults(object):
-def __init__(self, model_object, training_time,
+def __init__(self, model_weights, training_time,
datetime, train_score, dev_score, test_score,
train_score_base, dev_score_base,
test_score_base, score_metric, base_score_metric):
-self._model_object = model_object
+self._model_weights = model_weights
self._training_time = training_time
self._datetime = datetime
self._train_score = train_score
@@ -24,8 +24,8 @@ class ModelRawResults(object):
self._base_score_metric = base_score_metric
@property
-def model_object(self):
-return self.model_object
+def model_weights(self):
+return self._model_weights
@property
def training_time(self):
@@ -68,10 +68,12 @@ class ModelRawResults(object):
return self._base_score_metric
def save(self, models_dir):
if not os.path.exists(models_dir):
os.mkdir(models_dir)
save_obj_to_pickle(models_dir + os.sep + 'model_raw_results.pickle',
self.__dict__)
@staticmethod
def load(models_dir):
def load(models_dir):
return load_obj_from_pickle(models_dir + os.sep + 'model_raw_results.pickle',
ModelRawResults)
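For context, save() and load() above round-trip the whole results object through a model_raw_results.pickle file inside models_dir. A minimal usage sketch, assuming ModelRawResults and the project's pickle helpers are importable, with made-up values:

import datetime
import numpy as np

results = ModelRawResults(
    model_weights=np.array([0.0, 0.7, 0.3]),   # e.g. OMP coefficients
    training_time=12.3,
    datetime=datetime.datetime.now(),
    train_score=0.95, dev_score=0.91, test_score=0.90,
    train_score_base=0.93, dev_score_base=0.90, test_score_base=0.89,
    score_metric='mean_squared_error', base_score_metric='mean_squared_error')
results.save('demo_results')                     # writes demo_results/model_raw_results.pickle
reloaded = ModelRawResults.load('demo_results')  # reads it back into a ModelRawResults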
@@ -8,6 +8,7 @@ from sklearn.base import BaseEstimator
class OmpForest(BaseEstimator, metaclass=ABCMeta):
def __init__(self, models_parameters, base_forest_estimator):
self._base_forest_estimator = base_forest_estimator
self._models_parameters = models_parameters
@@ -24,7 +25,6 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta):
return self._base_forest_estimator.score(X, y)
def _base_estimator_predictions(self, X):
# We need to use predict_proba to get the probabilities of each class
return np.array([tree.predict(X) for tree in self._base_forest_estimator.estimators_]).T
@property
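The matrix built by _base_estimator_predictions is what OMP later uses as its dictionary: one column per tree, one row per sample. A small shape-check sketch, illustrative only (the forest and data are made up):

import numpy as np
from sklearn.ensemble import RandomForestRegressor

X = np.random.rand(100, 5)
y = np.random.rand(100)
forest = RandomForestRegressor(n_estimators=30).fit(X, y)

# Same construction as _base_estimator_predictions: stack the per-tree
# predictions, then transpose so each tree becomes one column (one atom).
D = np.array([tree.predict(X) for tree in forest.estimators_]).T
print(D.shape)  # (100, 30): n_samples x n_trees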
@@ -96,6 +96,7 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta):
pass
class SingleOmpForest(OmpForest):
def __init__(self, models_parameters, base_forest_estimator):
# fit_intercept shouldn't be set to False as the data isn't necessarily centered here
# normalization is handled outside OMP
@@ -123,3 +124,24 @@ class SingleOmpForest(OmpForest):
forest_predictions /= self._forest_norms
return self._make_omp_weighted_prediction(forest_predictions, self._omp, self._models_parameters.normalize_weights)
def predict_no_weights(self, X):
"""
Apply the SingleOmpForest to X without using the weights.
Make all the base tree predictions
:param X: a Forest
:return: a np.array of the predictions of the entire forest
"""
forest_predictions = self._base_estimator_predictions(X).T
if self._models_parameters.normalize_D:
forest_predictions /= self._forest_norms
weights = self._omp.coef_
omp_trees_indices = np.nonzero(weights)
select_trees = np.mean(forest_predictions[omp_trees_indices], axis=0)
return select_trees
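To make the contrast with the weighted path explicit: predict() combines the tree outputs with the OMP coefficients, while predict_no_weights() simply averages the trees whose coefficient is non-zero. A schematic comparison with made-up numbers, ignoring the intercept and the normalize_weights option:

import numpy as np

D = np.array([[0.2, 0.8, 0.5],      # per-tree predictions, shape (n_samples, n_trees)
              [0.1, 0.9, 0.4]])
w = np.array([0.0, 0.7, 0.3])       # OMP coefficients, mostly zero

weighted = D @ w                          # roughly what predict() computes
support = np.nonzero(w)[0]                # trees kept by OMP: [1, 2]
unweighted = D[:, support].mean(axis=1)   # what predict_no_weights() computes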
@@ -106,6 +106,36 @@ class OmpForestMulticlassClassifier(OmpForest):
max_preds = np.argmax(preds, axis=1)
return np.array(label_names)[max_preds]
def predict_no_weights(self, X):
"""
Apply the SingleOmpForest to X without using the weights.
Make all the base tree predictions
:param X: a Forest
:return: a np.array of the predictions of the entire forest
"""
forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_]).T
if self._models_parameters.normalize_D:
forest_predictions /= self._forest_norms
label_names = []
preds = []
num_class = 0
for class_label, omp_class in self._dct_class_omp.items():
weights = omp_class.coef_
omp_trees_indices = np.nonzero(weights)
label_names.append(class_label)
atoms_binary = (forest_predictions[num_class].T - 0.5) * 2 # rescale the 0/1 probabilities to -1/+1 votes
preds.append(np.sum(atoms_binary[omp_trees_indices], axis=0))
num_class += 1
preds = np.array(preds).T
max_preds = np.argmax(preds, axis=1)
return np.array(label_names)[max_preds]
def score(self, X, y, metric=DEFAULT_SCORE_METRIC):
predictions = self.predict(X)
......
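The rescaling in predict_no_weights above turns each selected tree's class probability into a signed vote, so a tree that is confident the sample does not belong to the class contributes negatively. A tiny worked example with made-up numbers, for one class and two selected trees:

import numpy as np

proba = np.array([0.9, 0.2])   # P(class | sample) from two selected trees
votes = (proba - 0.5) * 2      # [ 0.8, -0.6 ]
class_score = votes.sum()      # ~0.2: mild evidence for the class
# predict_no_weights() builds one such score per class and returns the argmax.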
@@ -95,12 +95,18 @@ class Trainer(object):
)
self._end_time = time.time()
-def __score_func(self, model, X, y_true):
+def __score_func(self, model, X, y_true, weights=True):
if type(model) in [OmpForestRegressor, RandomForestRegressor, SimilarityForestRegressor]:
-y_pred = model.predict(X)
+if weights:
+y_pred = model.predict(X)
+else:
+y_pred = model.predict_no_weights(X)
result = self._regression_score_metric(y_true, y_pred)
elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier, RandomForestClassifier]:
-y_pred = model.predict(X)
+if weights:
+y_pred = model.predict(X)
+else:
+y_pred = model.predict_no_weights(X)
if type(model) is OmpForestBinaryClassifier:
y_pred = y_pred.round()
result = self._classification_score_metric(y_true, y_pred)
@@ -126,8 +132,17 @@ class Trainer(object):
:param model: Object with
:param models_dir: Where the results will be saved
"""
model_weights = ''
if type(model) in [OmpForestRegressor, OmpForestBinaryClassifier]:
model_weights = model._omp.coef_
elif type(model) == OmpForestMulticlassClassifier:
model_weights = model._dct_class_omp
elif type(model) == OmpForestBinaryClassifier:
model_weights = model._omp
results = ModelRawResults(
-model_object='',
+model_weights=model_weights,
training_time=self._end_time - self._begin_time,
datetime=datetime.datetime.now(),
train_score=self.__score_func(model, self._dataset.X_train, self._dataset.y_train),
......@@ -148,3 +163,29 @@ class Trainer(object):
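Depending on the model type, model_weights holds different things: an array of OMP coefficients for the regressor and binary classifier, or the per-class dictionary of fitted OMP objects for the multiclass classifier. A hedged sketch of how the saved weights could be inspected after loading, assuming ModelRawResults is importable, the results come from an OMP-based model, and the path is illustrative:

import numpy as np

raw = ModelRawResults.load('models/3/seeds/1/extracted_forest_sizes/10')
w = raw.model_weights
if isinstance(w, dict):
    # OmpForestMulticlassClassifier: one fitted OMP object per class label
    for label, omp in w.items():
        print(label, np.count_nonzero(omp.coef_), 'trees selected')
else:
    # OmpForestRegressor / OmpForestBinaryClassifier: array of OMP coefficients
    print(np.count_nonzero(w), 'trees selected')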
self._logger.info("Base performance on dev: {}".format(results.dev_score_base))
self._logger.info("Performance on dev: {}".format(results.dev_score))
if type(model) not in [RandomForestRegressor, RandomForestClassifier]:
results = ModelRawResults(
model_weights=model_weights,
training_time=self._end_time - self._begin_time,
datetime=datetime.datetime.now(),
train_score=self.__score_func(model, self._dataset.X_train, self._dataset.y_train, False),
dev_score=self.__score_func(model, self._dataset.X_dev, self._dataset.y_dev, False),
test_score=self.__score_func(model, self._dataset.X_test, self._dataset.y_test, False),
train_score_base=self.__score_func_base(model, self._dataset.X_train, self._dataset.y_train),
dev_score_base=self.__score_func_base(model, self._dataset.X_dev, self._dataset.y_dev),
test_score_base=self.__score_func_base(model, self._dataset.X_test, self._dataset.y_test),
score_metric=self._score_metric_name,
base_score_metric=self._base_score_metric_name
)
results.save(models_dir+'_no_weights')
self._logger.info("Base performance on test without weights: {}".format(results.test_score_base))
self._logger.info("Performance on test: {}".format(results.test_score))
self._logger.info("Base performance on train without weights: {}".format(results.train_score_base))
self._logger.info("Performance on train: {}".format(results.train_score))
self._logger.info("Base performance on dev without weights: {}".format(results.dev_score_base))
self._logger.info("Performance on dev: {}".format(results.dev_score))
@@ -109,16 +109,16 @@ class Plotter(object):
fig, ax = plt.subplots()
-n = len(all_experiment_scores)
+nb_experiments = len(all_experiment_scores)
"""
Get as many different colors from the specified cmap (here nipy_spectral)
as there are curves to plot.
"""
-colors = Plotter.get_colors_from_cmap(n)
+colors = Plotter.get_colors_from_cmap(nb_experiments)
-# For each curve to plot
-for i in range(n):
+# For each curve to plot
+for i in range(nb_experiments):
# Retrieve the scores in a list for each seed
experiment_scores = list(all_experiment_scores[i].values())
# Compute the mean and the std for the CI
......
@@ -17,7 +17,7 @@ def retreive_extracted_forest_sizes_number(models_dir, experiment_id):
extracted_forest_sizes_root_path = experiment_seed_path + os.sep + 'extracted_forest_sizes'
return len(os.listdir(extracted_forest_sizes_root_path))
-def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_dir, experiment_id):
+def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_dir, experiment_id, weights=True):
experiment_id_path = models_dir + os.sep + str(experiment_id) # models/{experiment_id}
experiment_seed_root_path = experiment_id_path + os.sep + 'seeds' # models/{experiment_id}/seeds
@@ -28,6 +28,7 @@ extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_d
experiment_train_scores = dict()
experiment_dev_scores = dict()
experiment_test_scores = dict()
experiment_weights = dict()
all_extracted_forest_sizes = list()
# Used to check if all losses were computed using the same metric (it should be the case)
@@ -44,14 +45,19 @@ extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_d
experiment_train_scores[seed] = list()
experiment_dev_scores[seed] = list()
experiment_test_scores[seed] = list()
experiment_weights[seed] = list()
# List the forest sizes in models/{experiment_id}/seeds/{seed}/extracted_forest_sizes
extracted_forest_sizes = os.listdir(extracted_forest_sizes_root_path)
extracted_forest_sizes = [nb_tree for nb_tree in extracted_forest_sizes if 'no_weights' not in nb_tree]
extracted_forest_sizes.sort(key=int)
all_extracted_forest_sizes.append(list(map(int, extracted_forest_sizes)))
for extracted_forest_size in extracted_forest_sizes:
# models/{experiment_id}/seeds/{seed}/extracted_forest_sizes/{extracted_forest_size}
-extracted_forest_size_path = extracted_forest_sizes_root_path + os.sep + extracted_forest_size
+if weights:
+extracted_forest_size_path = extracted_forest_sizes_root_path + os.sep + extracted_forest_size
+else:
+extracted_forest_size_path = extracted_forest_sizes_root_path + os.sep + extracted_forest_size + '_no_weights'
# Load models/{experiment_id}/seeds/{seed}/extracted_forest_sizes/{extracted_forest_size}/model_raw_results.pickle file
model_raw_results = ModelRawResults.load(extracted_forest_size_path)
# Save the scores
@@ -60,6 +66,8 @@ extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_d
experiment_test_scores[seed].append(model_raw_results.test_score)
# Save the metric
experiment_score_metrics.append(model_raw_results.score_metric)
# Save the weights
#experiment_weights[seed].append(model_raw_results.model_weights)
# Sanity checks
if len(set(experiment_score_metrics)) > 1:
@@ -67,7 +75,8 @@ extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_d
if len(set([sum(extracted_forest_sizes) for extracted_forest_sizes in all_extracted_forest_sizes])) != 1:
raise ValueError("The extracted forest sizes aren't the sames across seeds.")
-return experiment_train_scores, experiment_dev_scores, experiment_test_scores, all_extracted_forest_sizes[0], experiment_score_metrics[0]
+return experiment_train_scores, experiment_dev_scores, experiment_test_scores, \
+all_extracted_forest_sizes[0], experiment_score_metrics[0]#, experiment_weights
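For reference, the directory layout this function walks, reconstructed from the path comments above (the experiment id, seed and forest size values are illustrative):

models/3/seeds/1/extracted_forest_sizes/10/model_raw_results.pickle              # read when weights=True
models/3/seeds/1/extracted_forest_sizes/10_no_weights/model_raw_results.pickle   # read when weights=False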
def extract_scores_across_seeds_and_forest_size(models_dir, results_dir, experiment_id, extracted_forest_sizes_number):
experiment_id_path = models_dir + os.sep + str(experiment_id) # models/{experiment_id}
@@ -120,6 +129,7 @@ if __name__ == "__main__":
DEFAULT_RESULTS_DIR = os.environ["project_dir"] + os.sep + 'results'
DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models'
DEFAULT_PLOT_WEIGHT_DENSITY = False
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--stage', nargs='?', type=int, required=True, help='Specify the stage number among [1, 5].')
@@ -130,6 +140,7 @@ if __name__ == "__main__":
parser.add_argument('--dataset_name', nargs='?', type=str, required=True, help='Specify the dataset name. TODO: read it from models dir directly.')
parser.add_argument('--results_dir', nargs='?', type=str, default=DEFAULT_RESULTS_DIR, help='The output directory of the results.')
parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
parser.add_argument('--plot_weight_density', action='store_true', default=DEFAULT_PLOT_WEIGHT_DENSITY, help='Plot the weight density. Only working for regressor models for now.')
args = parser.parse_args()
if args.stage not in list(range(1, 6)):
@@ -347,9 +358,17 @@ if __name__ == "__main__":
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[1])
# omp_with_params
logger.info('Loading omp_with_params experiment scores...')
"""omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \
omp_with_params_experiment_score_metric, experiment_weights = extract_scores_across_seeds_and_extracted_forest_sizes(
args.models_dir, args.results_dir, args.experiment_ids[2])"""
omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \
omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
args.models_dir, args.results_dir, args.experiment_ids[2])
# omp_with_params_without_weights
logger.info('Loading omp_with_params experiment scores without weights...')
omp_with_params_without_weights_train_scores, omp_with_params_without_weights_dev_scores, omp_with_params_without_weights_test_scores, _, \
omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
args.models_dir, args.results_dir, args.experiment_ids[2], weights=False)
"""# base_with_params
logger.info('Loading base_with_params experiment scores 2...')
@@ -369,17 +388,21 @@ if __name__ == "__main__":
raise ValueError('Score metrics of all experiments must be the same.')
experiments_score_metric = base_with_params_experiment_score_metric
-output_path = os.path.join(args.results_dir, args.dataset_name, 'stage4')
+output_path = os.path.join(args.results_dir, args.dataset_name, 'stage4_fix')
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
Plotter.plot_stage2_losses(
file_path=output_path + os.sep + 'losses.png',
-all_experiment_scores=[base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores],
-all_labels=['base', 'random', 'omp'],
+all_experiment_scores=[base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores,
+omp_with_params_without_weights_test_scores],
+all_labels=['base', 'random', 'omp', 'omp_without_weights'],
x_value=with_params_extracted_forest_sizes,
xlabel='Number of trees extracted',
ylabel=experiments_score_metric,
title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name))
# experiment_weights
#Plotter.weight_density(experiment_weights, output_path + os.sep + 'weight_density.png')
else:
raise ValueError('This stage number is not supported yet, but it will be!')
......
@@ -13,9 +13,11 @@
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
-58,
-43535,
-234234
+1,
+2,
+3,
+4,
+5
],
"subsets_used": "train,dev",
"normalize_weights": false,
......
@@ -13,9 +13,11 @@
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
-58,
-43535,
-234234
+1,
+2,
+3,
+4,
+5
],
"subsets_used": "train,dev",
"normalize_weights": false,
......
results/boston/stage4/losses.png: image changed (43.7 KiB -> 110 KiB)
results/iris/stage1/losses.png: image changed (64.7 KiB -> 66.1 KiB)
-for dataset in diamonds california_housing boston iris diabetes digits linnerud wine breast_cancer olivetti_faces 20newsgroups_vectorized lfw_pairs
+seeds='1 2 3'
+for dataset in boston iris diabetes digits linnerud wine breast_cancer olivetti_faces 20newsgroups_vectorized lfw_pairs california_housing diamonds
do
python code/compute_results.py --stage=1 --experiment_ids 1 2 3 4 5 6 --dataset_name=$dataset --models_dir=models/$dataset/stage1
python code/compute_results.py --stage=2 --experiment_ids 1 2 3 4 --dataset_name=$dataset --models_dir=models/$dataset/stage2
python code/compute_results.py --stage=3 --experiment_ids 1 2 3 --dataset_name=$dataset --models_dir=models/$dataset/stage3
python code/compute_results.py --stage=4 --experiment_ids 1 2 3 --dataset_name=$dataset --models_dir=models/$dataset/stage4
done
#!/bin/bash
core_number=5
walltime=1:00
seeds='1 2 3'
for dataset in diabetes #diamonds california_housing boston linnerud
do
oarsub -p "(gpu is null)" -l /core=$core_number,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=none --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=30 --experiment_id=1 --models_dir=models/$dataset/stage5 --subsets_used train+dev,train+dev"
oarsub -p "(gpu is null)" -l /core=$core_number,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=random --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=30 --experiment_id=2 --models_dir=models/$dataset/stage5 --subsets_used train+dev,train+dev"
oarsub -p "(gpu is null)" -l /core=$core_number,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=omp --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=30 --experiment_id=3 --models_dir=models/$dataset/stage5 --subsets_used train+dev,train+dev"
oarsub -p "(gpu is null)" -l /core=$core_number,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=similarity --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=30 --experiment_id=4 --models_dir=models/$dataset/stage5 --subsets_used train+dev,train+dev"
done