From 880ff78f34b20eb515492cc0542777f31e5f666f Mon Sep 17 00:00:00 2001
From: Charly Lamothe <charly.lamothe@univ-amu.fr>
Date: Wed, 18 Dec 2019 02:27:38 +0100
Subject: [PATCH] - Add an option to not use the best hyperparameters file; -
 Definitively use the correct forest size (either the one from the best
 hyperparameters or the one specified as a parameter); - Use a number of
 extracted forest sizes proportional to the forest size instead of a fixed
 extracted forest size; - Add an option to save the command-line experiment
 configuration under a given name instead of in the unnamed directory; - Add
 the new california housing dataset best hyperparameters, and convert all
 numeric values from string to int/float in the other best hyperparameter
 files; - Remove useless code from compute_results.py in anticipation of
 upcoming changes; - Save numbers as int or float instead of string before
 saving the best hyperparameters; - Add a job_number option for parallelisation
 in both the train.py and compute_hyperparameters.py scripts; - Clean up the
 TODO list.

---
 TODO.md                                       |   6 +-
 code/bolsonaro/data/dataset_loader.py         |   2 +-
 code/bolsonaro/utils.py                       |  15 +-
 code/compute_hyperparameters.py               |  10 +-
 code/compute_results.py                       | 148 ++----------------
 code/train.py                                 |  55 +++++--
 experiments/boston/stage1/params.json         |   6 +-
 experiments/breast_cancer/stage1/params.json  |   6 +-
 .../california_housing/stage1/params.json     |  16 ++
 experiments/diabetes/stage1/params.json       |   6 +-
 experiments/digits/stage1/params.json         |   6 +-
 experiments/iris/stage1/params.json           |   6 +-
 .../iris/stage1/with_best_params_16.json      |  36 +++++
 .../iris/stage1/wo_best_params_17.json        |  36 +++++
 experiments/linnerud/stage1/params.json       |   6 +-
 experiments/olivetti_faces/stage1/params.json |  28 ++++
 experiments/wine/stage1/params.json           |   6 +-
 17 files changed, 219 insertions(+), 175 deletions(-)
 create mode 100644 experiments/california_housing/stage1/params.json
 create mode 100644 experiments/iris/stage1/with_best_params_16.json
 create mode 100644 experiments/iris/stage1/wo_best_params_17.json
 create mode 100644 experiments/olivetti_faces/stage1/params.json

diff --git a/TODO.md b/TODO.md
index 5ea6cc5..b94e576 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,7 +1,3 @@
-* Fix pickle loading of ModelRawResults, because saving the model_object leads import issues.
-* Fix ModelFactory.load function.
 * Fix model results loading in compute_results.py.
 * Check that omp multiclasses classifier is working as expected.
-* In the bayesian search computation, output a different file name depending on the task of the trained model.
-* Check the best params scores of the regressors (neg_mean_squared_error leads to huge negative values).
-* Prepare the json experiment files to run.
\ No newline at end of file
+* Fix the dataset fetcher error that occurs when job_number > 1.
\ No newline at end of file
diff --git a/code/bolsonaro/data/dataset_loader.py b/code/bolsonaro/data/dataset_loader.py
index 01d71e2..8ffbc76 100644
--- a/code/bolsonaro/data/dataset_loader.py
+++ b/code/bolsonaro/data/dataset_loader.py
@@ -19,7 +19,7 @@ class DatasetLoader(object):
     DEFAULT_NORMALIZE_D = False
     DEFAULT_DATASET_NORMALIZER = 'standard'
     DEFAULT_FOREST_SIZE = 100
-    DEFAULT_EXTRACTED_FOREST_SIZE = 10
+    DEFAULT_EXTRACTED_FOREST_SIZE_SAMPLES = 4
     DEFAULT_DEV_SIZE = 0.2
     DEFAULT_TEST_SIZE = 0.2
     DEFAULT_RANDOM_SEED_NUMBER = 1
diff --git a/code/bolsonaro/utils.py b/code/bolsonaro/utils.py
index 9dff06a..797f300 100644
--- a/code/bolsonaro/utils.py
+++ b/code/bolsonaro/utils.py
@@ -79,7 +79,6 @@ def change_binary_func_load(base_load_function):
         return X, y
     return func_load
 
-
 @contextlib.contextmanager
 def tqdm_joblib(tqdm_object):
     """Context manager to patch joblib to report into tqdm progress bar given as argument"""
@@ -100,3 +99,17 @@ def tqdm_joblib(tqdm_object):
     finally:
         joblib.parallel.BatchCompletionCallBack = old_batch_callback
         tqdm_object.close()    
+
+def is_int(value):
+    try:
+        int(value)
+        return True
+    except ValueError:
+        return False
+
+def is_float(value):
+    try:
+        float(value)
+        return True
+    except ValueError:
+        return False
diff --git a/code/compute_hyperparameters.py b/code/compute_hyperparameters.py
index 548a1a4..9135c00 100644
--- a/code/compute_hyperparameters.py
+++ b/code/compute_hyperparameters.py
@@ -4,7 +4,7 @@ from bolsonaro.data.dataset_parameters import DatasetParameters
 from bolsonaro.data.task import Task
 from bolsonaro.error_handling.logger_factory import LoggerFactory
 from bolsonaro.hyperparameter_searcher import HyperparameterSearcher
-from bolsonaro.utils import save_obj_to_json, tqdm_joblib
+from bolsonaro.utils import save_obj_to_json, tqdm_joblib, is_int, is_float
 
 import argparse
 import os
@@ -68,7 +68,7 @@ def process_job(dataset_name, seed, param_space, args):
 def run_hyperparameter_search_jobs(seeds, dataset_name, param_space, args):
     # Run one hyperparameter search job per seed
     with tqdm_joblib(tqdm(total=len(seeds), disable=not args.verbose)) as progress_bar:
-        opt_results = Parallel(n_jobs=-1)(delayed(process_job)(
+        opt_results = Parallel(n_jobs=args.job_number)(delayed(process_job)(
             dataset_name, seeds[i], param_space, args) for i in range(len(seeds)))
     return opt_results
 
@@ -108,6 +108,10 @@ def compute_best_params_over_seeds(seeds, dataset_name, param_space, args):
             split = element.split('_')
             param, value = '_'.join(split[:-1]), split[-1]
             if param not in best_params:
+                if is_int(value):
+                    value = int(value)
+                elif is_float(value):
+                    value = float(value)
                 best_params[param] = value
         if len(best_params) == len(all_param_names):
             break
@@ -128,6 +132,7 @@ if __name__ == "__main__":
     DEFAULT_CV = 3
     DEFAULT_N_ITER = 50
     DEFAULT_VERBOSE = False
+    DEFAULT_JOB_NUMBER = -1
     DICT_PARAM_SPACE = {'n_estimators': Integer(10, 1000),
                         'min_samples_leaf': Integer(1, 1000),
                         'max_depth': Integer(1, 20),
@@ -144,6 +149,7 @@ if __name__ == "__main__":
     parser.add_argument('--use_variable_seed_number', action='store_true', default=DEFAULT_USE_VARIABLE_SEED_NUMBER, help='Compute the amount of random seeds depending on the dataset.')
     parser.add_argument('--datasets', nargs='+', type=str, default=DatasetLoader.dataset_names, help='Specify the dataset used by the estimator.')
     parser.add_argument('--verbose', action='store_true', default=DEFAULT_VERBOSE, help='Print tqdm progress bar.')
+    parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of job used during the parallelisation across seeds.')
     args = parser.parse_args()
 
     logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))
diff --git a/code/compute_results.py b/code/compute_results.py
index 64124af..5e65a77 100644
--- a/code/compute_results.py
+++ b/code/compute_results.py
@@ -16,147 +16,31 @@ if __name__ == "__main__":
 
     DEFAULT_RESULTS_DIR = os.environ["project_dir"] + os.sep + 'results'
     DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models'
-    DEFAULT_EXPERIMENT_IDS = None
 
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--stage_number', nargs='?', type=int, required=True, help='Specify the stage number among [1, 4].')
+    parser.add_argument('--experiment_ids', nargs='+', type=int, required=True, help='Compute the results of the specified experiment id(s).')
     parser.add_argument('--results_dir', nargs='?', type=str, default=DEFAULT_RESULTS_DIR, help='The output directory of the results.')
     parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
-    parser.add_argument('--experiment_ids', nargs='+', type=int, default=DEFAULT_EXPERIMENT_IDS, help='Compute the results of the specified experiment id(s)')
     args = parser.parse_args()
 
+    if int(args.stage_number) not in list(range(1, 5)):
+        raise ValueError('stage_number must be a supported stage id (i.e. [1, 4]).')
+
     # Create recursively the results dir tree
     pathlib.Path(args.results_dir).mkdir(parents=True, exist_ok=True)
 
-    """
-    Use specified list of experiments ids if availabe.
-    Otherwise, list all existing experiment ids from
-    the specified models directory.
-    """
-    experiments_ids = [str(experiment_id) for experiment_id in args.experiment_ids] \
-        if args.experiment_ids is not None \
-        else os.listdir(args.models_dir)
+    
 
     """
-    Raise an error if there's no experiments ids found both
-    in parameter or in models directory.
+    TODO:
+    For each dataset:
+    Stage 1) A figure for the selection of the best base forest model hyperparameters (best vs default/random hyperparams)
+    Stage 2) A figure for the selection of the best combination of normalization: D normalization vs weights normalization (4 combinations)
+    Stage 3) A figure for the selection of the most relevant subsets combination: train,dev vs train+dev,train+dev vs train,train+dev
+    Stage 4) A figure to finally compare the perf of our approach using the previous selected
+        parameters vs the baseline vs other papers using different extracted forest size
+        (percentage of the tree size found previously in best hyperparams search) on the abscissa.
+
+    IMPORTANT: Compare experiments that used the same seeds among them (except for stage 1).
     """
-    if experiments_ids is None or len(experiments_ids) == 0:
-        raise ValueError("No experiment id was found or specified.")
-
-    # Compute the plots for each experiment id
-    for experiment_id in experiments_ids:
-        experiment_id_path = args.models_dir + os.sep + experiment_id # models/{experiment_id}
-        # Create recursively the tree results/{experiment_id}
-        pathlib.Path(args.results_dir + os.sep + experiment_id).mkdir(parents=True, exist_ok=True)
-        experiment_seed_root_path = experiment_id_path + os.sep + 'seeds' # models/{experiment_id}/seeds
-
-        """
-        Dictionaries to temporarly store the scalar results with the following structure:
-        {seed_1: [score_1, ..., score_m], ... seed_n: [score_1, ..., score_k]}
-        TODO: to complete to retreive more results
-        """
-        experiment_train_scores = dict()
-        experiment_dev_scores = dict()
-        experiment_test_scores = dict()
-
-        experiment_weights = dict()
-
-        # Used to check if all losses were computed using the same metric (it should be the case)
-        experiment_score_metrics = list()
-
-        # For each seed results stored in models/{experiment_id}/seeds
-        for seed in os.listdir(experiment_seed_root_path):
-            experiment_seed_path = experiment_seed_root_path + os.sep + seed # models/{experiment_id}/seeds/{seed}
-            dataset_parameters = DatasetParameters.load(experiment_seed_path, experiment_id) # Load the dataset parameters of this experiment, with this specific seed
-            dataset = DatasetLoader.load(dataset_parameters) # Load the dataset using the previously loaded dataset parameters
-            extracted_forest_size_root_path = experiment_seed_path + os.sep + 'extracted_forest_size' # models/{experiment_id}/seeds/{seed}/extracted_forest_size
-
-            # {{seed}:[]}
-            experiment_train_scores[seed] = list()
-            experiment_dev_scores[seed] = list()
-            experiment_test_scores[seed] = list()
-
-            experiment_weights[seed] = list()
-
-            # List the forest sizes in models/{experiment_id}/seeds/{seed}/extracted_forest_size
-            extracted_forest_sizes = os.listdir(extracted_forest_size_root_path)
-            for extracted_forest_size in extracted_forest_sizes:
-                # models/{experiment_id}/seeds/{seed}/extracted_forest_size/{extracted_forest_size}
-                extracted_forest_size_path = extracted_forest_size_root_path + os.sep + extracted_forest_size
-                # Load models/{experiment_id}/seeds/{seed}/extracted_forest_size/{extracted_forest_size}/model_raw_results.pickle file
-                model_raw_results = ModelRawResults.load(extracted_forest_size_path)
-                # Load [...]/model_parameters.json file and build the model using these parameters and the weights and forest from model_raw_results.pickle
-                model = ModelFactory.load(dataset.task, extracted_forest_size_path, experiment_id, model_raw_results)
-                # Save temporarly some raw results (TODO: to complete to retreive more results)
-                # Save the scores
-                experiment_train_scores[seed].append(model_raw_results.train_score)
-                experiment_dev_scores[seed].append(model_raw_results.dev_score)
-                experiment_test_scores[seed].append(model_raw_results.test_score)
-                # Save the weights
-                experiment_weights[seed].append(model_raw_results.weights)
-                # Save the metric
-                experiment_score_metrics.append(model_raw_results.score_metric)
-
-        if len(set(experiment_score_metrics)) > 1:
-            raise ValueError("The metrics used to compute the dev score aren't the same everytime")
-
-        """
-        Example of plot that just plots the losses computed
-        on the train, dev and test subsets using a trained
-        model, with the CI, and depending on the extracted
-        forest size.
-        """
-        Plotter.plot_losses(
-            file_path=args.results_dir + os.sep + experiment_id + os.sep + 'losses.png',
-            all_experiment_scores=[experiment_train_scores, experiment_dev_scores, experiment_test_scores],
-            x_value=extracted_forest_sizes,
-            xlabel='Number of trees extracted',
-            ylabel=experiment_score_metrics[0],
-            all_labels=['train', 'dev', 'test'],
-            title='Loss values of the trained model'
-        )
-
-        """
-        TODO:
-        For each dataset:
-        Stage 1) A figure for the selection of the best base forest model hyperparameters (best vs default/random hyperparams)
-        Stage 2) A figure for the selection of the best dataset normalization method
-        Stage 3) A figure for the selection of the best combination of dataset: normalization vs D normalization vs weights normalization
-        Stage 4) A figure for the selection of the most relevant subsets combination: train,dev vs train+dev,train+dev vs train,train+dev
-        Stage 5) A figure for the selection of the best extracted forest size?
-        Stage 6) A figure to finally compare the perf of our approach using the previous selected parameters vs the baseline vs other papers
-
-        Stage 3)
-        In all axis:
-        - untrained forest
-        - trained base forest (straight line cause it doesn't depend on the number of extracted trees)
-
-        Axis 1:
-        - test with forest on train+dev and OMP on train+dev
-        - test with forest on train+dev and OMP on train+dev with dataset normalization
-        - test with forest on train+dev and OMP on train+dev with dataset normalization + D normalization
-        - test with forest on train+dev and OMP on train+dev with dataset normalization + weights normalization
-        - test with forest on train+dev and OMP on train+dev with dataset normalization + D normalization + weights normalization
-
-        Axis 2:
-        - test with forest on train and OMP on dev
-        - test with forest on train and OMP on dev with dataset normalization
-        - test with forest on train and OMP on dev with dataset normalization + D normalization
-        - test with forest on train and OMP on dev with dataset normalization + weights normalization
-        - test with forest on train and OMP on dev with dataset normalization + D normalization + weights normalization
-
-        Axis 3:
-        - test with forest on train and OMP train+dev
-        - test with forest on train and OMP train+dev with dataset normalization
-        - test with forest on train and OMP train+dev with dataset normalization + D normalization
-        - test with forest on train and OMP train+dev with dataset normalization + weights normalization
-        - test with forest on train and OMP train+dev with dataset normalization + D normalization + weights normalization
-
-        IMPORTANT: Same seeds used in all axis.
-        """
-
-        # Plot the density of the weights
-        Plotter.weight_density(
-            file_path=args.results_dir + os.sep + experiment_id + os.sep + 'density_weight.png',
-            all_experiment_weights=experiment_weights
-        )
diff --git a/code/train.py b/code/train.py
index 38f2887..13216ac 100644
--- a/code/train.py
+++ b/code/train.py
@@ -17,6 +17,7 @@ from joblib import Parallel, delayed
 import threading
 import json
 from tqdm import tqdm
+import numpy as np
 
 
 def process_job(seed, parameters, experiment_id, hyperparameters):
@@ -82,6 +83,8 @@ if __name__ == "__main__":
     # the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_size/{nb_extracted_trees}
     DEFAULT_MODELS_DIR = os.environ['project_dir'] + os.sep + 'models'
     DEFAULT_VERBOSE = False
+    DEFAULT_SKIP_BEST_HYPERPARAMS = False
+    DEFAULT_JOB_NUMBER = -1
 
     begin_random_seed_range = 1
     end_random_seed_range = 2000
@@ -92,8 +95,8 @@ if __name__ == "__main__":
     parser.add_argument('--dataset_name', nargs='?', type=str, default=DatasetLoader.DEFAULT_DATASET_NAME, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
     parser.add_argument('--normalize_D', action='store_true', default=DatasetLoader.DEFAULT_NORMALIZE_D, help='Specify if we want to normalize the prediction of the forest by doing the L2 division of the pred vectors.')
     parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=DatasetLoader.DEFAULT_DATASET_NORMALIZER, help='Specify which dataset normalizer use (either standard, minmax, robust or normalizer).')
-    parser.add_argument('--forest_size', nargs='?', type=int, default=DatasetLoader.DEFAULT_FOREST_SIZE, help='The number of trees of the random forest.')
-    parser.add_argument('--extracted_forest_size', nargs='+', type=int, default=DatasetLoader.DEFAULT_EXTRACTED_FOREST_SIZE, help='The number of trees selected by OMP.')
+    parser.add_argument('--forest_size', nargs='?', type=int, default=None, help='The number of trees of the random forest.')
+    parser.add_argument('--extracted_forest_size_samples', nargs='?', type=int, default=DatasetLoader.DEFAULT_EXTRACTED_FOREST_SIZE_SAMPLES, help='The number of extracted forest sizes (proportional to the forest size) selected by OMP.')
     parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
     parser.add_argument('--dev_size', nargs='?', type=float, default=DatasetLoader.DEFAULT_DEV_SIZE, help='Dev subset ratio.')
     parser.add_argument('--test_size', nargs='?', type=float, default=DatasetLoader.DEFAULT_TEST_SIZE, help='Test subset ratio.')
@@ -102,6 +105,9 @@ if __name__ == "__main__":
     parser.add_argument('--subsets_used', nargs='+', type=str, default=DatasetLoader.DEFAULT_SUBSETS_USED, help='train,dev: forest on train, OMP on dev. train+dev,train+dev: both forest and OMP on train+dev. train,train+dev: forest on train+dev and OMP on dev.')
     parser.add_argument('--normalize_weights', action='store_true', default=DatasetLoader.DEFAULT_NORMALIZE_WEIGHTS, help='Divide the predictions by the weights sum.')
     parser.add_argument('--verbose', action='store_true', default=DEFAULT_VERBOSE, help='Print tqdm progress bar.')
+    parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters if there exist.')
+    parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line in a file. Args: {{stage_num}} {{name}}')
+    parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of job used during the parallelisation across seeds.')
     args = parser.parse_args()
 
     if args.experiment_configuration:
@@ -115,22 +121,31 @@ if __name__ == "__main__":
 
     logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))
 
-    # The number of tree to extract from forest (K)
-    parameters['extracted_forest_size'] = parameters['extracted_forest_size'] \
-        if type(parameters['extracted_forest_size']) == list \
-        else [parameters['extracted_forest_size']]
-
     hyperparameters_path = os.path.join('experiments', args.dataset_name, 'stage1', 'params.json')
-    if os.path.exists(hyperparameters_path):
+    if os.path.exists(hyperparameters_path) and not args.skip_best_hyperparams:
         logger.info("Hyperparameters found for this dataset at '{}'".format(hyperparameters_path))
         with open(hyperparameters_path, 'r+') as file_hyperparameter:
             hyperparameters = json.load(file_hyperparameter)['best_parameters']
     else:
         hyperparameters = {}
 
-    if parameters['forest_size'] is not None:
+    """
+    First case: no best hyperparameters are available and no forest_size argument
+    was given, so use DEFAULT_FOREST_SIZE.
+    Second case: whether or not best hyperparameters are available, an explicit
+    forest_size argument overrides their n_estimators value.
+    Third (implicit) case: use the n_estimators value found in the best hyperparameters.
+    """
+    if len(hyperparameters) == 0 and parameters['forest_size'] is None:
+        hyperparameters['n_estimators'] = DatasetLoader.DEFAULT_FOREST_SIZE
+    elif parameters['forest_size'] is not None:
         hyperparameters['n_estimators'] = parameters['forest_size']
 
+    # The number of tree to extract from forest (K)
+    parameters['extracted_forest_size'] = [int(hyperparameters['n_estimators'] * coeff) \
+         for coeff in np.linspace(0, 1, parameters['extracted_forest_size_samples'] + 1,
+         endpoint=False)[1:]]
+
     if parameters['seeds'] != None and parameters['random_seed_number'] > 1:
         logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')    
 
@@ -142,15 +157,29 @@ if __name__ == "__main__":
     # Resolve the next experiment id number (last id + 1)
     experiment_id = resolve_experiment_id(parameters['models_dir'])
     logger.info('Experiment id: {}'.format(experiment_id))
+    parameters['experiment_id'] = experiment_id
 
     """
     If the experiment configuration isn't coming from
     an already existing file, save it to a json file to
-    keep trace of it.
+    keep track of it (either at a specified path or in the 'unnamed' directory).
     """
     if args.experiment_configuration is None:
-        with open(args.experiment_configuration_path + os.sep + 'unnamed' + os.sep + 'unnamed_{}.json'.format(
-            experiment_id), 'w') as output_file:
+        if args.save_experiment_configuration:
+            if len(args.save_experiment_configuration) != 2:
+                raise ValueError('save_experiment_configuration must have two parameters.')
+            elif int(args.save_experiment_configuration[0]) not in list(range(1, 5)):
+                raise ValueError('save_experiment_configuration first parameter must be a supported stage id (i.e. [1, 4]).')
+            output_experiment_configuration_path = os.path.join(args.experiment_configuration_path,
+                args.dataset_name, 'stage' + args.save_experiment_configuration[0],
+                args.save_experiment_configuration[1] + '_{}.json'.format(
+                    experiment_id))
+        else:
+            pathlib.Path(os.path.join(args.experiment_configuration_path, 'unnamed')).mkdir(parents=True, exist_ok=True)
+            output_experiment_configuration_path = os.path.join(
+                args.experiment_configuration_path, 'unnamed', 'unnamed_{}.json'.format(
+                experiment_id))
+        with open(output_experiment_configuration_path, 'w') as output_file:
             json.dump(
                 parameters,
                 output_file,
@@ -159,5 +188,5 @@ if __name__ == "__main__":
 
     # Run as much job as there are seeds
     with tqdm_joblib(tqdm(total=len(seeds), disable=not args.verbose)) as progress_bar:
-        Parallel(n_jobs=-1)(delayed(process_job)(seeds[i],
+        Parallel(n_jobs=args.job_number)(delayed(process_job)(seeds[i],
             parameters, experiment_id, hyperparameters) for i in range(len(seeds)))
diff --git a/experiments/boston/stage1/params.json b/experiments/boston/stage1/params.json
index f2f3abb..8b530a1 100644
--- a/experiments/boston/stage1/params.json
+++ b/experiments/boston/stage1/params.json
@@ -4,9 +4,9 @@
     "best_score_test": -13.650326577972058,
     "best_parameters": {
         "max_features": "auto",
-        "min_samples_leaf": "1",
-        "max_depth": "20",
-        "n_estimators": "1000"
+        "min_samples_leaf": 1,
+        "max_depth": 20,
+        "n_estimators": 1000
     },
     "random_seed": [
         1812,
diff --git a/experiments/breast_cancer/stage1/params.json b/experiments/breast_cancer/stage1/params.json
index 43739c2..d2bca84 100644
--- a/experiments/breast_cancer/stage1/params.json
+++ b/experiments/breast_cancer/stage1/params.json
@@ -3,9 +3,9 @@
     "best_score_train": 0.9562271062271059,
     "best_score_test": 0.9514619883040936,
     "best_parameters": {
-        "max_depth": "20",
-        "min_samples_leaf": "1",
-        "n_estimators": "1000",
+        "max_depth": 20,
+        "min_samples_leaf": 1,
+        "n_estimators": 1000,
         "max_features": "log2"
     },
     "random_seed": [
diff --git a/experiments/california_housing/stage1/params.json b/experiments/california_housing/stage1/params.json
new file mode 100644
index 0000000..617c93c
--- /dev/null
+++ b/experiments/california_housing/stage1/params.json
@@ -0,0 +1,16 @@
+{
+    "scorer": "neg_mean_squared_error",
+    "best_score_train": -0.2535049905518054,
+    "best_score_test": -0.24128661227361273,
+    "best_parameters": {
+        "max_features": "log2",
+        "min_samples_leaf": 1,
+        "n_estimators": 1000,
+        "max_depth": 18
+    },
+    "random_seed": [
+        1012,
+        529,
+        42
+    ]
+}
\ No newline at end of file
diff --git a/experiments/diabetes/stage1/params.json b/experiments/diabetes/stage1/params.json
index 472e738..2ade87a 100644
--- a/experiments/diabetes/stage1/params.json
+++ b/experiments/diabetes/stage1/params.json
@@ -4,9 +4,9 @@
     "best_score_test": -3305.635542701523,
     "best_parameters": {
         "max_features": "auto",
-        "min_samples_leaf": "1",
-        "max_depth": "15",
-        "n_estimators": "108"
+        "min_samples_leaf": 1,
+        "max_depth": 15,
+        "n_estimators": 108
     },
     "random_seed": [
         661,
diff --git a/experiments/digits/stage1/params.json b/experiments/digits/stage1/params.json
index 2cf1c4b..845a1ae 100644
--- a/experiments/digits/stage1/params.json
+++ b/experiments/digits/stage1/params.json
@@ -4,9 +4,9 @@
     "best_score_test": 0.9738888888888889,
     "best_parameters": {
         "max_features": "sqrt",
-        "min_samples_leaf": "1",
-        "n_estimators": "1000",
-        "max_depth": "20"
+        "min_samples_leaf": 1,
+        "n_estimators": 1000,
+        "max_depth": 20
     },
     "random_seed": [
         1,
diff --git a/experiments/iris/stage1/params.json b/experiments/iris/stage1/params.json
index a91c658..e3eacac 100644
--- a/experiments/iris/stage1/params.json
+++ b/experiments/iris/stage1/params.json
@@ -4,9 +4,9 @@
     "best_score_test": 0.9155555555555556,
     "best_parameters": {
         "max_features": "sqrt",
-        "min_samples_leaf": "1",
-        "max_depth": "1",
-        "n_estimators": "1000"
+        "min_samples_leaf": 1,
+        "max_depth": 1,
+        "n_estimators": 1000
     },
     "random_seed": [
         771,
diff --git a/experiments/iris/stage1/with_best_params_16.json b/experiments/iris/stage1/with_best_params_16.json
new file mode 100644
index 0000000..102999a
--- /dev/null
+++ b/experiments/iris/stage1/with_best_params_16.json
@@ -0,0 +1,36 @@
+{
+    "experiment_configuration": null,
+    "experiment_configuration_path": "experiments",
+    "dataset_name": "iris",
+    "normalize_D": false,
+    "dataset_normalizer": "standard",
+    "forest_size": null,
+    "extracted_forest_size_samples": 4,
+    "models_dir": ".\\models",
+    "dev_size": 0.2,
+    "test_size": 0.2,
+    "random_seed_number": 1,
+    "seeds": [
+        1,
+        2,
+        3,
+        4,
+        5
+    ],
+    "subsets_used": "train,dev",
+    "normalize_weights": false,
+    "verbose": false,
+    "skip_best_hyperparams": false,
+    "save_experiment_configuration": [
+        "1",
+        "with_best_params"
+    ],
+    "job_number": -1,
+    "extracted_forest_size": [
+        200,
+        400,
+        600,
+        800
+    ],
+    "experiment_id": 16
+}
\ No newline at end of file
diff --git a/experiments/iris/stage1/wo_best_params_17.json b/experiments/iris/stage1/wo_best_params_17.json
new file mode 100644
index 0000000..0294d64
--- /dev/null
+++ b/experiments/iris/stage1/wo_best_params_17.json
@@ -0,0 +1,36 @@
+{
+    "experiment_configuration": null,
+    "experiment_configuration_path": "experiments",
+    "dataset_name": "iris",
+    "normalize_D": false,
+    "dataset_normalizer": "standard",
+    "forest_size": null,
+    "extracted_forest_size_samples": 4,
+    "models_dir": ".\\models",
+    "dev_size": 0.2,
+    "test_size": 0.2,
+    "random_seed_number": 1,
+    "seeds": [
+        1,
+        2,
+        3,
+        4,
+        5
+    ],
+    "subsets_used": "train,dev",
+    "normalize_weights": false,
+    "verbose": false,
+    "skip_best_hyperparams": true,
+    "save_experiment_configuration": [
+        "1",
+        "wo_best_params"
+    ],
+    "job_number": -1,
+    "extracted_forest_size": [
+        20,
+        40,
+        60,
+        80
+    ],
+    "experiment_id": 17
+}
\ No newline at end of file
diff --git a/experiments/linnerud/stage1/params.json b/experiments/linnerud/stage1/params.json
index a1573d3..7e45dc0 100644
--- a/experiments/linnerud/stage1/params.json
+++ b/experiments/linnerud/stage1/params.json
@@ -3,10 +3,10 @@
     "best_score_train": -223.81438159498393,
     "best_score_test": -262.4415311793658,
     "best_parameters": {
-        "max_depth": "1",
-        "min_samples_leaf": "1",
+        "max_depth": 1,
+        "min_samples_leaf": 1,
         "max_features": "sqrt",
-        "n_estimators": "1000"
+        "n_estimators": 1000
     },
     "random_seed": [
         1109,
diff --git a/experiments/olivetti_faces/stage1/params.json b/experiments/olivetti_faces/stage1/params.json
new file mode 100644
index 0000000..c9d83bf
--- /dev/null
+++ b/experiments/olivetti_faces/stage1/params.json
@@ -0,0 +1,28 @@
+{
+    "scorer": "accuracy",
+    "best_score_train": 0.8890625,
+    "best_score_test": 0.89,
+    "best_parameters": {
+        "max_features": "log2",
+        "min_samples_leaf": 1,
+        "n_estimators": 1000,
+        "max_depth": 18
+    },
+    "random_seed": [
+        899,
+        249,
+        1367,
+        942,
+        846,
+        1576,
+        285,
+        839,
+        1974,
+        1216,
+        540,
+        1292,
+        1642,
+        712,
+        1511
+    ]
+}
\ No newline at end of file
diff --git a/experiments/wine/stage1/params.json b/experiments/wine/stage1/params.json
index 99795a6..2ede59c 100644
--- a/experiments/wine/stage1/params.json
+++ b/experiments/wine/stage1/params.json
@@ -3,9 +3,9 @@
     "best_score_train": 0.9846607669616517,
     "best_score_test": 0.9796296296296295,
     "best_parameters": {
-        "max_depth": "20",
-        "min_samples_leaf": "1",
-        "n_estimators": "1000",
+        "max_depth": 20,
+        "min_samples_leaf": 1,
+        "n_estimators": 1000,
         "max_features": "log2"
     },
     "random_seed": [
-- 
GitLab