From fd6dbc7bb72c2a00ba9d83c4b65f4ed9db028c4d Mon Sep 17 00:00:00 2001
From: Charly Lamothe <charly.lamothe@univ-amu.fr>
Date: Wed, 18 Dec 2019 03:01:57 +0100
Subject: [PATCH] POC of possible wrong way to compute best hyperparams. Are
 these the best only before the application of OMP extraction?

---
 code/bolsonaro/data/dataset_loader.py         |  3 +-
 code/compute_results.py                       | 70 ++++++++++++++++++-
 code/train.py                                 |  8 ++-
 .../iris/stage1/with_best_params_16.json      | 36 ----------
 .../iris/stage1/wo_best_params_17.json        | 36 ----------
 5 files changed, 75 insertions(+), 78 deletions(-)
 delete mode 100644 experiments/iris/stage1/with_best_params_16.json
 delete mode 100644 experiments/iris/stage1/wo_best_params_17.json

diff --git a/code/bolsonaro/data/dataset_loader.py b/code/bolsonaro/data/dataset_loader.py
index 8ffbc76..6f85998 100644
--- a/code/bolsonaro/data/dataset_loader.py
+++ b/code/bolsonaro/data/dataset_loader.py
@@ -19,7 +19,8 @@ class DatasetLoader(object):
     DEFAULT_NORMALIZE_D = False
     DEFAULT_DATASET_NORMALIZER = 'standard'
     DEFAULT_FOREST_SIZE = 100
-    DEFAULT_EXTRACTED_FOREST_SIZE_SAMPLES = 4
+    DEFAULT_EXTRACTED_FOREST_SIZE_SAMPLES = 10
+    DEFAULT_EXTRACTED_FOREST_SIZE_STOP = 0.3
     DEFAULT_DEV_SIZE = 0.2
     DEFAULT_TEST_SIZE = 0.2
     DEFAULT_RANDOM_SEED_NUMBER = 1
diff --git a/code/compute_results.py b/code/compute_results.py
index 5e65a77..3ff7a66 100644
--- a/code/compute_results.py
+++ b/code/compute_results.py
@@ -24,13 +24,79 @@ if __name__ == "__main__":
     parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
     args = parser.parse_args()
 
-    if int(args.stage_number) not in list(range(1, 5)):
+    if args.stage_number not in list(range(1, 5)):
         raise ValueError('stage_number must be a supported stage id (i.e. [1, 4]).')
 
     # Create recursively the results dir tree
     pathlib.Path(args.results_dir).mkdir(parents=True, exist_ok=True)
 
-    
+    if args.stage_number == 1:
+        for experiment_id in args.experiment_ids:
+            experiment_id_path = args.models_dir + os.sep + str(experiment_id) # models/{experiment_id}
+            # Create recursively the tree results/{experiment_id}
+            pathlib.Path(args.results_dir + os.sep + str(experiment_id)).mkdir(parents=True, exist_ok=True)
+            experiment_seed_root_path = experiment_id_path + os.sep + 'seeds' # models/{experiment_id}/seeds
+
+            """
+            Dictionaries to temporarily store the scalar results with the following structure:
+            {seed_1: [score_1, ..., score_m], ... seed_n: [score_1, ..., score_k]}
+            TODO: complete to retrieve more results
+            """
+            experiment_train_scores = dict()
+            experiment_dev_scores = dict()
+            experiment_test_scores = dict()
+
+            # Used to check if all losses were computed using the same metric (it should be the case)
+            experiment_score_metrics = list()
+
+            # For each seed results stored in models/{experiment_id}/seeds
+            for seed in os.listdir(experiment_seed_root_path):
+                experiment_seed_path = experiment_seed_root_path + os.sep + seed # models/{experiment_id}/seeds/{seed}
+                dataset_parameters = DatasetParameters.load(experiment_seed_path, experiment_id) # Load the dataset parameters of this experiment, with this specific seed
+                dataset = DatasetLoader.load(dataset_parameters) # Load the dataset using the previously loaded dataset parameters
+                extracted_forest_size_root_path = experiment_seed_path + os.sep + 'extracted_forest_size' # models/{experiment_id}/seeds/{seed}/extracted_forest_size
+
+                # {{seed}:[]}
+                experiment_train_scores[seed] = list()
+                experiment_dev_scores[seed] = list()
+                experiment_test_scores[seed] = list()
+
+                # List the forest sizes in models/{experiment_id}/seeds/{seed}/extracted_forest_size
+                extracted_forest_sizes = os.listdir(extracted_forest_size_root_path)
+                extracted_forest_sizes.sort(key=int)
+                for extracted_forest_size in extracted_forest_sizes:
+                    # models/{experiment_id}/seeds/{seed}/extracted_forest_size/{extracted_forest_size}
+                    extracted_forest_size_path = extracted_forest_size_root_path + os.sep + extracted_forest_size
+                    # Load models/{experiment_id}/seeds/{seed}/extracted_forest_size/{extracted_forest_size}/model_raw_results.pickle file
+                    model_raw_results = ModelRawResults.load(extracted_forest_size_path)
+                    # Save temporarily some raw results (TODO: complete to retrieve more results)
+                    # Save the scores
+                    experiment_train_scores[seed].append(model_raw_results.train_score)
+                    experiment_dev_scores[seed].append(model_raw_results.dev_score)
+                    experiment_test_scores[seed].append(model_raw_results.test_score)
+                    # Save the metric
+                    experiment_score_metrics.append(model_raw_results.score_metric)
+
+            if len(set(experiment_score_metrics)) > 1:
+                raise ValueError("The metrics used to compute the dev score aren't the same everytime")
+
+            """
+            Example of plot that just plots the losses computed
+            on the train, dev and test subsets using a trained
+            model, with the CI, and depending on the extracted
+            forest size.
+            """
+            Plotter.plot_losses(
+                file_path=args.results_dir + os.sep + str(experiment_id) + os.sep + 'losses.png',
+                all_experiment_scores=[experiment_train_scores, experiment_dev_scores, experiment_test_scores],
+                x_value=extracted_forest_sizes,
+                xlabel='Number of trees extracted',
+                ylabel=experiment_score_metrics[0],
+                all_labels=['train', 'dev', 'test'],
+                title='Loss values of the trained model'
+            )
+    else:
+        raise ValueError('This stage number is not supported yet, but it will be!')
 
     """
     TODO:
diff --git a/code/train.py b/code/train.py
index 13216ac..81d430e 100644
--- a/code/train.py
+++ b/code/train.py
@@ -97,6 +97,7 @@ if __name__ == "__main__":
     parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=DatasetLoader.DEFAULT_DATASET_NORMALIZER, help='Specify which dataset normalizer use (either standard, minmax, robust or normalizer).')
     parser.add_argument('--forest_size', nargs='?', type=int, default=None, help='The number of trees of the random forest.')
     parser.add_argument('--extracted_forest_size_samples', nargs='?', type=int, default=DatasetLoader.DEFAULT_EXTRACTED_FOREST_SIZE_SAMPLES, help='The number of extracted forest sizes (proportional to the forest size) selected by OMP.')
+    parser.add_argument('--extracted_forest_size_stop', nargs='?', type=float, default=DatasetLoader.DEFAULT_EXTRACTED_FOREST_SIZE_STOP, help='Specify the upper bound of the extracted forest sizes linspace.')
     parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
     parser.add_argument('--dev_size', nargs='?', type=float, default=DatasetLoader.DEFAULT_DEV_SIZE, help='Dev subset ratio.')
     parser.add_argument('--test_size', nargs='?', type=float, default=DatasetLoader.DEFAULT_TEST_SIZE, help='Test subset ratio.')
@@ -142,9 +143,10 @@ if __name__ == "__main__":
         hyperparameters['n_estimators'] = parameters['forest_size']
 
     # The number of tree to extract from forest (K)
-    parameters['extracted_forest_size'] = [int(hyperparameters['n_estimators'] * coeff) \
-         for coeff in np.linspace(0, 1, parameters['extracted_forest_size_samples'] + 1,
-         endpoint=False)[1:]]
+    parameters['extracted_forest_size'] = (hyperparameters['n_estimators'] *
+        np.linspace(0, args.extracted_forest_size_stop,
+        parameters['extracted_forest_size_samples'] + 1,
+        endpoint=False)[1:]).astype(np.int).tolist()
 
     if parameters['seeds'] != None and parameters['random_seed_number'] > 1:
         logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')    
diff --git a/experiments/iris/stage1/with_best_params_16.json b/experiments/iris/stage1/with_best_params_16.json
deleted file mode 100644
index 102999a..0000000
--- a/experiments/iris/stage1/with_best_params_16.json
+++ /dev/null
@@ -1,36 +0,0 @@
-{
-    "experiment_configuration": null,
-    "experiment_configuration_path": "experiments",
-    "dataset_name": "iris",
-    "normalize_D": false,
-    "dataset_normalizer": "standard",
-    "forest_size": null,
-    "extracted_forest_size_samples": 4,
-    "models_dir": ".\\models",
-    "dev_size": 0.2,
-    "test_size": 0.2,
-    "random_seed_number": 1,
-    "seeds": [
-        1,
-        2,
-        3,
-        4,
-        5
-    ],
-    "subsets_used": "train,dev",
-    "normalize_weights": false,
-    "verbose": false,
-    "skip_best_hyperparams": false,
-    "save_experiment_configuration": [
-        "1",
-        "with_best_params"
-    ],
-    "job_number": -1,
-    "extracted_forest_size": [
-        200,
-        400,
-        600,
-        800
-    ],
-    "experiment_id": 16
-}
\ No newline at end of file
diff --git a/experiments/iris/stage1/wo_best_params_17.json b/experiments/iris/stage1/wo_best_params_17.json
deleted file mode 100644
index 0294d64..0000000
--- a/experiments/iris/stage1/wo_best_params_17.json
+++ /dev/null
@@ -1,36 +0,0 @@
-{
-    "experiment_configuration": null,
-    "experiment_configuration_path": "experiments",
-    "dataset_name": "iris",
-    "normalize_D": false,
-    "dataset_normalizer": "standard",
-    "forest_size": null,
-    "extracted_forest_size_samples": 4,
-    "models_dir": ".\\models",
-    "dev_size": 0.2,
-    "test_size": 0.2,
-    "random_seed_number": 1,
-    "seeds": [
-        1,
-        2,
-        3,
-        4,
-        5
-    ],
-    "subsets_used": "train,dev",
-    "normalize_weights": false,
-    "verbose": false,
-    "skip_best_hyperparams": true,
-    "save_experiment_configuration": [
-        "1",
-        "wo_best_params"
-    ],
-    "job_number": -1,
-    "extracted_forest_size": [
-        20,
-        40,
-        60,
-        80
-    ],
-    "experiment_id": 17
-}
\ No newline at end of file
-- 
GitLab