- Add command lines for stage2 experiments;

- Fix possible issues in the extracted forest sizes computation: round the sizes (np.around) to reduce possible zeroes, and remove duplicates (np.unique); - Create the output experiment stage directory if it does not exist; - Add base_score_metric to the model raw results class; - Add best params for lfw_pairs (maybe try with a larger number of random seeds, since the score is not that high).

- Add command lines for stage2 experiments;
58061ea4 · Charly Lamothe · 17d3addc · 58061ea4 · 58061ea4 · 58061ea4
Commit 58061ea4 authored 5 years ago by Charly Lamothe
--- a/code/bolsonaro/models/model_raw_results.py
+++ b/code/bolsonaro/models/model_raw_results.py
@@ -8,8 +8,8 @@ class ModelRawResults(object):
    def __init__(self, model_object, training_time,
        datetime, train_score, dev_score, test_score,
-        score_metric, train_score_base, dev_score_base,
+        train_score_base, dev_score_base,
-        test_score_base):
+        test_score_base, score_metric, base_score_metric):
        self._model_object = model_object
        self._training_time = training_time
@@ -17,10 +17,11 @@ class ModelRawResults(object):
        self._train_score = train_score
        self._dev_score = dev_score
        self._test_score = test_score
-        self._score_metric = score_metric
        self._train_score_base = train_score_base
        self._dev_score_base = dev_score_base
        self._test_score_base = test_score_base
+        self._score_metric = score_metric
+        self._base_score_metric = base_score_metric
    @property
    def model_object(self):
@@ -46,10 +47,6 @@ class ModelRawResults(object):
    def test_score(self):
        return self._test_score
-    @property
-    def score_metric(self):
-        return self._score_metric
    @property
    def train_score_base(self):
        return self._train_score_base
@@ -62,6 +59,14 @@ class ModelRawResults(object):
    def test_score_base(self):
        return self._test_score_base
+    @property
+    def score_metric(self):
+        return self._score_metric
+    @property
+    def base_score_metric(self):
+        return self._base_score_metric
    def save(self, models_dir):
        save_obj_to_pickle(models_dir + os.sep + 'model_raw_results.pickle',
            self.__dict__)

--- a/code/train.py
+++ b/code/train.py
@@ -101,13 +101,20 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
 """
 Example for stage 1:
-python code/train.py --dataset_name=california_housing --seeds 1 2 3 --extraction_strategy=none --save_experiment_configuration 1 none_with_params
+python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=none --save_experiment_configuration 1 none_with_params --extracted_forest_size_stop=0.05
-python code/train.py --dataset_name=california_housing --seeds 1 2 3 --extraction_strategy=random --save_experiment_configuration 1 random_with_params
+python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=random --save_experiment_configuration 1 random_with_params --extracted_forest_size_stop=0.05
-python code/train.py --dataset_name=california_housing --seeds 1 2 3 --save_experiment_configuration 1 omp_with_params
+python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 1 omp_with_params --extracted_forest_size_stop=0.05
-python code/train.py --dataset_name=california_housing --seeds 1 2 3 --extraction_strategy=none --skip_best_hyperparams --save_experiment_configuration 1 none_wo_params
+python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=none --skip_best_hyperparams --save_experiment_configuration 1 none_wo_params --forest_size=1000 --extracted_forest_size_stop=0.05
-python code/train.py --dataset_name=california_housing --seeds 1 2 3 --extraction_strategy=random --skip_best_hyperparams --save_experiment_configuration 1 random_wo_params
+python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=random --skip_best_hyperparams --save_experiment_configuration 1 random_wo_params --forest_size=1000 --extracted_forest_size_stop=0.05
-python code/train.py --dataset_name=california_housing --seeds 1 2 3 --skip_best_hyperparams --save_experiment_configuration 1 omp_wo_params
+python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --skip_best_hyperparams --save_experiment_configuration 1 omp_wo_params --forest_size=1000 --extracted_forest_size_stop=0.05
 python code/compute_results.py --stage 1 --experiment_ids 1 2 3 4 5 6 --dataset_name=california_housing --extracted_forest_sizes_number=5
+Example for stage 2:
+python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 no_normalization --extracted_forest_size_stop=0.05
+python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 normalize_D --normalize_D --extracted_forest_size_stop=0.05
+python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 normalize_weights --normalize_weights --extracted_forest_size_stop=0.05
+python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 normalize_D_and_weights --normalize_D --normalize_weights --extracted_forest_size_stop=0.05
+python code/compute_results.py --stage 2 --experiment_ids 7 8 9 10 --dataset_name=california_housing --extracted_forest_sizes_number=5
 """
 if __name__ == "__main__":
    load_dotenv(find_dotenv('.env'))
@@ -184,10 +191,10 @@ if __name__ == "__main__":
        hyperparameters['n_estimators'] = parameters['forest_size']
    # The number of tree to extract from forest (K)
-    parameters['extracted_forest_size'] = (hyperparameters['n_estimators'] *
+    parameters['extracted_forest_size'] = np.unique(np.around(hyperparameters['n_estimators'] *
        np.linspace(0, args.extracted_forest_size_stop,
        parameters['extracted_forest_size_samples'] + 1,
-        endpoint=False)[1:]).astype(np.int).tolist()
+        endpoint=False)[1:]).astype(np.int)).tolist()
    if parameters['seeds'] != None and parameters['random_seed_number'] > 1:
        logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')    
@@ -213,8 +220,10 @@ if __name__ == "__main__":
                raise ValueError('save_experiment_configuration must have two parameters.')
            elif int(args.save_experiment_configuration[0]) not in list(range(1, 5)):
                raise ValueError('save_experiment_configuration first parameter must be a supported stage id (i.e. [1, 4]).')
-            output_experiment_configuration_path = os.path.join(args.experiment_configuration_path,
+            output_experiment_stage_path = os.path.join(args.experiment_configuration_path,
-                args.dataset_name, 'stage' + args.save_experiment_configuration[0],
+                args.dataset_name, 'stage' + args.save_experiment_configuration[0])
+            pathlib.Path(output_experiment_stage_path).mkdir(parents=True, exist_ok=True)
+            output_experiment_configuration_path = os.path.join(output_experiment_stage_path,
                args.save_experiment_configuration[1] + '_{}.json'.format(
                    experiment_id))
        else:

--- a/experiments/lfw_pairs/stage1/params.json
+++ b/experiments/lfw_pairs/stage1/params.json
+{
+    "scorer": "accuracy",
+    "best_score_train": 0.6231060606060606,
+    "best_score_test": 0.6174242424242423,
+    "best_parameters": {
+        "min_samples_leaf": 1,
+        "n_estimators": 1000,
+        "max_depth": 16,
+        "max_features": "auto"
+    },
+    "random_seed": [
+        226,
+        674,
+        1639
+    ]
+}
\ No newline at end of file