From fd6dbc7bb72c2a00ba9d83c4b65f4ed9db028c4d Mon Sep 17 00:00:00 2001
From: Charly Lamothe <charly.lamothe@univ-amu.fr>
Date: Wed, 18 Dec 2019 03:01:57 +0100
Subject: [PATCH] POC of a possibly wrong way to compute the best
 hyperparameters. Are they the best only before the OMP extraction is
 applied?

---
 code/bolsonaro/data/dataset_loader.py    |  3 +-
 code/compute_results.py                  | 70 ++++++++++++++++++-
 code/train.py                            |  8 ++-
 .../iris/stage1/with_best_params_16.json | 36 ----------
 .../iris/stage1/wo_best_params_17.json   | 36 ----------
 5 files changed, 75 insertions(+), 78 deletions(-)
 delete mode 100644 experiments/iris/stage1/with_best_params_16.json
 delete mode 100644 experiments/iris/stage1/wo_best_params_17.json

diff --git a/code/bolsonaro/data/dataset_loader.py b/code/bolsonaro/data/dataset_loader.py
index 8ffbc76..6f85998 100644
--- a/code/bolsonaro/data/dataset_loader.py
+++ b/code/bolsonaro/data/dataset_loader.py
@@ -19,7 +19,8 @@ class DatasetLoader(object):
     DEFAULT_NORMALIZE_D = False
     DEFAULT_DATASET_NORMALIZER = 'standard'
     DEFAULT_FOREST_SIZE = 100
-    DEFAULT_EXTRACTED_FOREST_SIZE_SAMPLES = 4
+    DEFAULT_EXTRACTED_FOREST_SIZE_SAMPLES = 10
+    DEFAULT_EXTRACTED_FOREST_SIZE_STOP = 0.3
     DEFAULT_DEV_SIZE = 0.2
     DEFAULT_TEST_SIZE = 0.2
     DEFAULT_RANDOM_SEED_NUMBER = 1
diff --git a/code/compute_results.py b/code/compute_results.py
index 5e65a77..3ff7a66 100644
--- a/code/compute_results.py
+++ b/code/compute_results.py
@@ -24,13 +24,79 @@ if __name__ == "__main__":
     parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')

     args = parser.parse_args()

-    if int(args.stage_number) not in list(range(1, 5)):
+    if args.stage_number not in list(range(1, 5)):
         raise ValueError('stage_number must be a supported stage id (i.e. [1, 4]).')

     # Create recursively the results dir tree
     pathlib.Path(args.results_dir).mkdir(parents=True, exist_ok=True)
-
+    if args.stage_number == 1:
+        for experiment_id in args.experiment_ids:
+            experiment_id_path = args.models_dir + os.sep + str(experiment_id)  # models/{experiment_id}
+            # Create recursively the tree results/{experiment_id}
+            pathlib.Path(args.results_dir + os.sep + str(experiment_id)).mkdir(parents=True, exist_ok=True)
+            experiment_seed_root_path = experiment_id_path + os.sep + 'seeds'  # models/{experiment_id}/seeds
+
+            """
+            Dictionaries to temporarily store the scalar results with the following structure:
+            {seed_1: [score_1, ..., score_m], ...
+             seed_n: [score_1, ..., score_k]}
+            TODO: complete this to retrieve more results
+            """
+            experiment_train_scores = dict()
+            experiment_dev_scores = dict()
+            experiment_test_scores = dict()
+
+            # Used to check if all losses were computed using the same metric (it should be the case)
+            experiment_score_metrics = list()
+
+            # For each seed's results stored in models/{experiment_id}/seeds
+            for seed in os.listdir(experiment_seed_root_path):
+                experiment_seed_path = experiment_seed_root_path + os.sep + seed  # models/{experiment_id}/seeds/{seed}
+                dataset_parameters = DatasetParameters.load(experiment_seed_path, experiment_id)  # Load the dataset parameters of this experiment, with this specific seed
+                dataset = DatasetLoader.load(dataset_parameters)  # Load the dataset using the previously loaded dataset parameters
+                extracted_forest_size_root_path = experiment_seed_path + os.sep + 'extracted_forest_size'  # models/{experiment_id}/seeds/{seed}/extracted_forest_size
+
+                # {{seed}:[]}
+                experiment_train_scores[seed] = list()
+                experiment_dev_scores[seed] = list()
+                experiment_test_scores[seed] = list()
+
+                # List the forest sizes in models/{experiment_id}/seeds/{seed}/extracted_forest_size
+                extracted_forest_sizes = os.listdir(extracted_forest_size_root_path)
+                extracted_forest_sizes.sort(key=int)
+                for extracted_forest_size in extracted_forest_sizes:
+                    # models/{experiment_id}/seeds/{seed}/extracted_forest_size/{extracted_forest_size}
+                    extracted_forest_size_path = extracted_forest_size_root_path + os.sep + extracted_forest_size
+                    # Load the models/{experiment_id}/seeds/{seed}/extracted_forest_size/{extracted_forest_size}/model_raw_results.pickle file
+                    model_raw_results = ModelRawResults.load(extracted_forest_size_path)
+                    # Temporarily save some raw results (TODO: complete this to retrieve more results)
+                    # Save the scores
+                    experiment_train_scores[seed].append(model_raw_results.train_score)
+                    experiment_dev_scores[seed].append(model_raw_results.dev_score)
+                    experiment_test_scores[seed].append(model_raw_results.test_score)
+                    # Save the metric
+                    experiment_score_metrics.append(model_raw_results.score_metric)
+
+            if len(set(experiment_score_metrics)) > 1:
+                raise ValueError("The metrics used to compute the dev score aren't the same every time")
+
+            """
+            Example of plot that just plots the losses computed
+            on the train, dev and test subsets using a trained
+            model, with the CI, and depending on the extracted
+            forest size.
+ """ + Plotter.plot_losses( + file_path=args.results_dir + os.sep + str(experiment_id) + os.sep + 'losses.png', + all_experiment_scores=[experiment_train_scores, experiment_dev_scores, experiment_test_scores], + x_value=extracted_forest_sizes, + xlabel='Number of trees extracted', + ylabel=experiment_score_metrics[0], + all_labels=['train', 'dev', 'test'], + title='Loss values of the trained model' + ) + else: + raise ValueError('This stage number is not supported yet, but it will be!') """ TODO: diff --git a/code/train.py b/code/train.py index 13216ac..81d430e 100644 --- a/code/train.py +++ b/code/train.py @@ -97,6 +97,7 @@ if __name__ == "__main__": parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=DatasetLoader.DEFAULT_DATASET_NORMALIZER, help='Specify which dataset normalizer use (either standard, minmax, robust or normalizer).') parser.add_argument('--forest_size', nargs='?', type=int, default=None, help='The number of trees of the random forest.') parser.add_argument('--extracted_forest_size_samples', nargs='?', type=int, default=DatasetLoader.DEFAULT_EXTRACTED_FOREST_SIZE_SAMPLES, help='The number of extracted forest sizes (proportional to the forest size) selected by OMP.') + parser.add_argument('--extracted_forest_size_stop', nargs='?', type=float, default=DatasetLoader.DEFAULT_EXTRACTED_FOREST_SIZE_STOP, help='Specify the upper bound of the extracted forest sizes linspace.') parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.') parser.add_argument('--dev_size', nargs='?', type=float, default=DatasetLoader.DEFAULT_DEV_SIZE, help='Dev subset ratio.') parser.add_argument('--test_size', nargs='?', type=float, default=DatasetLoader.DEFAULT_TEST_SIZE, help='Test subset ratio.') @@ -142,9 +143,10 @@ if __name__ == "__main__": hyperparameters['n_estimators'] = parameters['forest_size'] # The number of tree to extract from forest (K) - parameters['extracted_forest_size'] = [int(hyperparameters['n_estimators'] * coeff) \ - for coeff in np.linspace(0, 1, parameters['extracted_forest_size_samples'] + 1, - endpoint=False)[1:]] + parameters['extracted_forest_size'] = (hyperparameters['n_estimators'] * + np.linspace(0, args.extracted_forest_size_stop, + parameters['extracted_forest_size_samples'] + 1, + endpoint=False)[1:]).astype(np.int).tolist() if parameters['seeds'] != None and parameters['random_seed_number'] > 1: logger.warning('seeds and random_seed_number parameters are both specified. 
diff --git a/experiments/iris/stage1/with_best_params_16.json b/experiments/iris/stage1/with_best_params_16.json
deleted file mode 100644
index 102999a..0000000
--- a/experiments/iris/stage1/with_best_params_16.json
+++ /dev/null
@@ -1,36 +0,0 @@
-{
-    "experiment_configuration": null,
-    "experiment_configuration_path": "experiments",
-    "dataset_name": "iris",
-    "normalize_D": false,
-    "dataset_normalizer": "standard",
-    "forest_size": null,
-    "extracted_forest_size_samples": 4,
-    "models_dir": ".\\models",
-    "dev_size": 0.2,
-    "test_size": 0.2,
-    "random_seed_number": 1,
-    "seeds": [
-        1,
-        2,
-        3,
-        4,
-        5
-    ],
-    "subsets_used": "train,dev",
-    "normalize_weights": false,
-    "verbose": false,
-    "skip_best_hyperparams": false,
-    "save_experiment_configuration": [
-        "1",
-        "with_best_params"
-    ],
-    "job_number": -1,
-    "extracted_forest_size": [
-        200,
-        400,
-        600,
-        800
-    ],
-    "experiment_id": 16
-}
\ No newline at end of file
diff --git a/experiments/iris/stage1/wo_best_params_17.json b/experiments/iris/stage1/wo_best_params_17.json
deleted file mode 100644
index 0294d64..0000000
--- a/experiments/iris/stage1/wo_best_params_17.json
+++ /dev/null
@@ -1,36 +0,0 @@
-{
-    "experiment_configuration": null,
-    "experiment_configuration_path": "experiments",
-    "dataset_name": "iris",
-    "normalize_D": false,
-    "dataset_normalizer": "standard",
-    "forest_size": null,
-    "extracted_forest_size_samples": 4,
-    "models_dir": ".\\models",
-    "dev_size": 0.2,
-    "test_size": 0.2,
-    "random_seed_number": 1,
-    "seeds": [
-        1,
-        2,
-        3,
-        4,
-        5
-    ],
-    "subsets_used": "train,dev",
-    "normalize_weights": false,
-    "verbose": false,
-    "skip_best_hyperparams": true,
-    "save_experiment_configuration": [
-        "1",
-        "wo_best_params"
-    ],
-    "job_number": -1,
-    "extracted_forest_size": [
-        20,
-        40,
-        60,
-        80
-    ],
-    "experiment_id": 17
-}
\ No newline at end of file
--
GitLab
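
Note (not part of the patch): the key change in code/train.py replaces the fixed linspace over (0, 1) with one whose upper bound is controlled by the new --extracted_forest_size_stop argument. Below is a minimal standalone sketch of that schedule so its output can be inspected; the function name and the 1000-tree example are illustrative only, and the diff's np.int is spelled int here because that alias has since been removed from NumPy.

    # extracted_forest_size_sketch.py -- illustrative only, not part of the repository.
    import numpy as np

    def extracted_forest_sizes(n_estimators, size_samples=10, size_stop=0.3):
        """Mirror of the updated computation in code/train.py: `size_samples` evenly
        spaced fractions of the forest size, strictly between 0 and `size_stop`
        (the 0 endpoint is dropped), truncated to integers. Defaults follow the
        new DatasetLoader constants."""
        coefficients = np.linspace(0, size_stop, size_samples + 1, endpoint=False)[1:]
        return (n_estimators * coefficients).astype(int).tolist()

    if __name__ == '__main__':
        # Hypothetical 1000-tree forest, purely to show the shape of the schedule:
        print(extracted_forest_sizes(1000))
        # -> [27, 54, 81, 109, 136, 163, 190, 218, 245, 272]

For comparison, the old computation (linspace over (0, 1) with extracted_forest_size_samples = 4) yields fractions [0.2, 0.4, 0.6, 0.8], which is consistent with the extracted_forest_size lists in the two deleted stage1 configs: [200, 400, 600, 800] when the best hyperparameters are used (n_estimators apparently 1000) and [20, 40, 60, 80] with the default forest size of 100 when they are skipped.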