Commit fd6dbc7b authored by Charly Lamothe

POC of a possibly wrong way to compute the best hyperparameters. Are they the best only before the application of OMP extraction?
parent 880ff78f
Merge request !9: Resolve "Experiment pipeline"
@@ -19,7 +19,8 @@ class DatasetLoader(object):
DEFAULT_NORMALIZE_D = False
DEFAULT_DATASET_NORMALIZER = 'standard'
DEFAULT_FOREST_SIZE = 100
-DEFAULT_EXTRACTED_FOREST_SIZE_SAMPLES = 4
+DEFAULT_EXTRACTED_FOREST_SIZE_SAMPLES = 10
+DEFAULT_EXTRACTED_FOREST_SIZE_STOP = 0.3
DEFAULT_DEV_SIZE = 0.2
DEFAULT_TEST_SIZE = 0.2
DEFAULT_RANDOM_SEED_NUMBER = 1
@@ -24,13 +24,79 @@ if __name__ == "__main__":
parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
args = parser.parse_args()
-if int(args.stage_number) not in list(range(1, 5)):
+if args.stage_number not in list(range(1, 5)):
raise ValueError('stage_number must be a supported stage id (i.e. [1, 4]).')
# Recursively create the results directory tree
pathlib.Path(args.results_dir).mkdir(parents=True, exist_ok=True)
if args.stage_number == 1:
for experiment_id in args.experiment_ids:
experiment_id_path = args.models_dir + os.sep + str(experiment_id) # models/{experiment_id}
# Recursively create the results/{experiment_id} tree
pathlib.Path(args.results_dir + os.sep + str(experiment_id)).mkdir(parents=True, exist_ok=True)
experiment_seed_root_path = experiment_id_path + os.sep + 'seeds' # models/{experiment_id}/seeds
"""
Dictionaries to temporarily store the scalar results with the following structure:
    {seed_1: [score_1, ..., score_m], ..., seed_n: [score_1, ..., score_m]}
TODO: to complete to retrieve more results
"""
experiment_train_scores = dict()
experiment_dev_scores = dict()
experiment_test_scores = dict()
# Used to check if all losses were computed using the same metric (it should be the case)
experiment_score_metrics = list()
# For each seed whose results are stored in models/{experiment_id}/seeds
for seed in os.listdir(experiment_seed_root_path):
experiment_seed_path = experiment_seed_root_path + os.sep + seed # models/{experiment_id}/seeds/{seed}
dataset_parameters = DatasetParameters.load(experiment_seed_path, experiment_id) # Load the dataset parameters of this experiment, with this specific seed
dataset = DatasetLoader.load(dataset_parameters) # Load the dataset using the previously loaded dataset parameters
extracted_forest_size_root_path = experiment_seed_path + os.sep + 'extracted_forest_size' # models/{experiment_id}/seeds/{seed}/extracted_forest_size
# Initialize the {seed: []} entries for this seed's scores
experiment_train_scores[seed] = list()
experiment_dev_scores[seed] = list()
experiment_test_scores[seed] = list()
# List the forest sizes in models/{experiment_id}/seeds/{seed}/extracted_forest_size
extracted_forest_sizes = os.listdir(extracted_forest_size_root_path)
extracted_forest_sizes.sort(key=int)
for extracted_forest_size in extracted_forest_sizes:
# models/{experiment_id}/seeds/{seed}/extracted_forest_size/{extracted_forest_size}
extracted_forest_size_path = extracted_forest_size_root_path + os.sep + extracted_forest_size
# Load models/{experiment_id}/seeds/{seed}/extracted_forest_size/{extracted_forest_size}/model_raw_results.pickle file
model_raw_results = ModelRawResults.load(extracted_forest_size_path)
# Temporarily save some raw results (TODO: to complete to retrieve more results)
# Save the scores
experiment_train_scores[seed].append(model_raw_results.train_score)
experiment_dev_scores[seed].append(model_raw_results.dev_score)
experiment_test_scores[seed].append(model_raw_results.test_score)
# Save the metric
experiment_score_metrics.append(model_raw_results.score_metric)
if len(set(experiment_score_metrics)) > 1:
raise ValueError("The metrics used to compute the dev score aren't the same everytime")
"""
Example of a plot that shows the losses computed
on the train, dev and test subsets using a trained
model, with confidence intervals, as a function
of the extracted forest size.
"""
Plotter.plot_losses(
file_path=args.results_dir + os.sep + str(experiment_id) + os.sep + 'losses.png',
all_experiment_scores=[experiment_train_scores, experiment_dev_scores, experiment_test_scores],
x_value=extracted_forest_sizes,
xlabel='Number of trees extracted',
ylabel=experiment_score_metrics[0],
all_labels=['train', 'dev', 'test'],
title='Loss values of the trained model'
)
else:
raise ValueError('This stage number is not supported yet, but it will be!')
"""
TODO:
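Aside: each of the three dictionaries built above shares the shape {seed: [score_for_size_1, ..., score_for_size_m]}. The Plotter internals are not part of this diff, so the following is only a minimal, self-contained sketch of how such a structure can be collapsed into the mean and confidence-interval curves that a helper like Plotter.plot_losses would draw, under the assumption that it aggregates across seeds; the score values are hypothetical:

import numpy as np

# Hypothetical per-seed test scores for four extracted forest sizes,
# mirroring the shape of experiment_test_scores built in the loop above.
experiment_test_scores = {
    '1': [0.71, 0.78, 0.82, 0.84],
    '2': [0.69, 0.80, 0.81, 0.85],
    '3': [0.73, 0.77, 0.83, 0.83],
}

# Stack the per-seed lists into an (n_seeds, n_sizes) matrix.
scores = np.array(list(experiment_test_scores.values()))

mean = scores.mean(axis=0)                    # one point per forest size
std = scores.std(axis=0, ddof=1)              # sample standard deviation
ci95 = 1.96 * std / np.sqrt(scores.shape[0])  # normal-approximation 95% CI

print(mean)                       # central curve
print(mean - ci95, mean + ci95)   # lower and upper CI bands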
@@ -97,6 +97,7 @@ if __name__ == "__main__":
parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=DatasetLoader.DEFAULT_DATASET_NORMALIZER, help='Specify which dataset normalizer to use (either standard, minmax, robust or normalizer).')
parser.add_argument('--forest_size', nargs='?', type=int, default=None, help='The number of trees of the random forest.')
parser.add_argument('--extracted_forest_size_samples', nargs='?', type=int, default=DatasetLoader.DEFAULT_EXTRACTED_FOREST_SIZE_SAMPLES, help='The number of extracted forest sizes (proportional to the forest size) selected by OMP.')
+parser.add_argument('--extracted_forest_size_stop', nargs='?', type=float, default=DatasetLoader.DEFAULT_EXTRACTED_FOREST_SIZE_STOP, help='Specify the upper bound of the extracted forest sizes linspace.')
parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
parser.add_argument('--dev_size', nargs='?', type=float, default=DatasetLoader.DEFAULT_DEV_SIZE, help='Dev subset ratio.')
parser.add_argument('--test_size', nargs='?', type=float, default=DatasetLoader.DEFAULT_TEST_SIZE, help='Test subset ratio.')
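A hypothetical invocation exercising the new flag (the script name train.py is an assumption; only the flags themselves appear in this diff):

python train.py --extracted_forest_size_samples 10 --extracted_forest_size_stop 0.3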
@@ -142,9 +143,10 @@ if __name__ == "__main__":
hyperparameters['n_estimators'] = parameters['forest_size']
# The number of trees to extract from the forest (K)
-parameters['extracted_forest_size'] = [int(hyperparameters['n_estimators'] * coeff) \
-    for coeff in np.linspace(0, 1, parameters['extracted_forest_size_samples'] + 1,
-    endpoint=False)[1:]]
+parameters['extracted_forest_size'] = (hyperparameters['n_estimators'] *
+    np.linspace(0, args.extracted_forest_size_stop,
+    parameters['extracted_forest_size_samples'] + 1,
+    endpoint=False)[1:]).astype(np.int).tolist()
if parameters['seeds'] != None and parameters['random_seed_number'] > 1:
logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')
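For concreteness, a small sketch contrasting the two computations above, using the new defaults (10 samples, stop at 0.3) and a 100-tree forest. The deprecated np.int alias from the diff is replaced by the builtin int here so the snippet runs on current NumPy:

import numpy as np

n_estimators = 100  # DatasetLoader.DEFAULT_FOREST_SIZE
samples = 10        # DEFAULT_EXTRACTED_FOREST_SIZE_SAMPLES (new default)
stop = 0.3          # DEFAULT_EXTRACTED_FOREST_SIZE_STOP

# Old computation: coefficients evenly spread over (0, 1).
old_sizes = [int(n_estimators * coeff)
             for coeff in np.linspace(0, 1, samples + 1, endpoint=False)[1:]]
print(old_sizes)  # [9, 18, 27, 36, 45, 54, 63, 72, 81, 90]

# New computation: the upper bound is configurable via --extracted_forest_size_stop.
new_sizes = (n_estimators
             * np.linspace(0, stop, samples + 1, endpoint=False)[1:]
             ).astype(int).tolist()
print(new_sizes)  # [2, 5, 8, 10, 13, 16, 19, 21, 24, 27]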
{
"experiment_configuration": null,
"experiment_configuration_path": "experiments",
"dataset_name": "iris",
"normalize_D": false,
"dataset_normalizer": "standard",
"forest_size": null,
"extracted_forest_size_samples": 4,
"models_dir": ".\\models",
"dev_size": 0.2,
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
1,
2,
3,
4,
5
],
"subsets_used": "train,dev",
"normalize_weights": false,
"verbose": false,
"skip_best_hyperparams": false,
"save_experiment_configuration": [
"1",
"with_best_params"
],
"job_number": -1,
"extracted_forest_size": [
200,
400,
600,
800
],
"experiment_id": 16
}
\ No newline at end of file
{
"experiment_configuration": null,
"experiment_configuration_path": "experiments",
"dataset_name": "iris",
"normalize_D": false,
"dataset_normalizer": "standard",
"forest_size": null,
"extracted_forest_size_samples": 4,
"models_dir": ".\\models",
"dev_size": 0.2,
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
1,
2,
3,
4,
5
],
"subsets_used": "train,dev",
"normalize_weights": false,
"verbose": false,
"skip_best_hyperparams": true,
"save_experiment_configuration": [
"1",
"wo_best_params"
],
"job_number": -1,
"extracted_forest_size": [
20,
40,
60,
80
],
"experiment_id": 17
}
\ No newline at end of file
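The two configurations above were evidently generated with the old formula (upper bound fixed at 1): with extracted_forest_size_samples = 4 the coefficients are [0.2, 0.4, 0.6, 0.8], which reproduces both size lists if the forest size resolved to 1000 under the best hyperparameters and to the default 100 without them. The value 1000 is an inference from the numbers, not stated anywhere in the diff:

import numpy as np

coeffs = np.linspace(0, 1, 4 + 1, endpoint=False)[1:]  # [0.2, 0.4, 0.6, 0.8]

# Experiment 16 (with_best_params): forest size presumably tuned to 1000.
print([int(1000 * c) for c in coeffs])  # [200, 400, 600, 800]

# Experiment 17 (wo_best_params): default forest size of 100.
print([int(100 * c) for c in coeffs])   # [20, 40, 60, 80]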