Resolve "Experiment pipeline"

Merged. Charly Lamothe requested to merge 12-experiment-pipeline into master.
from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.dataset_loader import DatasetLoader
from bolsonaro.models.model_raw_results import ModelRawResults
from bolsonaro.models.model_factory import ModelFactory
from bolsonaro.visualization.plotter import Plotter
from bolsonaro import LOG_PATH
from bolsonaro.error_handling.logger_factory import LoggerFactory
import argparse
import pathlib
from dotenv import find_dotenv, load_dotenv
import os
if __name__ == "__main__":
# get environment variables in .env
load_dotenv(find_dotenv('.env'))
DEFAULT_RESULTS_DIR = os.environ["project_dir"] + os.sep + 'results'
DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models'
DEFAULT_EXPERIMENT_IDS = None
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--results_dir', nargs='?', type=str, default=DEFAULT_RESULTS_DIR, help='The output directory of the results.')
parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
parser.add_argument('--experiment_ids', nargs='+', type=int, default=DEFAULT_EXPERIMENT_IDS, help='Compute the results of the specified experiment id(s)')
args = parser.parse_args()
# Create recursively the results dir tree
pathlib.Path(args.results_dir).mkdir(parents=True, exist_ok=True)
"""
Use specified list of experiments ids if availabe.
Otherwise, list all existing experiment ids from
the specified models directory.
"""
experiments_ids = [str(experiment_id) for experiment_id in args.experiment_ids] \
if args.experiment_ids is not None \
else os.listdir(args.models_dir)
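# Expected on-disk layout traversed by the helpers below (paths taken from the comments
# in this file; the seed and size values are only illustrative):
#
#   models/{experiment_id}/seeds/{seed}/extracted_forest_sizes/{extracted_forest_size}/
#   models/{experiment_id}/seeds/{seed}/forest_size/{forest_size}/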
def retreive_extracted_forest_sizes_number(models_dir, experiment_id):
experiment_id_path = models_dir + os.sep + str(experiment_id) # models/{experiment_id}
experiment_seed_root_path = experiment_id_path + os.sep + 'seeds' # models/{experiment_id}/seeds
seed = os.listdir(experiment_seed_root_path)[0]
experiment_seed_path = experiment_seed_root_path + os.sep + seed # models/{experiment_id}/seeds/{seed}
extracted_forest_sizes_root_path = experiment_seed_path + os.sep + 'extracted_forest_sizes'
return len(os.listdir(extracted_forest_sizes_root_path))
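# Illustrative usage sketch (hypothetical ids and layout): if models/2/seeds/0/extracted_forest_sizes/
# contains the subdirectories 10, 20 and 30, the call below would return 3.
#
#   retreive_extracted_forest_sizes_number('models', 2)  # -> 3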
"""
Raise an error if there's no experiments ids found both
in parameter or in models directory.
"""
if experiments_ids is None or len(experiments_ids) == 0:
raise ValueError("No experiment id was found or specified.")
# Compute the plots for each experiment id
for experiment_id in experiments_ids:
experiment_id_path = args.models_dir + os.sep + experiment_id # models/{experiment_id}
# Create recursively the tree results/{experiment_id}
pathlib.Path(args.results_dir + os.sep + experiment_id).mkdir(parents=True, exist_ok=True)
def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_dir, experiment_id):
experiment_id_path = models_dir + os.sep + str(experiment_id) # models/{experiment_id}
experiment_seed_root_path = experiment_id_path + os.sep + 'seeds' # models/{experiment_id}/seeds
"""
Dictionaries to temporarly store the scalar results with the following structure:
{seed_1: [score_1, ..., score_m], ... seed_n: [score_1, ..., score_k]}
TODO: to complete to retreive more results
"""
experiment_train_scores = dict()
experiment_dev_scores = dict()
experiment_test_scores = dict()
experiment_weights = dict()
all_extracted_forest_sizes = list()
# Used to check if all losses were computed using the same metric (it should be the case)
experiment_score_metrics = list()
# For each seed's results stored in models/{experiment_id}/seeds
seeds = os.listdir(experiment_seed_root_path)
seeds.sort(key=int)
for seed in seeds:
experiment_seed_path = experiment_seed_root_path + os.sep + seed # models/{experiment_id}/seeds/{seed}
dataset_parameters = DatasetParameters.load(experiment_seed_path, experiment_id) # Load the dataset parameters of this experiment, with this specific seed
dataset = DatasetLoader.load(dataset_parameters) # Load the dataset using the previously loaded dataset parameters
extracted_forest_sizes_root_path = experiment_seed_path + os.sep + 'extracted_forest_sizes' # models/{experiment_id}/seeds/{seed}/extracted_forest_sizes
# {{seed}:[]}
experiment_train_scores[seed] = list()
experiment_dev_scores[seed] = list()
experiment_test_scores[seed] = list()
experiment_weights[seed] = list()
# List the forest sizes in models/{experiment_id}/seeds/{seed}/extracted_forest_sizes
extracted_forest_sizes = os.listdir(extracted_forest_sizes_root_path)
extracted_forest_sizes.sort(key=int)
all_extracted_forest_sizes.append(list(map(int, extracted_forest_sizes)))
for extracted_forest_size in extracted_forest_sizes:
# models/{experiment_id}/seeds/{seed}/extracted_forest_sizes/{extracted_forest_size}
extracted_forest_size_path = extracted_forest_sizes_root_path + os.sep + extracted_forest_size
# Load models/{experiment_id}/seeds/{seed}/extracted_forest_sizes/{extracted_forest_size}/model_raw_results.pickle file
model_raw_results = ModelRawResults.load(extracted_forest_size_path)
# Load [...]/model_parameters.json file and build the model using these parameters and the weights and forest from model_raw_results.pickle
model = ModelFactory.load(dataset.task, extracted_forest_size_path, experiment_id, model_raw_results)
# Save the scores
experiment_train_scores[seed].append(model_raw_results.train_score)
experiment_dev_scores[seed].append(model_raw_results.dev_score)
experiment_test_scores[seed].append(model_raw_results.test_score)
# Save the weights
experiment_weights[seed].append(model_raw_results.weights)
# Save the metric
experiment_score_metrics.append(model_raw_results.score_metric)
# Sanity checks
if len(set(experiment_score_metrics)) > 1:
raise ValueError("The metrics used to compute the dev score aren't the same everytime")
raise ValueError("The metrics used to compute the scores aren't the sames across seeds.")
if len(set([sum(extracted_forest_sizes) for extracted_forest_sizes in all_extracted_forest_sizes])) != 1:
raise ValueError("The extracted forest sizes aren't the sames across seeds.")
return experiment_train_scores, experiment_dev_scores, experiment_test_scores, all_extracted_forest_sizes[0], experiment_score_metrics[0]
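# Minimal sketch (not part of this MR): one way the per-seed score dictionaries returned above
# could be aggregated into mean/std curves for plotting. Assumes numpy is available and that
# every seed holds the same number of scores; the helper name is hypothetical and unused below.
import numpy as np

def aggregate_scores_across_seeds(experiment_scores):
    # experiment_scores: {seed: [score_1, ..., score_k]} with the same k for every seed
    score_matrix = np.array(list(experiment_scores.values()))  # shape: (n_seeds, k)
    return score_matrix.mean(axis=0), score_matrix.std(axis=0)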
def extract_scores_across_seeds_and_forest_size(models_dir, results_dir, experiment_id, extracted_forest_sizes_number):
experiment_id_path = models_dir + os.sep + str(experiment_id) # models/{experiment_id}
experiment_seed_root_path = experiment_id_path + os.sep + 'seeds' # models/{experiment_id}/seeds
"""
Example of plot that just plots the losses computed
on the train, dev and test subsets using a trained
model, with the CI, and depending on the extracted
forest size.
Dictionaries to temporarly store the scalar results with the following structure:
{seed_1: [score_1, ..., score_m], ... seed_n: [score_1, ..., score_k]}
"""
Plotter.plot_losses(
file_path=args.results_dir + os.sep + experiment_id + os.sep + 'losses.png',
all_experiment_scores=[experiment_train_scores, experiment_dev_scores, experiment_test_scores],
x_value=extracted_forest_sizes,
experiment_train_scores = dict()
experiment_dev_scores = dict()
experiment_test_scores = dict()
# Used to check if all losses were computed using the same metric (it should be the case)
experiment_score_metrics = list()
# For each seed's results stored in models/{experiment_id}/seeds
seeds = os.listdir(experiment_seed_root_path)
seeds.sort(key=int)
for seed in seeds:
experiment_seed_path = experiment_seed_root_path + os.sep + seed # models/{experiment_id}/seeds/{seed}
forest_size_root_path = experiment_seed_path + os.sep + 'forest_size' # models/{experiment_id}/seeds/{seed}/forest_size
# {{seed}:[]}
experiment_train_scores[seed] = list()
experiment_dev_scores[seed] = list()
experiment_test_scores[seed] = list()
forest_size = os.listdir(forest_size_root_path)[0]
# models/{experiment_id}/seeds/{seed}/forest_size/{forest_size}
forest_size_path = forest_size_root_path + os.sep + forest_size
# Load models/{experiment_id}/seeds/{seed}/forest_size/{forest_size}/model_raw_results.pickle file
model_raw_results = ModelRawResults.load(forest_size_path)
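# The base forest has a single size, so its scores are repeated below to span the same
# x-axis as the extracted-forest-size experiments.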
for _ in range(extracted_forest_sizes_number):
# Save the scores
experiment_train_scores[seed].append(model_raw_results.train_score)
experiment_dev_scores[seed].append(model_raw_results.dev_score)
experiment_test_scores[seed].append(model_raw_results.test_score)
# Save the metric
experiment_score_metrics.append(model_raw_results.score_metric)
if len(set(experiment_score_metrics)) > 1:
raise ValueError("The metrics used to compute the scores aren't the same everytime")
return experiment_train_scores, experiment_dev_scores, experiment_test_scores, experiment_score_metrics[0]
if __name__ == "__main__":
# get environment variables in .env
load_dotenv(find_dotenv('.env'))
DEFAULT_RESULTS_DIR = os.environ["project_dir"] + os.sep + 'results'
DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models'
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--stage', nargs='?', type=int, required=True, help='Specify the stage number, in [1, 5].')
parser.add_argument('--experiment_ids', nargs='+', type=int, required=True, help='Compute the results of the specified experiment id(s).' + \
' stage=1: {{base_with_params}} {{random_with_params}} {{omp_with_params}} {{base_wo_params}} {{random_wo_params}} {{omp_wo_params}}' + \
' stage=2: {{no_normalization}} {{normalize_D}} {{normalize_weights}} {{normalize_D_and_weights}}' + \
' stage=3: {{train-dev_subset}} {{train-dev_train-dev_subset}} {{train-train-dev_subset}}')
parser.add_argument('--dataset_name', nargs='?', type=str, required=True, help='Specify the dataset name. TODO: read it from models dir directly.')
parser.add_argument('--results_dir', nargs='?', type=str, default=DEFAULT_RESULTS_DIR, help='The output directory of the results.')
parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
args = parser.parse_args()
if args.stage not in list(range(1, 6)):
raise ValueError('stage must be a supported stage id (i.e. [1, 5]).')
logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))
logger.info('Compute results with stage:{} - experiment_ids:{} - dataset_name:{} - results_dir:{} - models_dir:{}'.format(
args.stage, args.experiment_ids, args.dataset_name, args.results_dir, args.models_dir))
# Create recursively the results dir tree
pathlib.Path(args.results_dir).mkdir(parents=True, exist_ok=True)
if args.stage == 1:
if len(args.experiment_ids) != 6:
raise ValueError('In the case of stage 1, the number of specified experiment ids must be 6.')
# Retrieve the number of extracted forest sizes used, so that the base forest axis is as long as necessary
extracted_forest_sizes_number = retreive_extracted_forest_sizes_number(args.models_dir, args.experiment_ids[1])
# Experiments that used the best hyperparameters found for this dataset
# base_with_params
logger.info('Loading base_with_params experiment scores...')
base_with_params_train_scores, base_with_params_dev_scores, base_with_params_test_scores, \
base_with_params_experiment_score_metric = \
extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, args.experiment_ids[0],
extracted_forest_sizes_number)
# random_with_params
logger.info('Loading random_with_params experiment scores...')
random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores, \
with_params_extracted_forest_sizes, random_with_params_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[1])
# omp_with_params
logger.info('Loading omp_with_params experiment scores...')
omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \
omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
args.models_dir, args.results_dir, args.experiment_ids[2])
# Experiments that didn't use the best hyperparameters found for this dataset
# base_wo_params
logger.info('Loading base_wo_params experiment scores...')
base_wo_params_train_scores, base_wo_params_dev_scores, base_wo_params_test_scores, \
base_wo_params_experiment_score_metric = extract_scores_across_seeds_and_forest_size(
args.models_dir, args.results_dir, args.experiment_ids[3],
extracted_forest_sizes_number)
# random_wo_params
logger.info('Loading random_wo_params experiment scores...')
random_wo_params_train_scores, random_wo_params_dev_scores, random_wo_params_test_scores, \
wo_params_extracted_forest_sizes, random_wo_params_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(
args.models_dir, args.results_dir, args.experiment_ids[4])
# omp_wo_params
logger.info('Loading omp_wo_params experiment scores...')
omp_wo_params_train_scores, omp_wo_params_dev_scores, omp_wo_params_test_scores, _, \
omp_wo_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
args.models_dir, args.results_dir, args.experiment_ids[5])
# Sanity check on the retrieved metrics
if not (base_with_params_experiment_score_metric == random_with_params_experiment_score_metric ==
omp_with_params_experiment_score_metric == base_wo_params_experiment_score_metric ==
random_wo_params_experiment_score_metric ==
omp_wo_params_experiment_score_metric):
raise ValueError('Score metrics of all experiments must be the same.')
experiments_score_metric = base_with_params_experiment_score_metric
output_path = os.path.join(args.results_dir, args.dataset_name, 'stage1')
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
"""all_experiment_scores_with_params=[base_with_params_train_scores, base_with_params_dev_scores, base_with_params_test_scores,
random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores,
omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores],
all_experiment_scores_wo_params=[base_wo_params_train_scores, base_wo_params_dev_scores, base_wo_params_test_scores,
random_wo_params_train_scores, random_wo_params_dev_scores, random_wo_params_test_scores,
omp_wo_params_train_scores, omp_wo_params_dev_scores, omp_wo_params_test_scores],
all_labels=['base_with_params_train', 'base_with_params_dev', 'base_with_params_test',
'random_with_params_train', 'random_with_params_dev', 'random_with_params_test',
'omp_with_params_train', 'omp_with_params_dev', 'omp_with_params_test'],"""
Plotter.plot_stage1_losses(
file_path=output_path + os.sep + 'losses.png',
all_experiment_scores_with_params=[base_with_params_test_scores,
random_with_params_test_scores,
omp_with_params_test_scores],
all_experiment_scores_wo_params=[base_wo_params_test_scores,
random_wo_params_test_scores,
omp_wo_params_test_scores],
all_labels=['base', 'random', 'omp'],
x_value=with_params_extracted_forest_sizes,
xlabel='Number of trees extracted',
ylabel=experiments_score_metric,
title='Loss values of {}\nusing best and default hyperparameters'.format(args.dataset_name)
)
elif args.stage == 2:
if len(args.experiment_ids) != 4:
raise ValueError('In the case of stage 2, the number of specified experiment ids must be 4.')
# no_normalization
logger.info('Loading no_normalization experiment scores...')
_, _, no_normalization_test_scores, extracted_forest_sizes, no_normalization_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
args.experiment_ids[0])
# normalize_D
logger.info('Loading normalize_D experiment scores...')
_, _, normalize_D_test_scores, _, normalize_D_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
args.experiment_ids[1])
# normalize_weights
logger.info('Loading normalize_weights experiment scores...')
_, _, normalize_weights_test_scores, _, normalize_weights_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
args.experiment_ids[2])
# normalize_D_and_weights
logger.info('Loading normalize_D_and_weights experiment scores...')
_, _, normalize_D_and_weights_test_scores, _, normalize_D_and_weights_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
args.experiment_ids[3])
# Sanity check on the retrieved metrics
if not (no_normalization_experiment_score_metric == normalize_D_experiment_score_metric
== normalize_weights_experiment_score_metric == normalize_D_and_weights_experiment_score_metric):
raise ValueError('Score metrics of all experiments must be the same.')
experiments_score_metric = no_normalization_experiment_score_metric
output_path = os.path.join(args.results_dir, args.dataset_name, 'stage2')
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
Plotter.plot_stage2_losses(
file_path=output_path + os.sep + 'losses.png',
all_experiment_scores=[no_normalization_test_scores, normalize_D_test_scores,
normalize_weights_test_scores, normalize_D_and_weights_test_scores],
all_labels=['no_normalization', 'normalize_D', 'normalize_weights', 'normalize_D_and_weights'],
x_value=extracted_forest_sizes,
xlabel='Number of trees extracted',
ylabel=experiments_score_metric,
title='Loss values of {}\nusing different normalizations'.format(args.dataset_name))
elif args.stage == 3:
if len(args.experiment_ids) != 3:
raise ValueError('In the case of stage 3, the number of specified experiment ids must be 3.')
# train-dev_subset
logger.info('Loading train-dev_subset experiment scores...')
train_dev_subset_train_scores, train_dev_subset_dev_scores, train_dev_subset_test_scores, \
extracted_forest_sizes, train_dev_subset_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
args.experiment_ids[0])
# train-dev_train-dev_subset
logger.info('Loading train-dev_train-dev_subset experiment scores...')
train_dev_train_dev_subset_train_scores, train_dev_train_dev_subset_dev_scores, train_dev_train_dev_subset_test_scores, \
_, train_dev_train_dev_subset_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
args.experiment_ids[1])
# train-train-dev_subset
logger.info('Loading train-train-dev_subset experiment scores...')
train_train_dev_subset_train_scores, train_train_dev_subset_dev_scores, train_train_dev_subset_test_scores, \
_, train_train_dev_subset_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
args.experiment_ids[2])
# Sanity check on the retrieved metrics
if not (train_dev_subset_experiment_score_metric == train_dev_train_dev_subset_experiment_score_metric
== train_train_dev_subset_experiment_score_metric):
raise ValueError('Score metrics of all experiments must be the same.')
experiments_score_metric = train_dev_subset_experiment_score_metric
output_path = os.path.join(args.results_dir, args.dataset_name, 'stage3')
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
Plotter.plot_stage2_losses(
file_path=output_path + os.sep + 'losses.png',
all_experiment_scores=[train_dev_subset_test_scores, train_dev_train_dev_subset_test_scores,
train_train_dev_subset_test_scores],
all_labels=['train,dev', 'train+dev,train+dev', 'train,train+dev'],
x_value=extracted_forest_sizes,
xlabel='Number of trees extracted',
ylabel=experiments_score_metric,
title='Loss values of {}\nusing different training subsets'.format(args.dataset_name))
"""Plotter.plot_stage2_losses(
file_path=output_path + os.sep + 'losses.png',
all_experiment_scores=[train_dev_subset_train_scores, train_train_dev_subset_train_scores,
train_train_dev_subset_train_scores, train_dev_subset_dev_scores, train_dev_train_dev_subset_dev_scores,
train_train_dev_subset_dev_scores, train_dev_subset_test_scores, train_dev_train_dev_subset_test_scores,
train_train_dev_subset_test_scores],
all_labels=['train,dev - train', 'train+dev,train+dev - train', 'train,train+dev - train',
'train,dev - dev', 'train+dev,train+dev - dev', 'train,train+dev - dev',
'train,dev - test', 'train+dev,train+dev - test', 'train,train+dev - test'],
x_value=extracted_forest_sizes,
xlabel='Number of trees extracted',
ylabel=experiments_score_metric,
title='Loss values of {}\nusing different training subsets'.format(args.dataset_name))"""
elif args.stage == 4:
if len(args.experiment_ids) != 3:
raise ValueError('In the case of stage 4, the number of specified experiment ids must be 3.')
# Retrieve the number of extracted forest sizes used, so that the base forest axis is as long as necessary
extracted_forest_sizes_number = retreive_extracted_forest_sizes_number(args.models_dir, args.experiment_ids[1])
# base_with_params
logger.info('Loading base_with_params experiment scores...')
base_with_params_train_scores, base_with_params_dev_scores, base_with_params_test_scores, \
base_with_params_experiment_score_metric = \
extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, args.experiment_ids[0],
extracted_forest_sizes_number)
# random_with_params
logger.info('Loading random_with_params experiment scores...')
random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores, \
with_params_extracted_forest_sizes, random_with_params_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[1])
# omp_with_params
logger.info('Loading omp_with_params experiment scores...')
omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \
omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
args.models_dir, args.results_dir, args.experiment_ids[2])
"""# base_with_params
logger.info('Loading base_with_params experiment scores 2...')
_, _, base_with_params_test_scores_2, \
_ = \
extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, args.experiment_ids[3],
extracted_forest_sizes_number)
# random_with_params
logger.info('Loading random_with_params experiment scores 2...')
_, _, random_with_params_test_scores_2, \
_, _ = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[4])"""
# Sanity check on the retrieved metrics
if not (base_with_params_experiment_score_metric == random_with_params_experiment_score_metric
== omp_with_params_experiment_score_metric):
raise ValueError('Score metrics of all experiments must be the same.')
experiments_score_metric = base_with_params_experiment_score_metric
output_path = os.path.join(args.results_dir, args.dataset_name, 'stage4')
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
Plotter.plot_stage2_losses(
file_path=output_path + os.sep + 'losses.png',
all_experiment_scores=[base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores],
all_labels=['base', 'random', 'omp'],
x_value=with_params_extracted_forest_sizes,
xlabel='Number of trees extracted',
ylabel=experiments_score_metric,
title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name))
else:
raise ValueError('This stage number is not supported yet, but it will be!')
logger.info('Done.')
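# Example invocation for stage 1 (script name and experiment ids are illustrative; stage 1 expects exactly 6 ids):
#   python compute_results.py --stage 1 --experiment_ids 1 2 3 4 5 6 --dataset_name california_housing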
"""
TODO:
For each dataset:
Stage 1) [DONE for california_housing] A figure for the selection of the best base forest model hyperparameters (best vs default/random hyperparams)
Stage 2) [DONE for california_housing] A figure for the selection of the best combination of normalization: D normalization vs weights normalization (4 combinations)
Stage 3) [DONE for california_housing] A figure for the selection of the most relevant subsets combination: train,dev vs train+dev,train+dev vs train,train+dev
Stage 4) A figure to finally compare the performance of our approach, using the previously selected
parameters, against the baseline and other papers, with different extracted forest sizes
(as a percentage of the forest size found in the best hyperparameters search) on the abscissa.

IMPORTANT: Compare experiments that used the same seeds among them (except for stage 1).
"""