Skip to content
Snippets Groups Projects
Commit 8de5e96a authored by Charly Lamothe's avatar Charly Lamothe
Browse files

- Add code for stages 2 and 3 results;

- Add command lines example for stage 3;
- Add experiment_id option that is useful sometimes;
- Fix subsets_used param;
- Remove experiment_id in config experiment file names;
- Add config experiment files for stages 2 and 3;
- Add results for stages 2 and 3 (california_housing).
parent 58061ea4
No related branches found
No related tags found
1 merge request!9Resolve "Experiment pipeline"
Showing
with 346 additions and 36 deletions
...@@ -59,6 +59,7 @@ class Plotter(object): ...@@ -59,6 +59,7 @@ class Plotter(object):
@staticmethod @staticmethod
def plot_stage1_losses(file_path, all_experiment_scores_with_params, def plot_stage1_losses(file_path, all_experiment_scores_with_params,
all_experiment_scores_wo_params, x_value, xlabel, ylabel, all_labels, title): all_experiment_scores_wo_params, x_value, xlabel, ylabel, all_labels, title):
fig, axes = plt.subplots(nrows=1, ncols=2, sharey=True) fig, axes = plt.subplots(nrows=1, ncols=2, sharey=True)
n = len(all_experiment_scores_with_params) n = len(all_experiment_scores_with_params)
...@@ -102,6 +103,46 @@ class Plotter(object): ...@@ -102,6 +103,46 @@ class Plotter(object):
fig.savefig(file_path, dpi=fig.dpi, bbox_extra_artists=(legend,), bbox_inches='tight') fig.savefig(file_path, dpi=fig.dpi, bbox_extra_artists=(legend,), bbox_inches='tight')
plt.close(fig) plt.close(fig)
@staticmethod
def plot_stage2_losses(file_path, all_experiment_scores, x_value,
    xlabel, ylabel, all_labels, title):
    """
    Plot the mean score curve of several experiments on a single axis,
    each with a +/- one standard deviation confidence band, then save
    the figure to `file_path`.

    :param file_path: Output image path of the saved figure.
    :param all_experiment_scores: List (one entry per curve) of dicts
        mapping a seed to its list of scores.
    :param x_value: Shared x-axis values.
    :param xlabel: Label of the x axis.
    :param ylabel: Label of the y axis.
    :param all_labels: One legend label per curve.
    :param title: Figure title.
    """
    fig, ax = plt.subplots()
    n = len(all_experiment_scores)
    # Get as many different colors from the cmap (nipy_spectral) as
    # there are curves to plot.
    colors = Plotter.get_colors_from_cmap(n)
    # For each curve to plot
    for i in range(n):
        # Retrieve the scores in a list for each seed
        experiment_scores = list(all_experiment_scores[i].values())
        # Compute the mean and the std for the CI
        mean_experiment_scores = np.average(experiment_scores, axis=0)
        std_experiment_scores = np.std(experiment_scores, axis=0)
        # Plot the score curve with the CI.
        # NOTE(review): lb/ub were previously swapped (lb got mean+std).
        # Harmless if plot_mean_and_CI only fills between them, but the
        # names now match their meaning: lb = lower bound, ub = upper.
        Plotter.plot_mean_and_CI(
            ax=ax,
            mean=mean_experiment_scores,
            lb=mean_experiment_scores - std_experiment_scores,
            ub=mean_experiment_scores + std_experiment_scores,
            x_value=x_value,
            color_mean=colors[i],
            facecolor=colors[i],
            label=all_labels[i]
        )
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend(loc='upper right')
    fig.savefig(file_path, dpi=fig.dpi, bbox_inches='tight')
    plt.close(fig)
@staticmethod @staticmethod
def get_colors_from_cmap(n_colors, colormap_name='nipy_spectral'): def get_colors_from_cmap(n_colors, colormap_name='nipy_spectral'):
return [plt.get_cmap(colormap_name)(1. * i/n_colors) for i in range(n_colors)] return [plt.get_cmap(colormap_name)(1. * i/n_colors) for i in range(n_colors)]
...@@ -59,7 +59,7 @@ def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_d ...@@ -59,7 +59,7 @@ def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_d
if len(set([sum(extracted_forest_sizes) for extracted_forest_sizes in all_extracted_forest_sizes])) != 1: if len(set([sum(extracted_forest_sizes) for extracted_forest_sizes in all_extracted_forest_sizes])) != 1:
raise ValueError("The extracted forest sizes aren't the sames across seeds.") raise ValueError("The extracted forest sizes aren't the sames across seeds.")
return experiment_train_scores, experiment_dev_scores, experiment_test_scores, all_extracted_forest_sizes[0] return experiment_train_scores, experiment_dev_scores, experiment_test_scores, all_extracted_forest_sizes[0], experiment_score_metrics[0]
def extract_scores_across_seeds_and_forest_size(models_dir, results_dir, experiment_id, extracted_forest_sizes_number): def extract_scores_across_seeds_and_forest_size(models_dir, results_dir, experiment_id, extracted_forest_sizes_number):
experiment_id_path = models_dir + os.sep + str(experiment_id) # models/{experiment_id} experiment_id_path = models_dir + os.sep + str(experiment_id) # models/{experiment_id}
...@@ -104,7 +104,7 @@ def extract_scores_across_seeds_and_forest_size(models_dir, results_dir, experim ...@@ -104,7 +104,7 @@ def extract_scores_across_seeds_and_forest_size(models_dir, results_dir, experim
if len(set(experiment_score_metrics)) > 1: if len(set(experiment_score_metrics)) > 1:
raise ValueError("The metrics used to compute the scores aren't the same everytime") raise ValueError("The metrics used to compute the scores aren't the same everytime")
return experiment_train_scores, experiment_dev_scores, experiment_test_scores return experiment_train_scores, experiment_dev_scores, experiment_test_scores, experiment_score_metrics[0]
if __name__ == "__main__": if __name__ == "__main__":
# get environment variables in .env # get environment variables in .env
...@@ -116,7 +116,9 @@ if __name__ == "__main__": ...@@ -116,7 +116,9 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--stage', nargs='?', type=int, required=True, help='Specify the stage number among [1, 4].') parser.add_argument('--stage', nargs='?', type=int, required=True, help='Specify the stage number among [1, 4].')
parser.add_argument('--experiment_ids', nargs='+', type=int, required=True, help='Compute the results of the specified experiment id(s).' + \ parser.add_argument('--experiment_ids', nargs='+', type=int, required=True, help='Compute the results of the specified experiment id(s).' + \
'stage=1: {{base_with_params}} {{random_with_params}} {{omp_with_params}} {{base_wo_params}} {{random_wo_params}} {{omp_wo_params}}') 'stage=1: {{base_with_params}} {{random_with_params}} {{omp_with_params}} {{base_wo_params}} {{random_wo_params}} {{omp_wo_params}}' + \
'stage=2: {{no_normalization}} {{normalize_D}} {{normalize_weights}} {{normalize_D_and_weights}}' + \
'stage=3: {{train-dev_subset}} {{train-dev_train-dev_subset}} {{train-train-dev_subset}}')
parser.add_argument('--dataset_name', nargs='?', type=str, required=True, help='Specify the dataset name. TODO: read it from models dir directly.') parser.add_argument('--dataset_name', nargs='?', type=str, required=True, help='Specify the dataset name. TODO: read it from models dir directly.')
parser.add_argument('--extracted_forest_sizes_number', nargs='?', type=int, required=True, help='Specify the number of extracted forest sizes. TODO: read it from models dir directly.') parser.add_argument('--extracted_forest_sizes_number', nargs='?', type=int, required=True, help='Specify the number of extracted forest sizes. TODO: read it from models dir directly.')
parser.add_argument('--results_dir', nargs='?', type=str, default=DEFAULT_RESULTS_DIR, help='The output directory of the results.') parser.add_argument('--results_dir', nargs='?', type=str, default=DEFAULT_RESULTS_DIR, help='The output directory of the results.')
...@@ -139,33 +141,48 @@ if __name__ == "__main__": ...@@ -139,33 +141,48 @@ if __name__ == "__main__":
# base_with_params # base_with_params
logger.info('Loading base_with_params experiment scores...') logger.info('Loading base_with_params experiment scores...')
base_with_params_train_scores, base_with_params_dev_scores, base_with_params_test_scores = \ base_with_params_train_scores, base_with_params_dev_scores, base_with_params_test_scores, \
base_with_params_experiment_score_metric = \
extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, args.experiment_ids[0], extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, args.experiment_ids[0],
args.extracted_forest_sizes_number) args.extracted_forest_sizes_number)
# random_with_params # random_with_params
logger.info('Loading random_with_params experiment scores...') logger.info('Loading random_with_params experiment scores...')
random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores, \ random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores, \
with_params_extracted_forest_sizes = extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[1]) with_params_extracted_forest_sizes, random_with_params_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[1])
# omp_with_params # omp_with_params
logger.info('Loading omp_with_params experiment scores...') logger.info('Loading omp_with_params experiment scores...')
omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _ = \ omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[2]) omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
args.models_dir, args.results_dir, args.experiment_ids[2])
# Experiments that didn't use the best hyperparameters found for this dataset # Experiments that didn't use the best hyperparameters found for this dataset
# base_wo_params # base_wo_params
logger.info('Loading base_wo_params experiment scores...') logger.info('Loading base_wo_params experiment scores...')
base_wo_params_train_scores, base_wo_params_dev_scores, base_wo_params_test_scores = \ base_wo_params_train_scores, base_wo_params_dev_scores, base_wo_params_test_scores, \
extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, args.experiment_ids[3], base_wo_params_experiment_score_metric = extract_scores_across_seeds_and_forest_size(
args.models_dir, args.results_dir, args.experiment_ids[3],
args.extracted_forest_sizes_number) args.extracted_forest_sizes_number)
# random_wo_params # random_wo_params
logger.info('Loading random_wo_params experiment scores...') logger.info('Loading random_wo_params experiment scores...')
random_wo_params_train_scores, random_wo_params_dev_scores, random_wo_params_test_scores, \ random_wo_params_train_scores, random_wo_params_dev_scores, random_wo_params_test_scores, \
wo_params_extracted_forest_sizes = extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[4]) wo_params_extracted_forest_sizes, random_wo_params_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(
args.models_dir, args.results_dir, args.experiment_ids[4])
# base_wo_params # base_wo_params
logger.info('Loading base_wo_params experiment scores...') logger.info('Loading base_wo_params experiment scores...')
omp_wo_params_train_scores, omp_wo_params_dev_scores, omp_wo_params_test_scores, _ = \ omp_wo_params_train_scores, omp_wo_params_dev_scores, omp_wo_params_test_scores, _, \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[5]) omp_wo_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
args.models_dir, args.results_dir, args.experiment_ids[5])
# Sanity check on the retrieved metrics
if not (base_with_params_experiment_score_metric == random_with_params_experiment_score_metric ==
omp_with_params_experiment_score_metric == base_wo_params_experiment_score_metric ==
random_wo_params_experiment_score_metric ==
omp_wo_params_experiment_score_metric):
raise ValueError('Score metrics of all experiments must be the same.')
experiments_score_metric = base_with_params_experiment_score_metric
output_path = os.path.join(args.results_dir, args.dataset_name, 'stage1') output_path = os.path.join(args.results_dir, args.dataset_name, 'stage1')
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True) pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
...@@ -191,18 +208,111 @@ if __name__ == "__main__": ...@@ -191,18 +208,111 @@ if __name__ == "__main__":
all_labels=['base', 'random', 'omp'], all_labels=['base', 'random', 'omp'],
x_value=with_params_extracted_forest_sizes, x_value=with_params_extracted_forest_sizes,
xlabel='Number of trees extracted', xlabel='Number of trees extracted',
ylabel='MSE', # TODO: hardcoded ylabel=experiments_score_metric,
title='Loss values of {}\nusing best and default hyperparameters'.format(args.dataset_name) title='Loss values of {}\nusing best and default hyperparameters'.format(args.dataset_name)
) )
elif args.stage == 2:
# Stage 2 compares the four normalization combinations (D and/or weights).
if len(args.experiment_ids) != 4:
raise ValueError('In the case of stage 2, the number of specified experiment ids must be 4.')
# no_normalization
logger.info('Loading no_normalization experiment scores...')
_, _, no_normalization_test_scores, extracted_forest_sizes, no_normalization_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
args.experiment_ids[0])
# normalize_D
logger.info('Loading normalize_D experiment scores...')
_, _, normalize_D_test_scores, _, normalize_D_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
args.experiment_ids[1])
# normalize_weights
logger.info('Loading normalize_weights experiment scores...')
_, _, normalize_weights_test_scores, _, normalize_weights_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
args.experiment_ids[2])
# normalize_D_and_weights
logger.info('Loading normalize_D_and_weights experiment scores...')
_, _, normalize_D_and_weights_test_scores, _, normalize_D_and_weights_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
args.experiment_ids[3])
# Sanity check on the retrieved metrics: all four experiments must have
# been scored with the same metric for the curves to be comparable.
if not (no_normalization_experiment_score_metric == normalize_D_experiment_score_metric
== normalize_weights_experiment_score_metric == normalize_D_and_weights_experiment_score_metric):
raise ValueError('Score metrics of all experiments must be the same.')
experiments_score_metric = no_normalization_experiment_score_metric
output_path = os.path.join(args.results_dir, args.dataset_name, 'stage2')
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
# Plot the four test-score curves on one figure, y axis labelled with
# the (shared) score metric name.
Plotter.plot_stage2_losses(
file_path=output_path + os.sep + 'losses.png',
all_experiment_scores=[no_normalization_test_scores, normalize_D_test_scores,
normalize_weights_test_scores, normalize_D_and_weights_test_scores],
all_labels=['no_normalization', 'normalize_D', 'normalize_weights', 'normalize_D_and_weights'],
x_value=extracted_forest_sizes,
xlabel='Number of trees extracted',
ylabel=experiments_score_metric,
title='Loss values of {}\nusing different normalizations'.format(args.dataset_name))
elif args.stage == 3:
if len(args.experiment_ids) != 3:
raise ValueError('In the case of stage 3, the number of specified experiment ids must be 3.')
# train-dev_subset
logger.info('Loading train-dev_subset experiment scores...')
train_dev_subset_train_scores, train_dev_subset_dev_scores, train_dev_subset_test_scores, \
extracted_forest_sizes, train_dev_subset_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
args.experiment_ids[0])
# train-dev_train-dev_subset
logger.info('Loading train-dev_train-dev_subset experiment scores...')
train_dev_train_dev_subset_train_scores, train_dev_train_dev_subset_dev_scores, train_dev_train_dev_subset_test_scores, \
_, train_dev_train_dev_subset_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
args.experiment_ids[1])
# train-train-dev_subset
logger.info('Loading train-train-dev_subset experiment scores...')
train_train_dev_subset_train_scores, train_train_dev_subset_dev_scores, train_train_dev_subset_test_scores, \
_, train_train_dev_subset_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
args.experiment_ids[2])
# Sanity check on the metrics retreived
if not (train_dev_subset_experiment_score_metric == train_dev_train_dev_subset_experiment_score_metric
== train_train_dev_subset_experiment_score_metric):
raise ValueError('Score metrics of all experiments must be the same.')
experiments_score_metric = train_dev_subset_experiment_score_metric
output_path = os.path.join(args.results_dir, args.dataset_name, 'stage3')
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
Plotter.plot_stage2_losses(
file_path=output_path + os.sep + 'losses.png',
all_experiment_scores=[train_dev_subset_train_scores, train_train_dev_subset_train_scores,
train_train_dev_subset_train_scores, train_dev_subset_dev_scores, train_dev_train_dev_subset_dev_scores,
train_train_dev_subset_dev_scores, train_dev_subset_test_scores, train_dev_train_dev_subset_test_scores,
train_train_dev_subset_test_scores],
all_labels=['train,dev - train', 'train+dev,train+dev - train', 'train,train+dev - train',
'train,dev - dev', 'train+dev,train+dev - dev', 'train,train+dev - dev',
'train,dev - test', 'train+dev,train+dev - test', 'train,train+dev - test'],
x_value=extracted_forest_sizes,
xlabel='Number of trees extracted',
ylabel=experiments_score_metric,
title='Loss values of {}\nusing different training subsets'.format(args.dataset_name))
else: else:
raise ValueError('This stage number is not supported yet, but it will be!') raise ValueError('This stage number is not supported yet, but it will be!')
""" """
TODO: TODO:
For each dataset: For each dataset:
[ALMOST DONE] Stage 1) A figure for the selection of the best base forest model hyperparameters (best vs default/random hyperparams) Stage 1) [DONE for california_housing] A figure for the selection of the best base forest model hyperparameters (best vs default/random hyperparams)
Stage 2) A figure for the selection of the best combination of normalization: D normalization vs weights normalization (4 combinations) Stage 2) [DONE for california_housing] A figure for the selection of the best combination of normalization: D normalization vs weights normalization (4 combinations)
Stage 3) A figure for the selection of the most relevant subsets combination: train,dev vs train+dev,train+dev vs train,train+dev Stage 3) [DONE for california_housing] A figure for the selection of the most relevant subsets combination: train,dev vs train+dev,train+dev vs train,train+dev
Stage 4) A figure to finally compare the perf of our approach using the previous selected Stage 4) A figure to finally compare the perf of our approach using the previous selected
parameters vs the baseline vs other papers using different extracted forest size parameters vs the baseline vs other papers using different extracted forest size
(percentage of the tree size found previously in best hyperparams search) on the abscissa. (percentage of the tree size found previously in best hyperparams search) on the abscissa.
......
...@@ -18,6 +18,7 @@ import threading ...@@ -18,6 +18,7 @@ import threading
import json import json
from tqdm import tqdm from tqdm import tqdm
import numpy as np import numpy as np
import shutil
def process_job(seed, parameters, experiment_id, hyperparameters): def process_job(seed, parameters, experiment_id, hyperparameters):
...@@ -100,7 +101,7 @@ def process_job(seed, parameters, experiment_id, hyperparameters): ...@@ -100,7 +101,7 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
logger.info('Training done') logger.info('Training done')
""" """
Example for stage 1: Command lines example for stage 1:
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=none --save_experiment_configuration 1 none_with_params --extracted_forest_size_stop=0.05 python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=none --save_experiment_configuration 1 none_with_params --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=random --save_experiment_configuration 1 random_with_params --extracted_forest_size_stop=0.05 python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=random --save_experiment_configuration 1 random_with_params --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 1 omp_with_params --extracted_forest_size_stop=0.05 python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 1 omp_with_params --extracted_forest_size_stop=0.05
...@@ -109,12 +110,18 @@ python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extra ...@@ -109,12 +110,18 @@ python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extra
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --skip_best_hyperparams --save_experiment_configuration 1 omp_wo_params --forest_size=1000 --extracted_forest_size_stop=0.05 python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --skip_best_hyperparams --save_experiment_configuration 1 omp_wo_params --forest_size=1000 --extracted_forest_size_stop=0.05
python code/compute_results.py --stage 1 --experiment_ids 1 2 3 4 5 6 --dataset_name=california_housing --extracted_forest_sizes_number=5 python code/compute_results.py --stage 1 --experiment_ids 1 2 3 4 5 6 --dataset_name=california_housing --extracted_forest_sizes_number=5
Example for stage 2: Command lines example for stage 2:
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 no_normalization --extracted_forest_size_stop=0.05 python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 no_normalization --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 normalize_D --normalize_D --extracted_forest_size_stop=0.05 python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 normalize_D --normalize_D --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 normalize_weights --normalize_weights --extracted_forest_size_stop=0.05 python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 normalize_weights --normalize_weights --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 normalize_D_and_weights --normalize_D --normalize_weights --extracted_forest_size_stop=0.05 python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 normalize_D_and_weights --normalize_D --normalize_weights --extracted_forest_size_stop=0.05
python code/compute_results.py --stage 2 --experiment_ids 7 8 9 10 --dataset_name=california_housing --extracted_forest_sizes_number=5 python code/compute_results.py --stage 2 --experiment_ids 7 8 9 10 --dataset_name=california_housing --extracted_forest_sizes_number=5
Command lines example for stage 3:
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 3 train-dev_subset --extracted_forest_size_stop=0.05 --subsets_used train,dev
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 3 train-dev_train-dev_subset --extracted_forest_size_stop=0.05 --subsets_used train+dev,train+dev
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 3 train-train-dev_subset --extracted_forest_size_stop=0.05 --subsets_used train,train+dev
python code/compute_results.py --stage 3 --experiment_ids 11 12 13 --dataset_name=california_housing --extracted_forest_sizes_number=5
""" """
if __name__ == "__main__": if __name__ == "__main__":
load_dotenv(find_dotenv('.env')) load_dotenv(find_dotenv('.env'))
...@@ -130,6 +137,7 @@ if __name__ == "__main__": ...@@ -130,6 +137,7 @@ if __name__ == "__main__":
end_random_seed_range = 2000 end_random_seed_range = 2000
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--experiment_id', nargs='?', type=int, default=None, help='Specify an experiment id. Remove already existing model with this specified experiment id.')
parser.add_argument('--experiment_configuration', nargs='?', type=str, default=None, help='Specify an experiment configuration file name. Overload all other parameters.') parser.add_argument('--experiment_configuration', nargs='?', type=str, default=None, help='Specify an experiment configuration file name. Overload all other parameters.')
parser.add_argument('--experiment_configuration_path', nargs='?', type=str, default=DEFAULT_EXPERIMENT_CONFIGURATION_PATH, help='Specify the experiment configuration directory path.') parser.add_argument('--experiment_configuration_path', nargs='?', type=str, default=DEFAULT_EXPERIMENT_CONFIGURATION_PATH, help='Specify the experiment configuration directory path.')
parser.add_argument('--dataset_name', nargs='?', type=str, default=DatasetLoader.DEFAULT_DATASET_NAME, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.') parser.add_argument('--dataset_name', nargs='?', type=str, default=DatasetLoader.DEFAULT_DATASET_NAME, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
...@@ -143,7 +151,7 @@ if __name__ == "__main__": ...@@ -143,7 +151,7 @@ if __name__ == "__main__":
parser.add_argument('--test_size', nargs='?', type=float, default=DatasetLoader.DEFAULT_TEST_SIZE, help='Test subset ratio.') parser.add_argument('--test_size', nargs='?', type=float, default=DatasetLoader.DEFAULT_TEST_SIZE, help='Test subset ratio.')
parser.add_argument('--random_seed_number', nargs='?', type=int, default=DatasetLoader.DEFAULT_RANDOM_SEED_NUMBER, help='Number of random seeds used.') parser.add_argument('--random_seed_number', nargs='?', type=int, default=DatasetLoader.DEFAULT_RANDOM_SEED_NUMBER, help='Number of random seeds used.')
parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specific a list of seeds instead of generate them randomly') parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specific a list of seeds instead of generate them randomly')
parser.add_argument('--subsets_used', nargs='+', type=str, default=DatasetLoader.DEFAULT_SUBSETS_USED, help='train,dev: forest on train, OMP on dev. train+dev,train+dev: both forest and OMP on train+dev. train,train+dev: forest on train+dev and OMP on dev.') parser.add_argument('--subsets_used', nargs='?', type=str, default=DatasetLoader.DEFAULT_SUBSETS_USED, help='train,dev: forest on train, OMP on dev. train+dev,train+dev: both forest and OMP on train+dev. train,train+dev: forest on train+dev and OMP on dev.')
parser.add_argument('--normalize_weights', action='store_true', default=DatasetLoader.DEFAULT_NORMALIZE_WEIGHTS, help='Divide the predictions by the weights sum.') parser.add_argument('--normalize_weights', action='store_true', default=DatasetLoader.DEFAULT_NORMALIZE_WEIGHTS, help='Divide the predictions by the weights sum.')
parser.add_argument('--verbose', action='store_true', default=DEFAULT_VERBOSE, help='Print tqdm progress bar.') parser.add_argument('--verbose', action='store_true', default=DEFAULT_VERBOSE, help='Print tqdm progress bar.')
parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters if there exist.') parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters if there exist.')
...@@ -204,10 +212,13 @@ if __name__ == "__main__": ...@@ -204,10 +212,13 @@ if __name__ == "__main__":
else [random.randint(begin_random_seed_range, end_random_seed_range) \ else [random.randint(begin_random_seed_range, end_random_seed_range) \
for i in range(parameters['random_seed_number'])] for i in range(parameters['random_seed_number'])]
if args.experiment_id:
experiment_id = args.experiment_id
shutil.rmtree(os.path.join(parameters['models_dir'], str(experiment_id)), ignore_errors=True)
else:
# Resolve the next experiment id number (last id + 1) # Resolve the next experiment id number (last id + 1)
experiment_id = resolve_experiment_id(parameters['models_dir']) experiment_id = resolve_experiment_id(parameters['models_dir'])
logger.info('Experiment id: {}'.format(experiment_id)) logger.info('Experiment id: {}'.format(experiment_id))
parameters['experiment_id'] = experiment_id
""" """
If the experiment configuration isn't coming from If the experiment configuration isn't coming from
...@@ -224,8 +235,7 @@ if __name__ == "__main__": ...@@ -224,8 +235,7 @@ if __name__ == "__main__":
args.dataset_name, 'stage' + args.save_experiment_configuration[0]) args.dataset_name, 'stage' + args.save_experiment_configuration[0])
pathlib.Path(output_experiment_stage_path).mkdir(parents=True, exist_ok=True) pathlib.Path(output_experiment_stage_path).mkdir(parents=True, exist_ok=True)
output_experiment_configuration_path = os.path.join(output_experiment_stage_path, output_experiment_configuration_path = os.path.join(output_experiment_stage_path,
args.save_experiment_configuration[1] + '_{}.json'.format( args.save_experiment_configuration[1] + '.json')
experiment_id))
else: else:
pathlib.Path(os.path.join(args.experiment_configuration_path, 'unnamed')).mkdir(parents=True, exist_ok=True) pathlib.Path(os.path.join(args.experiment_configuration_path, 'unnamed')).mkdir(parents=True, exist_ok=True)
output_experiment_configuration_path = os.path.join( output_experiment_configuration_path = os.path.join(
......
...@@ -32,6 +32,5 @@ ...@@ -32,6 +32,5 @@
50, 50,
66, 66,
83 83
], ]
"experiment_id": 1
} }
\ No newline at end of file
...@@ -32,6 +32,5 @@ ...@@ -32,6 +32,5 @@
50, 50,
66, 66,
83 83
], ]
"experiment_id": 4
} }
\ No newline at end of file
...@@ -32,6 +32,5 @@ ...@@ -32,6 +32,5 @@
50, 50,
66, 66,
83 83
], ]
"experiment_id": 3
} }
\ No newline at end of file
...@@ -32,6 +32,5 @@ ...@@ -32,6 +32,5 @@
50, 50,
66, 66,
83 83
], ]
"experiment_id": 6
} }
\ No newline at end of file
...@@ -32,6 +32,5 @@ ...@@ -32,6 +32,5 @@
50, 50,
66, 66,
83 83
], ]
"experiment_id": 2
} }
\ No newline at end of file
...@@ -32,6 +32,5 @@ ...@@ -32,6 +32,5 @@
50, 50,
66, 66,
83 83
], ]
"experiment_id": 5
} }
\ No newline at end of file
{
"experiment_configuration": null,
"experiment_configuration_path": "experiments",
"dataset_name": "california_housing",
"normalize_D": false,
"dataset_normalizer": "standard",
"forest_size": null,
"extracted_forest_size_samples": 5,
"extracted_forest_size_stop": 0.05,
"models_dir": "./models",
"dev_size": 0.2,
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
1,
2,
3,
4,
5
],
"subsets_used": "train,dev",
"normalize_weights": false,
"verbose": false,
"skip_best_hyperparams": false,
"save_experiment_configuration": [
"2",
"no_normalization"
],
"job_number": -1,
"extraction_strategy": "omp",
"extracted_forest_size": [
8,
17,
25,
33,
42
]
}
\ No newline at end of file
{
"experiment_id": 8,
"experiment_configuration": null,
"experiment_configuration_path": "experiments",
"dataset_name": "california_housing",
"normalize_D": true,
"dataset_normalizer": "standard",
"forest_size": null,
"extracted_forest_size_samples": 5,
"extracted_forest_size_stop": 0.05,
"models_dir": "./models",
"dev_size": 0.2,
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
1,
2,
3,
4,
5
],
"subsets_used": "train,dev",
"normalize_weights": false,
"verbose": false,
"skip_best_hyperparams": false,
"save_experiment_configuration": [
"2",
"normalize_D"
],
"job_number": -1,
"extraction_strategy": "omp",
"extracted_forest_size": [
8,
17,
25,
33,
42
]
}
\ No newline at end of file
{
"experiment_id": 10,
"experiment_configuration": null,
"experiment_configuration_path": "experiments",
"dataset_name": "california_housing",
"normalize_D": true,
"dataset_normalizer": "standard",
"forest_size": null,
"extracted_forest_size_samples": 5,
"extracted_forest_size_stop": 0.05,
"models_dir": "./models",
"dev_size": 0.2,
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
1,
2,
3,
4,
5
],
"subsets_used": "train,dev",
"normalize_weights": true,
"verbose": false,
"skip_best_hyperparams": false,
"save_experiment_configuration": [
"2",
"normalize_D_and_weights"
],
"job_number": -1,
"extraction_strategy": "omp",
"extracted_forest_size": [
8,
17,
25,
33,
42
]
}
\ No newline at end of file
{
"experiment_id": 9,
"experiment_configuration": null,
"experiment_configuration_path": "experiments",
"dataset_name": "california_housing",
"normalize_D": false,
"dataset_normalizer": "standard",
"forest_size": null,
"extracted_forest_size_samples": 5,
"extracted_forest_size_stop": 0.05,
"models_dir": "./models",
"dev_size": 0.2,
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
1,
2,
3,
4,
5
],
"subsets_used": "train,dev",
"normalize_weights": true,
"verbose": false,
"skip_best_hyperparams": false,
"save_experiment_configuration": [
"2",
"normalize_weights"
],
"job_number": -1,
"extraction_strategy": "omp",
"extracted_forest_size": [
8,
17,
25,
33,
42
]
}
\ No newline at end of file
results/california_housing/stage2/losses.png

54.8 KiB

results/california_housing/stage3/stage3_losses_all.png

65.5 KiB

results/california_housing/stage3/stage3_losses_test.png

54.9 KiB

0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment