Skip to content
Snippets Groups Projects
Commit 5ee9422b authored by Charly Lamothe's avatar Charly Lamothe
Browse files

Handle similarity_similarities and similarity_predictions in the pipeline and...

Handle similarity_similarities and similarity_predictions in the pipeline and set lfw_pairs to binary classif (todo: change the labels for omp)
parent bab07f41
No related branches found
No related tags found
1 merge request!23Resolve "integration-sota"
......@@ -81,7 +81,7 @@ class DatasetLoader(object):
elif name == 'lfw_pairs':
dataset = fetch_lfw_pairs()
X, y = dataset.data, dataset.target
task = Task.MULTICLASSIFICATION
task = Task.BINARYCLASSIFICATION
elif name == 'covtype':
X, y = fetch_covtype(random_state=dataset_parameters.random_state, shuffle=True, return_X_y=True)
task = Task.MULTICLASSIFICATION
......
......@@ -29,7 +29,7 @@ class ModelFactory(object):
random_state=model_parameters.seed)
elif model_parameters.extraction_strategy == 'kmeans':
return KMeansForestClassifier(model_parameters)
elif model_parameters.extraction_strategy == 'similarity':
elif model_parameters.extraction_strategy in ['similarity_similarities', 'similarity_predictions']:
return SimilarityForestClassifier(model_parameters)
else:
raise ValueError('Invalid extraction strategy')
......@@ -39,7 +39,7 @@ class ModelFactory(object):
elif model_parameters.extraction_strategy == 'random':
return RandomForestRegressor(**model_parameters.hyperparameters,
random_state=model_parameters.seed)
elif model_parameters.extraction_strategy == 'similarity':
elif model_parameters.extraction_strategy in ['similarity_similarities', 'similarity_predictions']:
return SimilarityForestRegressor(model_parameters)
elif model_parameters.extraction_strategy == 'kmeans':
return KMeansForestRegressor(model_parameters)
......
......@@ -7,6 +7,7 @@ import argparse
import pathlib
from dotenv import find_dotenv, load_dotenv
import os
import numpy as np
def retreive_extracted_forest_sizes_number(models_dir, experiment_id):
......@@ -17,7 +18,7 @@ def retreive_extracted_forest_sizes_number(models_dir, experiment_id):
extracted_forest_sizes_root_path = experiment_seed_path + os.sep + 'extracted_forest_sizes'
return len(os.listdir(extracted_forest_sizes_root_path))
def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_dir, experiment_id, weights=True):
def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_dir, experiment_id, weights=True, extracted_forest_sizes=list()):
experiment_id_path = models_dir + os.sep + str(experiment_id) # models/{experiment_id}
experiment_seed_root_path = experiment_id_path + os.sep + 'seeds' # models/{experiment_id}/seeds
......@@ -45,10 +46,11 @@ def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_d
experiment_dev_scores[seed] = list()
experiment_test_scores[seed] = list()
# List the forest sizes in models/{experiment_id}/seeds/{seed}/extracted_forest_sizes
extracted_forest_sizes = os.listdir(extracted_forest_sizes_root_path)
extracted_forest_sizes = [nb_tree for nb_tree in extracted_forest_sizes if not 'no_weights' in nb_tree ]
extracted_forest_sizes.sort(key=int)
if len(extracted_forest_sizes) == 0:
# List the forest sizes in models/{experiment_id}/seeds/{seed}/extracted_forest_sizes
extracted_forest_sizes = os.listdir(extracted_forest_sizes_root_path)
extracted_forest_sizes = [nb_tree for nb_tree in extracted_forest_sizes if not 'no_weights' in nb_tree ]
extracted_forest_sizes.sort(key=int)
all_extracted_forest_sizes.append(list(map(int, extracted_forest_sizes)))
for extracted_forest_size in extracted_forest_sizes:
# models/{experiment_id}/seeds/{seed}/extracted_forest_sizes/{extracted_forest_size}
......@@ -437,6 +439,15 @@ if __name__ == "__main__":
all_labels = list()
all_scores = list()
"""extracted_forest_sizes = np.unique(np.around(1000 *
np.linspace(0, 1.0,
30 + 1,
endpoint=True)[1:]).astype(np.int)).tolist()"""
extracted_forest_sizes = [4, 7, 11, 14, 18, 22, 25, 29, 32, 36, 40, 43, 47, 50, 54, 58, 61, 65, 68, 72, 76, 79, 83, 86, 90, 94, 97, 101, 104, 108]
extracted_forest_sizes = [str(forest_size) for forest_size in extracted_forest_sizes]
# base_with_params
logger.info('Loading base_with_params experiment scores...')
base_with_params_train_scores, base_with_params_dev_scores, base_with_params_test_scores, \
......@@ -447,21 +458,23 @@ if __name__ == "__main__":
logger.info('Loading random_with_params experiment scores...')
random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores, \
with_params_extracted_forest_sizes, random_with_params_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, int(args.experiment_ids[1]))
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, int(args.experiment_ids[1]),
extracted_forest_sizes=extracted_forest_sizes)
# omp_with_params
logger.info('Loading omp_with_params experiment scores...')
omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \
omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
args.models_dir, args.results_dir, int(args.experiment_ids[2]))
args.models_dir, args.results_dir, int(args.experiment_ids[2]), extracted_forest_sizes=extracted_forest_sizes)
#omp_with_params_without_weights
logger.info('Loading omp_with_params without weights experiment scores...')
omp_with_params_without_weights_train_scores, omp_with_params_without_weights_dev_scores, omp_with_params_without_weights_test_scores, _, \
omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
args.models_dir, args.results_dir, int(args.experiment_ids[2]), weights=False)
args.models_dir, args.results_dir, int(args.experiment_ids[2]), weights=False, extracted_forest_sizes=extracted_forest_sizes)
all_labels = ['base', 'random', 'omp', 'omp_without_weights']
all_scores = [base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores,
omp_with_params_without_weights_test_scores]
all_labels = ['base', 'random', 'omp']
all_scores = [base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores]
#all_scores = [base_with_params_train_scores, random_with_params_train_scores, omp_with_params_train_scores,
# omp_with_params_without_weights_train_scores]
for i in range(3, len(args.experiment_ids)):
if 'kmeans' in args.experiment_ids[i]:
......@@ -476,16 +489,17 @@ if __name__ == "__main__":
logger.info(f'Loading {label} experiment scores...')
current_experiment_id = int(args.experiment_ids[i].split('=')[1])
_, _, current_test_scores, _, _ = extract_scores_across_seeds_and_extracted_forest_sizes(
current_train_scores, _, current_test_scores, _, _ = extract_scores_across_seeds_and_extracted_forest_sizes(
args.models_dir, args.results_dir, current_experiment_id)
all_labels.append(label)
all_scores.append(current_test_scores)
#all_scores.append(current_train_scores)
output_path = os.path.join(args.results_dir, args.dataset_name, 'stage5')
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
Plotter.plot_stage2_losses(
file_path=output_path + os.sep + f"losses_{'-'.join(all_labels)}.png",
file_path=output_path + os.sep + f"losses_{'-'.join(all_labels)}_test.png",
all_experiment_scores=all_scores,
all_labels=all_labels,
x_value=with_params_extracted_forest_sizes,
......
......@@ -97,11 +97,11 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb
if os.path.isdir(sub_models_dir):
sub_models_dir_files = os.listdir(sub_models_dir)
for file_name in sub_models_dir_files:
if '.pickle' != os.path.splitext(file_name)[1]:
continue
else:
if file_name == 'model_raw_results.pickle':
already_exists = os.path.getsize(os.path.join(sub_models_dir, file_name)) > 0
break
else:
continue
if already_exists:
logger.info('Base forest result already exists. Skipping...')
else:
......@@ -140,11 +140,11 @@ def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_siz
if os.path.isdir(sub_models_dir):
sub_models_dir_files = os.listdir(sub_models_dir)
for file_name in sub_models_dir_files:
if '.pickle' != os.path.splitext(file_name)[1]:
continue
else:
if file_name == 'model_raw_results.pickle':
already_exists = os.path.getsize(os.path.join(sub_models_dir, file_name)) > 0
break
else:
continue
if already_exists:
logger.info(f'Extracted forest {extracted_forest_size} result already exists. Skipping...')
return
......@@ -235,7 +235,7 @@ if __name__ == "__main__":
parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters if there exist.')
parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line in a file. Args: {{stage_num}} {{name}}')
parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of job used during the parallelisation across seeds.')
parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none, similarity, kmeans, ensemble.')
parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none, similarity_similarities, similarity_predictions, kmeans, ensemble.')
parser.add_argument('--overwrite', action='store_true', default=DEFAULT_OVERWRITE, help='Overwrite the experiment id')
args = parser.parse_args()
......@@ -246,7 +246,7 @@ if __name__ == "__main__":
else:
parameters = args.__dict__
if parameters['extraction_strategy'] not in ['omp', 'random', 'none', 'similarity', 'kmeans', 'ensemble']:
if parameters['extraction_strategy'] not in ['omp', 'random', 'none', 'similarity_similarities', 'similarity_predictions', 'kmeans', 'ensemble']:
raise ValueError('Specified extraction strategy {} is not supported.'.format(parameters.extraction_strategy))
pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment