Skip to content
Snippets Groups Projects
Commit 5ee9422b authored by Charly Lamothe
Browse files

Handle similarity_similarities and similarity_predictions in the pipeline and...

Handle similarity_similarities and similarity_predictions in the pipeline and set lfw_pairs to binary classification (TODO: change the labels for OMP)
parent bab07f41
No related branches found
No related tags found
1 merge request: !23 Resolve "integration-sota"
...@@ -81,7 +81,7 @@ class DatasetLoader(object): ...@@ -81,7 +81,7 @@ class DatasetLoader(object):
elif name == 'lfw_pairs': elif name == 'lfw_pairs':
dataset = fetch_lfw_pairs() dataset = fetch_lfw_pairs()
X, y = dataset.data, dataset.target X, y = dataset.data, dataset.target
task = Task.MULTICLASSIFICATION task = Task.BINARYCLASSIFICATION
elif name == 'covtype': elif name == 'covtype':
X, y = fetch_covtype(random_state=dataset_parameters.random_state, shuffle=True, return_X_y=True) X, y = fetch_covtype(random_state=dataset_parameters.random_state, shuffle=True, return_X_y=True)
task = Task.MULTICLASSIFICATION task = Task.MULTICLASSIFICATION
......
...@@ -29,7 +29,7 @@ class ModelFactory(object): ...@@ -29,7 +29,7 @@ class ModelFactory(object):
random_state=model_parameters.seed) random_state=model_parameters.seed)
elif model_parameters.extraction_strategy == 'kmeans': elif model_parameters.extraction_strategy == 'kmeans':
return KMeansForestClassifier(model_parameters) return KMeansForestClassifier(model_parameters)
elif model_parameters.extraction_strategy == 'similarity': elif model_parameters.extraction_strategy in ['similarity_similarities', 'similarity_predictions']:
return SimilarityForestClassifier(model_parameters) return SimilarityForestClassifier(model_parameters)
else: else:
raise ValueError('Invalid extraction strategy') raise ValueError('Invalid extraction strategy')
...@@ -39,7 +39,7 @@ class ModelFactory(object): ...@@ -39,7 +39,7 @@ class ModelFactory(object):
elif model_parameters.extraction_strategy == 'random': elif model_parameters.extraction_strategy == 'random':
return RandomForestRegressor(**model_parameters.hyperparameters, return RandomForestRegressor(**model_parameters.hyperparameters,
random_state=model_parameters.seed) random_state=model_parameters.seed)
elif model_parameters.extraction_strategy == 'similarity': elif model_parameters.extraction_strategy in ['similarity_similarities', 'similarity_predictions']:
return SimilarityForestRegressor(model_parameters) return SimilarityForestRegressor(model_parameters)
elif model_parameters.extraction_strategy == 'kmeans': elif model_parameters.extraction_strategy == 'kmeans':
return KMeansForestRegressor(model_parameters) return KMeansForestRegressor(model_parameters)
......
...@@ -7,6 +7,7 @@ import argparse ...@@ -7,6 +7,7 @@ import argparse
import pathlib import pathlib
from dotenv import find_dotenv, load_dotenv from dotenv import find_dotenv, load_dotenv
import os import os
import numpy as np
def retreive_extracted_forest_sizes_number(models_dir, experiment_id): def retreive_extracted_forest_sizes_number(models_dir, experiment_id):
...@@ -17,7 +18,7 @@ def retreive_extracted_forest_sizes_number(models_dir, experiment_id): ...@@ -17,7 +18,7 @@ def retreive_extracted_forest_sizes_number(models_dir, experiment_id):
extracted_forest_sizes_root_path = experiment_seed_path + os.sep + 'extracted_forest_sizes' extracted_forest_sizes_root_path = experiment_seed_path + os.sep + 'extracted_forest_sizes'
return len(os.listdir(extracted_forest_sizes_root_path)) return len(os.listdir(extracted_forest_sizes_root_path))
def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_dir, experiment_id, weights=True): def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_dir, experiment_id, weights=True, extracted_forest_sizes=list()):
experiment_id_path = models_dir + os.sep + str(experiment_id) # models/{experiment_id} experiment_id_path = models_dir + os.sep + str(experiment_id) # models/{experiment_id}
experiment_seed_root_path = experiment_id_path + os.sep + 'seeds' # models/{experiment_id}/seeds experiment_seed_root_path = experiment_id_path + os.sep + 'seeds' # models/{experiment_id}/seeds
...@@ -45,6 +46,7 @@ def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_d ...@@ -45,6 +46,7 @@ def extract_scores_across_seeds_and_extracted_forest_sizes(models_dir, results_d
experiment_dev_scores[seed] = list() experiment_dev_scores[seed] = list()
experiment_test_scores[seed] = list() experiment_test_scores[seed] = list()
if len(extracted_forest_sizes) == 0:
# List the forest sizes in models/{experiment_id}/seeds/{seed}/extracted_forest_sizes # List the forest sizes in models/{experiment_id}/seeds/{seed}/extracted_forest_sizes
extracted_forest_sizes = os.listdir(extracted_forest_sizes_root_path) extracted_forest_sizes = os.listdir(extracted_forest_sizes_root_path)
extracted_forest_sizes = [nb_tree for nb_tree in extracted_forest_sizes if not 'no_weights' in nb_tree ] extracted_forest_sizes = [nb_tree for nb_tree in extracted_forest_sizes if not 'no_weights' in nb_tree ]
...@@ -437,6 +439,15 @@ if __name__ == "__main__": ...@@ -437,6 +439,15 @@ if __name__ == "__main__":
all_labels = list() all_labels = list()
all_scores = list() all_scores = list()
"""extracted_forest_sizes = np.unique(np.around(1000 *
np.linspace(0, 1.0,
30 + 1,
endpoint=True)[1:]).astype(np.int)).tolist()"""
extracted_forest_sizes = [4, 7, 11, 14, 18, 22, 25, 29, 32, 36, 40, 43, 47, 50, 54, 58, 61, 65, 68, 72, 76, 79, 83, 86, 90, 94, 97, 101, 104, 108]
extracted_forest_sizes = [str(forest_size) for forest_size in extracted_forest_sizes]
# base_with_params # base_with_params
logger.info('Loading base_with_params experiment scores...') logger.info('Loading base_with_params experiment scores...')
base_with_params_train_scores, base_with_params_dev_scores, base_with_params_test_scores, \ base_with_params_train_scores, base_with_params_dev_scores, base_with_params_test_scores, \
...@@ -447,21 +458,23 @@ if __name__ == "__main__": ...@@ -447,21 +458,23 @@ if __name__ == "__main__":
logger.info('Loading random_with_params experiment scores...') logger.info('Loading random_with_params experiment scores...')
random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores, \ random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores, \
with_params_extracted_forest_sizes, random_with_params_experiment_score_metric = \ with_params_extracted_forest_sizes, random_with_params_experiment_score_metric = \
extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, int(args.experiment_ids[1])) extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, int(args.experiment_ids[1]),
extracted_forest_sizes=extracted_forest_sizes)
# omp_with_params # omp_with_params
logger.info('Loading omp_with_params experiment scores...') logger.info('Loading omp_with_params experiment scores...')
omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \ omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \
omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes( omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
args.models_dir, args.results_dir, int(args.experiment_ids[2])) args.models_dir, args.results_dir, int(args.experiment_ids[2]), extracted_forest_sizes=extracted_forest_sizes)
#omp_with_params_without_weights #omp_with_params_without_weights
logger.info('Loading omp_with_params without weights experiment scores...') logger.info('Loading omp_with_params without weights experiment scores...')
omp_with_params_without_weights_train_scores, omp_with_params_without_weights_dev_scores, omp_with_params_without_weights_test_scores, _, \ omp_with_params_without_weights_train_scores, omp_with_params_without_weights_dev_scores, omp_with_params_without_weights_test_scores, _, \
omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes( omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
args.models_dir, args.results_dir, int(args.experiment_ids[2]), weights=False) args.models_dir, args.results_dir, int(args.experiment_ids[2]), weights=False, extracted_forest_sizes=extracted_forest_sizes)
all_labels = ['base', 'random', 'omp', 'omp_without_weights'] all_labels = ['base', 'random', 'omp']
all_scores = [base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores, all_scores = [base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores]
omp_with_params_without_weights_test_scores] #all_scores = [base_with_params_train_scores, random_with_params_train_scores, omp_with_params_train_scores,
# omp_with_params_without_weights_train_scores]
for i in range(3, len(args.experiment_ids)): for i in range(3, len(args.experiment_ids)):
if 'kmeans' in args.experiment_ids[i]: if 'kmeans' in args.experiment_ids[i]:
...@@ -476,16 +489,17 @@ if __name__ == "__main__": ...@@ -476,16 +489,17 @@ if __name__ == "__main__":
logger.info(f'Loading {label} experiment scores...') logger.info(f'Loading {label} experiment scores...')
current_experiment_id = int(args.experiment_ids[i].split('=')[1]) current_experiment_id = int(args.experiment_ids[i].split('=')[1])
_, _, current_test_scores, _, _ = extract_scores_across_seeds_and_extracted_forest_sizes( current_train_scores, _, current_test_scores, _, _ = extract_scores_across_seeds_and_extracted_forest_sizes(
args.models_dir, args.results_dir, current_experiment_id) args.models_dir, args.results_dir, current_experiment_id)
all_labels.append(label) all_labels.append(label)
all_scores.append(current_test_scores) all_scores.append(current_test_scores)
#all_scores.append(current_train_scores)
output_path = os.path.join(args.results_dir, args.dataset_name, 'stage5') output_path = os.path.join(args.results_dir, args.dataset_name, 'stage5')
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True) pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
Plotter.plot_stage2_losses( Plotter.plot_stage2_losses(
file_path=output_path + os.sep + f"losses_{'-'.join(all_labels)}.png", file_path=output_path + os.sep + f"losses_{'-'.join(all_labels)}_test.png",
all_experiment_scores=all_scores, all_experiment_scores=all_scores,
all_labels=all_labels, all_labels=all_labels,
x_value=with_params_extracted_forest_sizes, x_value=with_params_extracted_forest_sizes,
......
...@@ -97,11 +97,11 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb ...@@ -97,11 +97,11 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb
if os.path.isdir(sub_models_dir): if os.path.isdir(sub_models_dir):
sub_models_dir_files = os.listdir(sub_models_dir) sub_models_dir_files = os.listdir(sub_models_dir)
for file_name in sub_models_dir_files: for file_name in sub_models_dir_files:
if '.pickle' != os.path.splitext(file_name)[1]: if file_name == 'model_raw_results.pickle':
continue
else:
already_exists = os.path.getsize(os.path.join(sub_models_dir, file_name)) > 0 already_exists = os.path.getsize(os.path.join(sub_models_dir, file_name)) > 0
break break
else:
continue
if already_exists: if already_exists:
logger.info('Base forest result already exists. Skipping...') logger.info('Base forest result already exists. Skipping...')
else: else:
...@@ -140,11 +140,11 @@ def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_siz ...@@ -140,11 +140,11 @@ def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_siz
if os.path.isdir(sub_models_dir): if os.path.isdir(sub_models_dir):
sub_models_dir_files = os.listdir(sub_models_dir) sub_models_dir_files = os.listdir(sub_models_dir)
for file_name in sub_models_dir_files: for file_name in sub_models_dir_files:
if '.pickle' != os.path.splitext(file_name)[1]: if file_name == 'model_raw_results.pickle':
continue
else:
already_exists = os.path.getsize(os.path.join(sub_models_dir, file_name)) > 0 already_exists = os.path.getsize(os.path.join(sub_models_dir, file_name)) > 0
break break
else:
continue
if already_exists: if already_exists:
logger.info(f'Extracted forest {extracted_forest_size} result already exists. Skipping...') logger.info(f'Extracted forest {extracted_forest_size} result already exists. Skipping...')
return return
...@@ -235,7 +235,7 @@ if __name__ == "__main__": ...@@ -235,7 +235,7 @@ if __name__ == "__main__":
parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters if there exist.') parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters if there exist.')
parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line in a file. Args: {{stage_num}} {{name}}') parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line in a file. Args: {{stage_num}} {{name}}')
parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of job used during the parallelisation across seeds.') parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of job used during the parallelisation across seeds.')
parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none, similarity, kmeans, ensemble.') parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none, similarity_similarities, similarity_predictions, kmeans, ensemble.')
parser.add_argument('--overwrite', action='store_true', default=DEFAULT_OVERWRITE, help='Overwrite the experiment id') parser.add_argument('--overwrite', action='store_true', default=DEFAULT_OVERWRITE, help='Overwrite the experiment id')
args = parser.parse_args() args = parser.parse_args()
...@@ -246,7 +246,7 @@ if __name__ == "__main__": ...@@ -246,7 +246,7 @@ if __name__ == "__main__":
else: else:
parameters = args.__dict__ parameters = args.__dict__
if parameters['extraction_strategy'] not in ['omp', 'random', 'none', 'similarity', 'kmeans', 'ensemble']: if parameters['extraction_strategy'] not in ['omp', 'random', 'none', 'similarity_similarities', 'similarity_predictions', 'kmeans', 'ensemble']:
raise ValueError('Specified extraction strategy {} is not supported.'.format(parameters.extraction_strategy)) raise ValueError('Specified extraction strategy {} is not supported.'.format(parameters.extraction_strategy))
pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True) pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment