diff --git a/code/bolsonaro/models/ensemble_selection_forest_regressor.py b/code/bolsonaro/models/ensemble_selection_forest_regressor.py
new file mode 100644
index 0000000000000000000000000000000000000000..a20d8ec8bee5a124a10bf708ae176cf48796458e
--- /dev/null
+++ b/code/bolsonaro/models/ensemble_selection_forest_regressor.py
@@ -0,0 +1,92 @@
+from sklearn.metrics import mean_squared_error
+from sklearn.base import BaseEstimator
+from sklearn.tree import DecisionTreeRegressor
+from abc import abstractmethod, ABCMeta
+import numpy as np
+from tqdm import tqdm
+
+
+class EnsembleSelectionForestRegressor(BaseEstimator, metaclass=ABCMeta):
+    """
+    'Ensemble Selection from Libraries of Models' by Rich Caruana et al.
+    """
+
+    def __init__(self, models_parameters, library, score_metric=mean_squared_error):
+        self._models_parameters = models_parameters
+        self._library = library
+        self._extracted_forest_size = self._models_parameters.extracted_forest_size
+        self._score_metric = score_metric
+
+    @property
+    def models_parameters(self):
+        return self._models_parameters
+
+    @property
+    def library(self):
+        return self._library
+
+    def fit(self, X_train, y_train, X_val, y_val):
+        # Score every library tree on the validation set
+        scores_list = list()
+        for estimator in self._library:
+            val_score = self._score_metric(estimator.predict(X_val), y_val)
+            scores_list.append(val_score)
+
+        class_list = list(self._library)
+        m = np.argmin(np.asarray(scores_list))  # seed the ensemble with the lowest-error tree
+        self._ensemble_selected = [class_list[m]]
+        temp_pred = class_list[m].predict(X_val)
+        del class_list[m]
+        for k in range(self._extracted_forest_size - 1):
+            candidate_index = 0
+            best_score = float('inf')
+            # Greedily pick the tree whose addition minimizes the averaged validation error
+            for j in range(len(class_list)):
+                temp_pred = np.vstack((temp_pred, class_list[j].predict(X_val)))
+                temp_mean = np.mean(temp_pred, axis=0)
+                temp_score = self._score_metric(temp_mean, y_val)
+                if temp_score < best_score:
+                    candidate_index = j
+                    best_score = temp_score
+                temp_pred = np.delete(temp_pred, -1, 0)  # drop the candidate's predictions before trying the next one
+            self._ensemble_selected.append(class_list[candidate_index])
+            temp_pred = np.vstack((temp_pred, class_list[candidate_index].predict(X_val)))
+            del class_list[candidate_index]
+
+    def score(self, X, y):
+        predictions = self._predict_base_estimator(X)
+        mean_predictions = np.mean(predictions, axis=0)
+        return self._score_metric(mean_predictions, y)
+
+    def _predict_base_estimator(self, X):
+        predictions = list()
+        for tree in self._ensemble_selected:
+            predictions.append(tree.predict(X))
+        return np.array(predictions)
+
+    @staticmethod
+    def generate_library(X_train, y_train, random_state=None):
+        criterion_arr = ["mse"]  # , "friedman_mse", "mae"]
+        splitter_arr = ["best"]  # , "random"]
+        depth_arr = [i for i in range(5, 20, 1)]
+        min_samples_split_arr = [i for i in range(2, 20, 1)]
+        min_samples_leaf_arr = [i for i in range(2, 20, 1)]
+        max_features_arr = ["sqrt"]  # ["auto", "sqrt", "log2"]
+
+        library = list()
+        with tqdm(total=len(criterion_arr) * len(splitter_arr) * \
+            len(depth_arr) * len(min_samples_split_arr) * len(min_samples_leaf_arr) * \
+            len(max_features_arr)) as bar:
+            bar.set_description('Generating library')
+            for criterion in criterion_arr:
+                for splitter in splitter_arr:
+                    for depth in depth_arr:
+                        for min_samples_split in min_samples_split_arr:
+                            for min_samples_leaf in min_samples_leaf_arr:
+                                for max_features in max_features_arr:
+                                    t = DecisionTreeRegressor(criterion=criterion, splitter=splitter, max_depth=depth, min_samples_split=min_samples_split,
+                                        min_samples_leaf=min_samples_leaf, max_features=max_features, random_state=random_state)
+                                    t.fit(X_train, y_train)
+                                    library.append(t)
+                                    bar.update(1)
+        return library
diff --git a/code/bolsonaro/models/model_factory.py b/code/bolsonaro/models/model_factory.py
index bbda6cae89d218c7831780f71b9fc6a7bc022d54..eb3e8b50d7411a2beee8e79bf7da46f6561558a2 100644
--- a/code/bolsonaro/models/model_factory.py
+++ b/code/bolsonaro/models/model_factory.py
@@ -3,6 +3,7 @@ from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
 from bolsonaro.models.model_parameters import ModelParameters
 from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor
 from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor
+from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor
 from bolsonaro.data.task import Task
 
 from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
@@ -13,7 +14,7 @@ import pickle
 
 class ModelFactory(object):
 
     @staticmethod
-    def build(task, model_parameters):
+    def build(task, model_parameters, library=None):
         if task not in [Task.BINARYCLASSIFICATION, Task.REGRESSION, Task.MULTICLASSIFICATION]:
             raise ValueError("Unsupported task '{}'".format(task))
@@ -38,6 +39,8 @@ class ModelFactory(object):
             return SimilarityForestRegressor(model_parameters)
         elif model_parameters.extraction_strategy == 'kmeans':
             return KMeansForestRegressor(model_parameters)
+        elif model_parameters.extraction_strategy == 'ensemble':
+            return EnsembleSelectionForestRegressor(model_parameters, library=library)
         elif model_parameters.extraction_strategy == 'none':
             return RandomForestRegressor(n_estimators=model_parameters.hyperparameters['n_estimators'],
                                          random_state=model_parameters.seed)
diff --git a/code/compute_results.py b/code/compute_results.py
index 3bd2f8a22cab9fe5dc224bd36fbc90e154b9f558..406f0e0636111add9f3cb6075fe26fd91ff3cf4d 100644
--- a/code/compute_results.py
+++ b/code/compute_results.py
@@ -133,10 +133,11 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 
     parser.add_argument('--stage', nargs='?', type=int, required=True, help='Specify the stage number among [1, 5].')
-    parser.add_argument('--experiment_ids', nargs='+', type=int, required=True, help='Compute the results of the specified experiment id(s).' + \
+    parser.add_argument('--experiment_ids', nargs='+', type=str, required=True, help='Compute the results of the specified experiment id(s).' + \
         'stage=1: {{base_with_params}} {{random_with_params}} {{omp_with_params}} {{base_wo_params}} {{random_wo_params}} {{omp_wo_params}}' + \
         'stage=2: {{no_normalization}} {{normalize_D}} {{normalize_weights}} {{normalize_D_and_weights}}' + \
-        'stage=3: {{train-dev_subset}} {{train-dev_train-dev_subset}} {{train-train-dev_subset}}')
+        'stage=3: {{train-dev_subset}} {{train-dev_train-dev_subset}} {{train-train-dev_subset}}' + \
+        'stage=5: {{base_with_params}} {{random_with_params}} {{omp_with_params}} [ensemble={{id}}] [similarity={{id}}] [kmeans={{id}}]')
     parser.add_argument('--dataset_name', nargs='?', type=str, required=True, help='Specify the dataset name. TODO: read it from models dir directly.')
     parser.add_argument('--results_dir', nargs='?', type=str, default=DEFAULT_RESULTS_DIR, help='The output directory of the results.')
     parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
@@ -159,7 +160,7 @@ if __name__ == "__main__":
             raise ValueError('In the case of stage 1, the number of specified experiment ids must be 6.')
 
         # Retreive the extracted forest sizes number used in order to have a base forest axis as long as necessary
-        extracted_forest_sizes_number = retreive_extracted_forest_sizes_number(args.models_dir, args.experiment_ids[1])
+        extracted_forest_sizes_number = retreive_extracted_forest_sizes_number(args.models_dir, int(args.experiment_ids[1]))
 
         # Experiments that used the best hyperparameters found for this dataset
 
@@ -167,18 +168,18 @@ if __name__ == "__main__":
         logger.info('Loading base_with_params experiment scores...')
         base_with_params_train_scores, base_with_params_dev_scores, base_with_params_test_scores, \
             base_with_params_experiment_score_metric = \
-            extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, args.experiment_ids[0],
+            extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, int(args.experiment_ids[0]),
             extracted_forest_sizes_number)
         # random_with_params
         logger.info('Loading random_with_params experiment scores...')
         random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores, \
             with_params_extracted_forest_sizes, random_with_params_experiment_score_metric = \
-            extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[1])
+            extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, int(args.experiment_ids[1]))
         # omp_with_params
         logger.info('Loading omp_with_params experiment scores...')
         omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \
             omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
-                args.models_dir, args.results_dir, args.experiment_ids[2])
+                args.models_dir, args.results_dir, int(args.experiment_ids[2]))
 
         # Experiments that didn't use the best hyperparameters found for this dataset
 
@@ -186,19 +187,19 @@ if __name__ == "__main__":
         logger.info('Loading base_wo_params experiment scores...')
         base_wo_params_train_scores, base_wo_params_dev_scores, base_wo_params_test_scores, \
             base_wo_params_experiment_score_metric = extract_scores_across_seeds_and_forest_size(
-                args.models_dir, args.results_dir, args.experiment_ids[3],
+                args.models_dir, args.results_dir, int(args.experiment_ids[3]),
                 extracted_forest_sizes_number)
         # random_wo_params
         logger.info('Loading random_wo_params experiment scores...')
         random_wo_params_train_scores, random_wo_params_dev_scores, random_wo_params_test_scores, \
             wo_params_extracted_forest_sizes, random_wo_params_experiment_score_metric = \
             extract_scores_across_seeds_and_extracted_forest_sizes(
-                args.models_dir, args.results_dir, args.experiment_ids[4])
+                args.models_dir, args.results_dir, int(args.experiment_ids[4]))
         # base_wo_params
         logger.info('Loading base_wo_params experiment scores...')
         omp_wo_params_train_scores, omp_wo_params_dev_scores, omp_wo_params_test_scores, _, \
             omp_wo_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
-                args.models_dir, args.results_dir, args.experiment_ids[5])
+                args.models_dir, args.results_dir, int(args.experiment_ids[5]))
 
         # Sanity check on the metrics retreived
         if not (base_with_params_experiment_score_metric == random_with_params_experiment_score_metric ==
@@ -243,25 +244,25 @@ if __name__ == "__main__":
         logger.info('Loading no_normalization experiment scores...')
         _, _, no_normalization_test_scores, extracted_forest_sizes, no_normalization_experiment_score_metric = \
             extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
-            args.experiment_ids[0])
+            int(args.experiment_ids[0]))
         # normalize_D
         logger.info('Loading normalize_D experiment scores...')
         _, _, normalize_D_test_scores, _, normalize_D_experiment_score_metric = \
             extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
-            args.experiment_ids[1])
+            int(args.experiment_ids[1]))
         # normalize_weights
         logger.info('Loading normalize_weights experiment scores...')
         _, _, normalize_weights_test_scores, _, normalize_weights_experiment_score_metric = \
             extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
-            args.experiment_ids[2])
+            int(args.experiment_ids[2]))
         # normalize_D_and_weights
         logger.info('Loading normalize_D_and_weights experiment scores...')
         _, _, normalize_D_and_weights_test_scores, _, normalize_D_and_weights_experiment_score_metric = \
             extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
-            args.experiment_ids[3])
+            int(args.experiment_ids[3]))
 
         # Sanity check on the metrics retreived
         if not (no_normalization_experiment_score_metric == normalize_D_experiment_score_metric
@@ -290,21 +291,21 @@ if __name__ == "__main__":
         train_dev_subset_train_scores, train_dev_subset_dev_scores, train_dev_subset_test_scores, \
             extracted_forest_sizes, train_dev_subset_experiment_score_metric = \
             extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
-            args.experiment_ids[0])
+            int(args.experiment_ids[0]))
         # train-dev_train-dev_subset
         logger.info('Loading train-dev_train-dev_subset experiment scores...')
         train_dev_train_dev_subset_train_scores, train_dev_train_dev_subset_dev_scores, train_dev_train_dev_subset_test_scores, \
             _, train_dev_train_dev_subset_experiment_score_metric = \
             extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
-            args.experiment_ids[1])
+            int(args.experiment_ids[1]))
         # train-train-dev_subset
         logger.info('Loading train-train-dev_subset experiment scores...')
         train_train_dev_subset_train_scores, train_train_dev_subset_dev_scores, train_train_dev_subset_test_scores, \
             _, train_train_dev_subset_experiment_score_metric = \
             extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir,
-            args.experiment_ids[2])
+            int(args.experiment_ids[2]))
 
         # Sanity check on the metrics retreived
         if not (train_dev_subset_experiment_score_metric == train_dev_train_dev_subset_experiment_score_metric
@@ -349,13 +350,13 @@ if __name__ == "__main__":
         logger.info('Loading base_with_params experiment scores...')
         base_with_params_train_scores, base_with_params_dev_scores, base_with_params_test_scores, \
             base_with_params_experiment_score_metric = \
-            extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, args.experiment_ids[0],
+            extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, int(args.experiment_ids[0]),
             extracted_forest_sizes_number)
         # random_with_params
         logger.info('Loading random_with_params experiment scores...')
         random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores, \
             with_params_extracted_forest_sizes, random_with_params_experiment_score_metric = \
-            extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[1])
+            extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, int(args.experiment_ids[1]))
         # omp_with_params
         logger.info('Loading omp_with_params experiment scores...')
         """omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \
@@ -363,12 +364,12 @@ if __name__ == "__main__":
             args.models_dir, args.results_dir, args.experiment_ids[2])"""
         omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \
             omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
-            args.models_dir, args.results_dir, args.experiment_ids[2])
+            args.models_dir, args.results_dir, int(args.experiment_ids[2]))
         #omp_with_params_without_weights
         logger.info('Loading omp_with_params experiment scores...')
         omp_with_params_without_weights_train_scores, omp_with_params_without_weights_dev_scores, omp_with_params_without_weights_test_scores, _, \
             omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
-            args.models_dir, args.results_dir, args.experiment_ids[2], weights=False)
+            args.models_dir, args.results_dir, int(args.experiment_ids[2]), weights=False)
 
         """# base_with_params
         logger.info('Loading base_with_params experiment scores 2...')
@@ -402,57 +403,63 @@ if __name__ == "__main__":
             title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name))
     elif args.stage == 5:
         # Retreive the extracted forest sizes number used in order to have a base forest axis as long as necessary
-        extracted_forest_sizes_number = retreive_extracted_forest_sizes_number(args.models_dir, args.experiment_ids[1])
+        extracted_forest_sizes_number = retreive_extracted_forest_sizes_number(args.models_dir, int(args.experiment_ids[1]))
+        all_labels = list()
+        all_scores = list()
         # base_with_params
         logger.info('Loading base_with_params experiment scores...')
         base_with_params_train_scores, base_with_params_dev_scores, base_with_params_test_scores, \
             base_with_params_experiment_score_metric = \
-            extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, args.experiment_ids[0],
+            extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, int(args.experiment_ids[0]),
             extracted_forest_sizes_number)
         # random_with_params
         logger.info('Loading random_with_params experiment scores...')
         random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores, \
             with_params_extracted_forest_sizes, random_with_params_experiment_score_metric = \
-            extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[1])
+            extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, int(args.experiment_ids[1]))
         # omp_with_params
         logger.info('Loading omp_with_params experiment scores...')
         omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \
             omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
-            args.models_dir, args.results_dir, args.experiment_ids[2])
+            args.models_dir, args.results_dir, int(args.experiment_ids[2]))
         #omp_with_params_without_weights
         logger.info('Loading omp_with_params experiment scores...')
         omp_with_params_without_weights_train_scores, omp_with_params_without_weights_dev_scores, omp_with_params_without_weights_test_scores, _, \
             omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
-            args.models_dir, args.results_dir, args.experiment_ids[2], weights=False)
-        # kmeans_with_params
-        logger.info('Loading kmeans_with_params experiment scores...')
-        kmeans_with_params_train_scores, kmeans_with_params_dev_scores, kmeans_with_params_test_scores, _, \
-            kmeans_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
-            args.models_dir, args.results_dir, args.experiment_ids[3])
-        # similarity_with_params
-        logger.info('Loading similarity_with_params experiment scores...')
-        similarity_with_params_train_scores, similarity_with_params_dev_scores, similarity_with_params_test_scores, _, \
-            similarity_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes(
-            args.models_dir, args.results_dir, args.experiment_ids[4])
+            args.models_dir, args.results_dir, int(args.experiment_ids[2]), weights=False)
+
+        all_labels = ['base', 'random', 'omp', 'omp_without_weights']
+        all_scores = [base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores,
+            omp_with_params_without_weights_test_scores]
+
+        for i in range(3, len(args.experiment_ids)):
+            if 'kmeans' in args.experiment_ids[i]:
+                label = 'kmeans'
+            elif 'similarity' in args.experiment_ids[i]:
+                label = 'similarity'
+            elif 'ensemble' in args.experiment_ids[i]:
+                label = 'ensemble'
+            else:
+                logger.error('Invalid experiment id: {}'.format(args.experiment_ids[i]))
+                continue
 
-        # Sanity check on the metrics retreived
-        if not (base_with_params_experiment_score_metric == random_with_params_experiment_score_metric
-            == omp_with_params_experiment_score_metric == kmeans_with_params_experiment_score_metric):
-            raise ValueError('Score metrics of all experiments must be the same.')
-        experiments_score_metric = base_with_params_experiment_score_metric
+            logger.info(f'Loading {label} experiment scores...')
+            _, _, current_test_scores, _, _ = extract_scores_across_seeds_and_extracted_forest_sizes(
+                args.models_dir, args.results_dir, int(args.experiment_ids[i].split('=')[1]))
+            all_labels.append(label)
+            all_scores.append(current_test_scores)
 
         output_path = os.path.join(args.results_dir, args.dataset_name, 'stage5')
         pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
 
         Plotter.plot_stage2_losses(
-            file_path=output_path + os.sep + 'losses.png',
-            all_experiment_scores=[base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores,
-                omp_with_params_without_weights_test_scores, kmeans_with_params_test_scores, similarity_with_params_test_scores],
-            all_labels=['base', 'random', 'omp', 'omp_without_weights', 'kmeans', 'similarity'],
+            file_path=output_path + os.sep + f"losses_{'-'.join(all_labels)}.png",
+            all_experiment_scores=all_scores,
+            all_labels=all_labels,
             x_value=with_params_extracted_forest_sizes,
             xlabel='Number of trees extracted',
-            ylabel=experiments_score_metric,
+            ylabel=base_with_params_experiment_score_metric,
             title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name))
     else:
         raise ValueError('This stage number is not supported yet, but it will be!')
diff --git a/code/train.py b/code/train.py
index 1d75e98b9044165abb075a346761a910d8479a83..b7ea2a77ab08cac3e544c12e2d0c3036ae83f379 100644
--- a/code/train.py
+++ b/code/train.py
@@ -2,6 +2,7 @@ from bolsonaro.data.dataset_parameters import DatasetParameters
 from bolsonaro.data.dataset_loader import DatasetLoader
 from bolsonaro.models.model_factory import ModelFactory
 from bolsonaro.models.model_parameters import ModelParameters
+from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor
 from bolsonaro.trainer import Trainer
 from bolsonaro.utils import resolve_experiment_id, tqdm_joblib
 from bolsonaro import LOG_PATH
@@ -53,10 +54,15 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb
 
     trainer = Trainer(dataset)
 
+    if parameters['extraction_strategy'] == 'ensemble':
+        library = EnsembleSelectionForestRegressor.generate_library(dataset.X_train, dataset.y_train, random_state=seed)
+    else:
+        library = None
+
     if parameters['extraction_strategy'] != 'none':
         with tqdm_joblib(tqdm(total=len(parameters['extracted_forest_size']), disable=not verbose)) as extracted_forest_size_job_pb:
             Parallel(n_jobs=-1)(delayed(extracted_forest_size_job)(extracted_forest_size_job_pb, parameters['extracted_forest_size'][i],
-                models_dir, seed, parameters, dataset, hyperparameters, experiment_id, trainer)
+                models_dir, seed, parameters, dataset, hyperparameters, experiment_id, trainer, library)
                 for i in range(len(parameters['extracted_forest_size'])))
     else:
         forest_size = hyperparameters['n_estimators']
@@ -88,7 +94,7 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb
         )
         model_parameters.save(sub_models_dir, experiment_id)
 
-        model = ModelFactory.build(dataset.task, model_parameters)
+        model = ModelFactory.build(dataset.task, model_parameters, library=library)
 
         trainer.init(model, subsets_used=parameters['subsets_used'])
         trainer.train(model)
@@ -97,7 +103,7 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb
     seed_job_pb.update(1)
 
 def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_size, models_dir,
-    seed, parameters, dataset, hyperparameters, experiment_id, trainer):
+    seed, parameters, dataset, hyperparameters, experiment_id, trainer, library):
 
     logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_extracted_forest_size{}_ti{}'.format(
         seed, extracted_forest_size, threading.get_ident()))
@@ -132,7 +138,7 @@ def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_siz
     )
     model_parameters.save(sub_models_dir, experiment_id)
 
-    model = ModelFactory.build(dataset.task, model_parameters)
+    model = ModelFactory.build(dataset.task, model_parameters, library=library)
 
     trainer.init(model, subsets_used=parameters['subsets_used'])
     trainer.train(model)
@@ -202,7 +208,7 @@ if __name__ == "__main__":
     parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters if there exist.')
     parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line in a file. Args: {{stage_num}} {{name}}')
     parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of job used during the parallelisation across seeds.')
-    parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none, similarity, kmeans.')
+    parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none, similarity, kmeans, ensemble.')
     parser.add_argument('--overwrite', action='store_true', default=DEFAULT_OVERWRITE, help='Overwrite the experiment id')
 
     args = parser.parse_args()
@@ -213,7 +219,7 @@ if __name__ == "__main__":
     else:
         parameters = args.__dict__
 
-    if parameters['extraction_strategy'] not in ['omp', 'random', 'none', 'similarity', 'kmeans']:
+    if parameters['extraction_strategy'] not in ['omp', 'random', 'none', 'similarity', 'kmeans', 'ensemble']:
         raise ValueError('Specified extraction strategy {} is not supported.'.format(parameters.extraction_strategy))
 
     pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True)
diff --git a/scripts/run_stage5_experiments_ensemble.sh b/scripts/run_stage5_experiments_ensemble.sh
new file mode 100755
index 0000000000000000000000000000000000000000..7387a97abae531c34a47a038ccc964a787eebcfe
--- /dev/null
+++ b/scripts/run_stage5_experiments_ensemble.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+core_number=1
+walltime=5:00
+seeds='1 2 3'
+
+for dataset in california_housing # kin8nm kr-vs-kp spambase steel-plates diabetes diamonds boston california_housing
+do
+    #oarsub -p "(gpu is null)" -l /core=5,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=none --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=30 --experiment_id=1 --models_dir=models/$dataset/stage5_ensemble --subsets_used train+dev,train+dev"
+    #oarsub -p "(gpu is null)" -l /core=5,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=random --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=30 --experiment_id=2 --models_dir=models/$dataset/stage5_ensemble --subsets_used train+dev,train+dev"
+    #oarsub -p "(gpu is null)" -l /core=5,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=omp --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=30 --experiment_id=3 --models_dir=models/$dataset/stage5_ensemble --subsets_used train+dev,train+dev"
+    oarsub -p "(gpu is null)" -l /core=5,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=ensemble --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=30 --experiment_id=4 --models_dir=models/$dataset/stage5_ensemble --subsets_used train+dev,train+dev"
+done
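
Usage sketch (illustrative only, not part of the patch): how the pieces added above are expected to fit together, assuming a ModelParameters instance with extracted_forest_size set and placeholder arrays X_train/y_train, X_val/y_val, X_test/y_test.

    # Build the library of candidate trees on the training set
    library = EnsembleSelectionForestRegressor.generate_library(X_train, y_train, random_state=42)
    # Greedily select extracted_forest_size trees using the validation set
    model = EnsembleSelectionForestRegressor(model_parameters, library=library)
    model.fit(X_train, y_train, X_val, y_val)
    # Score the averaged predictions of the selected trees (MSE by default)
    print(model.score(X_test, y_test))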