diff --git a/TODO.md b/TODO.md
index b94e576024a5294b6eaffaaf1af5003f0e034313..3a7a10fcedae60aed1429b1bfdecaa467ca761ac 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,3 +1 @@
-* Fix model results loading in compute_results.py.
-* Check that omp multiclasses classifier is working as expected.
-* Fix the dataset error of fetcher when job_number > 1.
\ No newline at end of file
+* Check that the OMP multiclass classifier is working as expected.
\ No newline at end of file
diff --git a/code/bolsonaro/models/ensemble_selection_forest_regressor.py b/code/bolsonaro/models/ensemble_selection_forest_regressor.py
new file mode 100644
index 0000000000000000000000000000000000000000..b82e131d296392963e31d85f5c1444fc5cb7fd09
--- /dev/null
+++ b/code/bolsonaro/models/ensemble_selection_forest_regressor.py
@@ -0,0 +1,95 @@
+from sklearn.metrics import mean_squared_error
+from sklearn.base import BaseEstimator
+from sklearn.tree import DecisionTreeRegressor
+from abc import abstractmethod, ABCMeta
+import numpy as np
+from tqdm import tqdm
+
+
+class EnsembleSelectionForestRegressor(BaseEstimator, metaclass=ABCMeta):
+    """
+    'Ensemble selection from libraries of models' by Rich Caruana et al.
+    """
+
+    def __init__(self, models_parameters, library, score_metric=mean_squared_error):
+        self._models_parameters = models_parameters
+        self._library = library
+        self._extracted_forest_size = self._models_parameters.extracted_forest_size
+        self._score_metric = score_metric
+
+    @property
+    def models_parameters(self):
+        return self._models_parameters
+
+    @property
+    def library(self):
+        return self._library
+
+    def fit(self, X_train, y_train, X_val, y_val):
+        # Score every tree of the library on the validation set.
+        scores_list = list()
+        for estimator in self._library:
+            val_score = self._score_metric(estimator.predict(X_val), y_val)
+            scores_list.append(val_score)
+
+        # Seed the ensemble with the best tree: the score metric is an error
+        # (mean squared error by default), so the lowest value is the best.
+        class_list = list(self._library)
+        m = np.argmin(np.asarray(scores_list))
+        self._ensemble_selected = [class_list[m]]
+        temp_pred = class_list[m].predict(X_val)
+        del class_list[m]
+        # Then greedily add the tree that most improves the averaged prediction.
+        for k in range(self._extracted_forest_size - 1):
+            candidate_index = 0
+            best_score = float('inf')
+            for j in range(len(class_list)):
+                temp_pred = np.vstack((temp_pred, class_list[j].predict(X_val)))
+                temp_mean = np.mean(temp_pred, axis=0)
+                temp_score = self._score_metric(temp_mean, y_val)
+                if temp_score < best_score:
+                    candidate_index = j
+                    best_score = temp_score
+                temp_pred = np.delete(temp_pred, -1, 0)
+            self._ensemble_selected.append(class_list[candidate_index])
+            temp_pred = np.vstack((temp_pred, class_list[candidate_index].predict(X_val)))
+            del class_list[candidate_index]
+
+    def score(self, X, y):
+        predictions = self.predict_base_estimator(X)
+        return self._score_metric(predictions, y)
+
+    def predict_base_estimator(self, X):
+        # Average the predictions of the selected trees.
+        predictions = list()
+        for tree in self._ensemble_selected:
+            predictions.append(tree.predict(X))
+        mean_predictions = np.mean(np.array(predictions), axis=0)
+        return mean_predictions
+
+    @staticmethod
+    def generate_library(X_train, y_train, random_state=None):
+        criterion_arr = ["mse"]#, "friedman_mse", "mae"]
+        splitter_arr = ["best"]#, "random"]
+        depth_arr = [i for i in range(5, 20, 1)]
+        min_samples_split_arr = [i for i in range(2, 20, 1)]
+        min_samples_leaf_arr = [i for i in range(2, 20, 1)]
+        max_features_arr = ["sqrt"]#["auto", "sqrt", "log2"]
+
+        library = list()
+        with tqdm(total=len(criterion_arr) * len(splitter_arr) * \
+            len(depth_arr) * len(min_samples_split_arr) * len(min_samples_leaf_arr) * \
+            len(max_features_arr)) as bar:
+            bar.set_description('Generating library')
+            for criterion in criterion_arr:
+                for splitter in splitter_arr:
+                    for depth in depth_arr:
+                        for min_samples_split in min_samples_split_arr:
+                            for min_samples_leaf in min_samples_leaf_arr:
+                                for max_features in max_features_arr:
+                                    t = DecisionTreeRegressor(criterion=criterion, splitter=splitter, max_depth=depth, min_samples_split=min_samples_split,
+                                        min_samples_leaf=min_samples_leaf, max_features=max_features, random_state=random_state)
+                                    t.fit(X_train, y_train)
+                                    library.append(t)
+                                    bar.update(1)
+        return library
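Usage note (not part of the patch): a minimal sketch of how the new class is driven, assuming the repo is importable; the tiny Params stand-in below is illustrative only and merely mimics the extracted_forest_size attribute that the real ModelParameters object carries.

    # Build a library of decision trees, then greedily select a small ensemble.
    from sklearn.datasets import load_diabetes
    from sklearn.model_selection import train_test_split
    from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor

    X, y = load_diabetes(return_X_y=True)
    X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)

    # The default grid builds a few thousand trees, so this takes a while.
    library = EnsembleSelectionForestRegressor.generate_library(X_train, y_train, random_state=0)

    class Params:  # illustrative stand-in for ModelParameters
        extracted_forest_size = 10

    model = EnsembleSelectionForestRegressor(Params(), library)
    model.fit(X_train, y_train, X_val, y_val)  # greedy selection is scored on the validation split
    print(model.score(X_val, y_val))           # mean squared error of the averaged selection

Note that fit() takes a separate validation split on purpose: the greedy selection is scored on held-out data, not on the data the library was trained on.

diff --git a/code/bolsonaro/models/model_factory.py b/code/bolsonaro/models/model_factory.py index bbda6cae89d218c7831780f71b9fc6a7bc022d54..335816b1dd33d28175f4865da2fddbbf73b8027d 100644 --- a/code/bolsonaro/models/model_factory.py +++ b/code/bolsonaro/models/model_factory.py @@ -3,6 +3,7 @@ from bolsonaro.models.omp_forest_regressor import OmpForestRegressor from bolsonaro.models.model_parameters import ModelParameters from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor +from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor from bolsonaro.data.task import Task from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier @@ -13,7 +14,7 @@ import pickle class ModelFactory(object): @staticmethod - def build(task, model_parameters): + def build(task, model_parameters, library=None): if task not in [Task.BINARYCLASSIFICATION, Task.REGRESSION, Task.MULTICLASSIFICATION]: raise ValueError("Unsupported task '{}'".format(task)) @@ -21,10 +22,10 @@ class ModelFactory(object): if model_parameters.extraction_strategy == 'omp': return OmpForestBinaryClassifier(model_parameters) elif model_parameters.extraction_strategy == 'random': - return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size, + return RandomForestClassifier(**model_parameters.hyperparameters, random_state=model_parameters.seed) elif model_parameters.extraction_strategy == 'none': - return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'], + return RandomForestClassifier(**model_parameters.hyperparameters, random_state=model_parameters.seed) else: raise ValueError('Invalid extraction strategy') @@ -32,14 +33,16 @@ class ModelFactory(object): if model_parameters.extraction_strategy == 'omp': return OmpForestRegressor(model_parameters) elif model_parameters.extraction_strategy == 'random': - return RandomForestRegressor(n_estimators=model_parameters.extracted_forest_size, + return RandomForestRegressor(**model_parameters.hyperparameters, random_state=model_parameters.seed) elif model_parameters.extraction_strategy == 'similarity': return SimilarityForestRegressor(model_parameters) elif model_parameters.extraction_strategy == 'kmeans': return KMeansForestRegressor(model_parameters) + elif model_parameters.extraction_strategy == 'ensemble': + return EnsembleSelectionForestRegressor(model_parameters, library=library) elif model_parameters.extraction_strategy == 'none': - return RandomForestRegressor(n_estimators=model_parameters.hyperparameters['n_estimators'], + return RandomForestRegressor(**model_parameters.hyperparameters, random_state=model_parameters.seed) else: raise ValueError('Invalid extraction strategy') @@ -47,10 +50,10 @@ class ModelFactory(object): if model_parameters.extraction_strategy == 'omp': return OmpForestMulticlassClassifier(model_parameters) elif model_parameters.extraction_strategy == 'random': -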
return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size, + return RandomForestClassifier(**model_parameters.hyperparameters, random_state=model_parameters.seed) elif model_parameters.extraction_strategy == 'none': - return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'], + return RandomForestClassifier(**model_parameters.hyperparameters, random_state=model_parameters.seed) else: raise ValueError('Invalid extraction strategy') diff --git a/code/bolsonaro/models/omp_forest.py b/code/bolsonaro/models/omp_forest.py index 5b947d327693020b51c7da778d4855274454de93..68b394775007946cf6259a8e259c12758012da31 100644 --- a/code/bolsonaro/models/omp_forest.py +++ b/code/bolsonaro/models/omp_forest.py @@ -134,16 +134,13 @@ class SingleOmpForest(OmpForest): Make all the base tree predictions :param X: a Forest - :return: a np.array of the predictions of the entire forest + :return: a np.array of the predictions of the trees selected by OMP without applying the weights """ - forest_predictions = self._base_estimator_predictions(X).T + forest_predictions = np.array([tree.predict(X) for tree in self._base_forest_estimator.estimators_]) if self._models_parameters.normalize_D: forest_predictions /= self._forest_norms weights = self._omp.coef_ - omp_trees_indices = np.nonzero(weights)[0] - - select_trees = np.mean(forest_predictions[omp_trees_indices], axis=0) - print(len(omp_trees_indices)) + select_trees = np.mean(forest_predictions[weights != 0], axis=0) return select_trees
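The weights != 0 change above swaps index-based selection for a boolean mask over the OMP coefficients; a toy sketch with invented numbers shows the behaviour:

    import numpy as np

    # Rows are per-tree predictions for four samples; OMP kept trees 0 and 2.
    forest_predictions = np.array([[1.0, 2.0, 3.0, 4.0],
                                   [9.0, 9.0, 9.0, 9.0],
                                   [3.0, 4.0, 5.0, 6.0]])
    weights = np.array([0.7, 0.0, 0.3])

    # The boolean mask keeps exactly the rows with a non-zero OMP coefficient;
    # the unweighted prediction is then their plain mean.
    select_trees = np.mean(forest_predictions[weights != 0], axis=0)
    print(select_trees)  # [2. 3. 4. 5.]

diff --git a/code/bolsonaro/models/omp_forest_classifier.py b/code/bolsonaro/models/omp_forest_classifier.py index a51405a6a3278bb86dd52d011b599175bbfc7482..3051fad09c04c34bab5f7035f7392c09313c7b30 100644 --- a/code/bolsonaro/models/omp_forest_classifier.py +++ b/code/bolsonaro/models/omp_forest_classifier.py @@ -40,9 +40,7 @@ class OmpForestBinaryClassifier(SingleOmpForest): forest_predictions /= self._forest_norms weights = self._omp.coef_ - omp_trees_indices = np.nonzero(weights) - - omp_trees_predictions = forest_predictions[omp_trees_indices].T[1] + omp_trees_predictions = forest_predictions[weights != 0].T[1] # Here forest_pred is the probability of being class 1. diff --git a/code/bolsonaro/trainer.py b/code/bolsonaro/trainer.py index 7070126e2a9a8f449757bdab9381b4bffab99b2d..6fcf0aff551263c363bfa97fcfa31b3ffe8b15b5 100644 --- a/code/bolsonaro/trainer.py +++ b/code/bolsonaro/trainer.py @@ -2,6 +2,8 @@ from bolsonaro.models.model_raw_results import ModelRawResults from bolsonaro.models.omp_forest_regressor import OmpForestRegressor from bolsonaro.models.omp_forest_classifier import OmpForestBinaryClassifier, OmpForestMulticlassClassifier from bolsonaro.models.similarity_forest_regressor import SimilarityForestRegressor +from bolsonaro.models.kmeans_forest_regressor import KMeansForestRegressor +from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor from bolsonaro.error_handling.logger_factory import LoggerFactory from bolsonaro.data.task import Task from . import LOG_PATH @@ -72,20 +74,25 @@ class Trainer(object): else: raise ValueError("Unknown specified subsets_used parameter '{}'".format(model.models_parameters.subsets_used)) - def train(self, model): + def train(self, model, extracted_forest_size=None): """ :param model: An instance of either RandomForestRegressor, RandomForestClassifier, OmpForestRegressor, OmpForestBinaryClassifier, OmpForestMulticlassClassifier.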
:return: """ - self._logger.debug('Training model using train set...') self._begin_time = time.time() if type(model) in [RandomForestRegressor, RandomForestClassifier]: - model.fit( - X=self._X_forest, - y=self._y_forest - ) + if extracted_forest_size is not None: + estimators_index = np.arange(len(model.estimators_)) + np.random.shuffle(estimators_index) + chosen_estimators = estimators_index[:extracted_forest_size] + model.estimators_ = np.array(model.estimators_)[chosen_estimators] + else: + model.fit( + X=self._X_forest, + y=self._y_forest + ) else: model.fit( self._X_forest, self._end_time = time.time() @@ -96,7 +103,7 @@ def __score_func(self, model, X, y_true, weights=True): - if type(model) in [OmpForestRegressor, RandomForestRegressor, SimilarityForestRegressor]: + if type(model) in [OmpForestRegressor, RandomForestRegressor]: if weights: y_pred = model.predict(X) else: @@ -109,12 +116,14 @@ y_pred = model.predict_no_weights(X) if type(model) is OmpForestBinaryClassifier: y_pred = np.sign(y_pred) - y_pred = np.where(y_pred==0, 1, y_pred) + y_pred = np.where(y_pred == 0, 1, y_pred) result = self._classification_score_metric(y_true, y_pred) + elif type(model) in [SimilarityForestRegressor, KMeansForestRegressor, EnsembleSelectionForestRegressor]: + result = model.score(X, y_true) return result def __score_func_base(self, model, X, y_true): - if type(model) == OmpForestRegressor: + if type(model) in [OmpForestRegressor, SimilarityForestRegressor, KMeansForestRegressor, EnsembleSelectionForestRegressor]: y_pred = model.predict_base_estimator(X) result = self._base_regression_score_metric(y_true, y_pred) elif type(model) in [OmpForestBinaryClassifier, OmpForestMulticlassClassifier]: y_pred = model.predict_base_estimator(X) result = self._base_classification_score_metric(y_true, y_pred) @@ -123,7 +132,7 @@ elif type(model) == RandomForestClassifier: y_pred = model.predict(X) result = self._base_classification_score_metric(y_true, y_pred) - elif type(model) in [RandomForestRegressor, SimilarityForestRegressor]: + elif type(model) is RandomForestRegressor: y_pred = model.predict(X) result = self._base_regression_score_metric(y_true, y_pred) return result
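The new extracted_forest_size branch of Trainer.train keeps a random subset of an already fitted forest instead of refitting one per size; a standalone sketch of the same idea (sizes are arbitrary, and it relies on scikit-learn exposing the fitted trees as estimators_):

    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.ensemble import RandomForestRegressor

    X, y = make_regression(n_samples=200, random_state=0)
    forest = RandomForestRegressor(n_estimators=100, random_state=0).fit(X, y)

    # Shuffle the tree indices and keep the first 10; predict() then
    # averages over the remaining estimators only.
    indices = np.arange(len(forest.estimators_))
    np.random.shuffle(indices)
    forest.estimators_ = [forest.estimators_[i] for i in indices[:10]]
    print(len(forest.estimators_))  # 10

diff --git a/code/compute_results.py b/code/compute_results.py index 5f7fac2c7718cf887d3d83a5b3a7eb9cdebfb9d9..96bba0008b3239f4f94310ebded1cfa70d45acb3 100644 --- a/code/compute_results.py +++ b/code/compute_results.py @@ -133,10 +133,11 @@ if __name__ == "__main__": parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--stage', nargs='?', type=int, required=True, help='Specify the stage number among [1, 5].') - parser.add_argument('--experiment_ids', nargs='+', type=int, required=True, help='Compute the results of the specified experiment id(s).' + \ + parser.add_argument('--experiment_ids', nargs='+', type=str, required=True, help='Compute the results of the specified experiment id(s).'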
+ \ 'stage=1: {{base_with_params}} {{random_with_params}} {{omp_with_params}} {{base_wo_params}} {{random_wo_params}} {{omp_wo_params}}' + \ 'stage=2: {{no_normalization}} {{normalize_D}} {{normalize_weights}} {{normalize_D_and_weights}}' + \ - 'stage=3: {{train-dev_subset}} {{train-dev_train-dev_subset}} {{train-train-dev_subset}}') + 'stage=3: {{train-dev_subset}} {{train-dev_train-dev_subset}} {{train-train-dev_subset}}' + \ + 'stage=5: {{base_with_params}} {{random_with_params}} {{omp_with_params}} [ensemble={{id}}] [similarity={{id}}] [kmeans={{id}}]') parser.add_argument('--dataset_name', nargs='?', type=str, required=True, help='Specify the dataset name. TODO: read it from models dir directly.') parser.add_argument('--results_dir', nargs='?', type=str, default=DEFAULT_RESULTS_DIR, help='The output directory of the results.') parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.') @@ -159,7 +160,7 @@ if __name__ == "__main__": raise ValueError('In the case of stage 1, the number of specified experiment ids must be 6.') # Retreive the extracted forest sizes number used in order to have a base forest axis as long as necessary - extracted_forest_sizes_number = retreive_extracted_forest_sizes_number(args.models_dir, args.experiment_ids[1]) + extracted_forest_sizes_number = retreive_extracted_forest_sizes_number(args.models_dir, int(args.experiment_ids[1])) # Experiments that used the best hyperparameters found for this dataset # base_with_params logger.info('Loading base_with_params experiment scores...') base_with_params_train_scores, base_with_params_dev_scores, base_with_params_test_scores, \ base_with_params_experiment_score_metric = \ - extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, args.experiment_ids[0], + extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, int(args.experiment_ids[0]), extracted_forest_sizes_number) # random_with_params logger.info('Loading random_with_params experiment scores...') random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores, \ with_params_extracted_forest_sizes, random_with_params_experiment_score_metric = \ - extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[1]) + extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, int(args.experiment_ids[1])) # omp_with_params logger.info('Loading omp_with_params experiment scores...') omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \ omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes( - args.models_dir, args.results_dir, args.experiment_ids[2]) + args.models_dir, args.results_dir, int(args.experiment_ids[2])) # Experiments that didn't use the best hyperparameters found for this dataset # base_wo_params logger.info('Loading base_wo_params experiment scores...') base_wo_params_train_scores, base_wo_params_dev_scores, base_wo_params_test_scores, \ base_wo_params_experiment_score_metric = extract_scores_across_seeds_and_forest_size( - args.models_dir, args.results_dir, args.experiment_ids[3], + args.models_dir, args.results_dir, int(args.experiment_ids[3]), extracted_forest_sizes_number) # random_wo_params logger.info('Loading random_wo_params experiment scores...') random_wo_params_train_scores,
random_wo_params_dev_scores, random_wo_params_test_scores, \ wo_params_extracted_forest_sizes, random_wo_params_experiment_score_metric = \ extract_scores_across_seeds_and_extracted_forest_sizes( - args.models_dir, args.results_dir, args.experiment_ids[4]) + args.models_dir, args.results_dir, int(args.experiment_ids[4])) # base_wo_params logger.info('Loading base_wo_params experiment scores...') omp_wo_params_train_scores, omp_wo_params_dev_scores, omp_wo_params_test_scores, _, \ omp_wo_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes( - args.models_dir, args.results_dir, args.experiment_ids[5]) + args.models_dir, args.results_dir, int(args.experiment_ids[5])) # Sanity check on the metrics retreived if not (base_with_params_experiment_score_metric == random_with_params_experiment_score_metric == @@ -243,25 +244,25 @@ if __name__ == "__main__": logger.info('Loading no_normalization experiment scores...') _, _, no_normalization_test_scores, extracted_forest_sizes, no_normalization_experiment_score_metric = \ extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, - args.experiment_ids[0]) + int(args.experiment_ids[0])) # normalize_D logger.info('Loading normalize_D experiment scores...') _, _, normalize_D_test_scores, _, normalize_D_experiment_score_metric = \ extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, - args.experiment_ids[1]) + int(args.experiment_ids[1])) # normalize_weights logger.info('Loading normalize_weights experiment scores...') _, _, normalize_weights_test_scores, _, normalize_weights_experiment_score_metric = \ extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, - args.experiment_ids[2]) + int(args.experiment_ids[2])) # normalize_D_and_weights logger.info('Loading normalize_D_and_weights experiment scores...') _, _, normalize_D_and_weights_test_scores, _, normalize_D_and_weights_experiment_score_metric = \ extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, - args.experiment_ids[3]) + int(args.experiment_ids[3])) # Sanity check on the metrics retreived if not (no_normalization_experiment_score_metric == normalize_D_experiment_score_metric @@ -290,21 +291,21 @@ if __name__ == "__main__": train_dev_subset_train_scores, train_dev_subset_dev_scores, train_dev_subset_test_scores, \ extracted_forest_sizes, train_dev_subset_experiment_score_metric = \ extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, - args.experiment_ids[0]) + int(args.experiment_ids[0])) # train-dev_train-dev_subset logger.info('Loading train-dev_train-dev_subset experiment scores...') train_dev_train_dev_subset_train_scores, train_dev_train_dev_subset_dev_scores, train_dev_train_dev_subset_test_scores, \ _, train_dev_train_dev_subset_experiment_score_metric = \ extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, - args.experiment_ids[1]) + int(args.experiment_ids[1])) # train-train-dev_subset logger.info('Loading train-train-dev_subset experiment scores...') train_train_dev_subset_train_scores, train_train_dev_subset_dev_scores, train_train_dev_subset_test_scores, \ _, train_train_dev_subset_experiment_score_metric = \ extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, - args.experiment_ids[2]) + int(args.experiment_ids[2])) # Sanity check on the metrics retreived if not 
(train_dev_subset_experiment_score_metric == train_dev_train_dev_subset_experiment_score_metric @@ -349,13 +350,13 @@ if __name__ == "__main__": logger.info('Loading base_with_params experiment scores...') base_with_params_train_scores, base_with_params_dev_scores, base_with_params_test_scores, \ base_with_params_experiment_score_metric = \ - extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, args.experiment_ids[0], + extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, int(args.experiment_ids[0]), extracted_forest_sizes_number) # random_with_params logger.info('Loading random_with_params experiment scores...') random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores, \ with_params_extracted_forest_sizes, random_with_params_experiment_score_metric = \ - extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[1]) + extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, int(args.experiment_ids[1])) # omp_with_params logger.info('Loading omp_with_params experiment scores...') """omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \ @@ -363,12 +364,12 @@ if __name__ == "__main__": args.models_dir, args.results_dir, args.experiment_ids[2])""" omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \ omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes( - args.models_dir, args.results_dir, args.experiment_ids[2]) + args.models_dir, args.results_dir, int(args.experiment_ids[2])) #omp_with_params_without_weights - logger.info('Loading omp_with_params experiment scores...') + logger.info('Loading omp_no_weights experiment scores...') omp_with_params_without_weights_train_scores, omp_with_params_without_weights_dev_scores, omp_with_params_without_weights_test_scores, _, \ omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes( - args.models_dir, args.results_dir, args.experiment_ids[2], weights=False) + args.models_dir, args.results_dir, int(args.experiment_ids[2]), weights=False) """# base_with_params logger.info('Loading base_with_params experiment scores 2...') @@ -402,47 +403,63 @@ if __name__ == "__main__": title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name)) elif args.stage == 5: # Retreive the extracted forest sizes number used in order to have a base forest axis as long as necessary - extracted_forest_sizes_number = retreive_extracted_forest_sizes_number(args.models_dir, args.experiment_ids[1]) + extracted_forest_sizes_number = retreive_extracted_forest_sizes_number(args.models_dir, int(args.experiment_ids[1])) + all_labels = list() + all_scores = list() # base_with_params logger.info('Loading base_with_params experiment scores...') base_with_params_train_scores, base_with_params_dev_scores, base_with_params_test_scores, \ base_with_params_experiment_score_metric = \ - extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, args.experiment_ids[0], + extract_scores_across_seeds_and_forest_size(args.models_dir, args.results_dir, int(args.experiment_ids[0]), extracted_forest_sizes_number) # random_with_params logger.info('Loading random_with_params experiment scores...') random_with_params_train_scores, random_with_params_dev_scores, random_with_params_test_scores, \ with_params_extracted_forest_sizes, 
random_with_params_experiment_score_metric = \ - extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, args.experiment_ids[1]) + extract_scores_across_seeds_and_extracted_forest_sizes(args.models_dir, args.results_dir, int(args.experiment_ids[1])) # omp_with_params logger.info('Loading omp_with_params experiment scores...') omp_with_params_train_scores, omp_with_params_dev_scores, omp_with_params_test_scores, _, \ omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes( - args.models_dir, args.results_dir, args.experiment_ids[2]) - # omp_with_params - logger.info('Loading kmeans_with_params experiment scores...') - kmeans_with_params_train_scores, kmeans_with_params_dev_scores, kmeans_with_params_test_scores, _, \ - kmeans_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes( - args.models_dir, args.results_dir, args.experiment_ids[3]) - - # Sanity check on the metrics retreived - if not (base_with_params_experiment_score_metric == random_with_params_experiment_score_metric - == omp_with_params_experiment_score_metric == kmeans_with_params_experiment_score_metric): - raise ValueError('Score metrics of all experiments must be the same.') - experiments_score_metric = base_with_params_experiment_score_metric + args.models_dir, args.results_dir, int(args.experiment_ids[2])) + #omp_with_params_without_weights + logger.info('Loading omp_no_weights experiment scores...') + omp_with_params_without_weights_train_scores, omp_with_params_without_weights_dev_scores, omp_with_params_without_weights_test_scores, _, \ + omp_with_params_experiment_score_metric = extract_scores_across_seeds_and_extracted_forest_sizes( + args.models_dir, args.results_dir, int(args.experiment_ids[2]), weights=False) + + all_labels = ['base', 'random', 'omp', 'omp_without_weights'] + all_scores = [base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores, + omp_with_params_without_weights_test_scores] + + for i in range(3, len(args.experiment_ids)): + if 'kmeans' in args.experiment_ids[i]: + label = 'kmeans' + elif 'similarity' in args.experiment_ids[i]: + label = 'similarity' + elif 'ensemble' in args.experiment_ids[i]: + label = 'ensemble' + else: + logger.error(f'Invalid experiment id encountered: {args.experiment_ids[i]}') + continue - output_path = os.path.join(args.results_dir, args.dataset_name, 'stage5_kmeans') + logger.info(f'Loading {label} experiment scores...') + _, _, current_test_scores, _, _ = extract_scores_across_seeds_and_extracted_forest_sizes( + args.models_dir, args.results_dir, int(args.experiment_ids[i].split('=')[1])) + all_labels.append(label) + all_scores.append(current_test_scores) + + output_path = os.path.join(args.results_dir, args.dataset_name, 'stage5') pathlib.Path(output_path).mkdir(parents=True, exist_ok=True) Plotter.plot_stage2_losses( - file_path=output_path + os.sep + 'losses.png', - all_experiment_scores=[base_with_params_test_scores, random_with_params_test_scores, omp_with_params_test_scores, - kmeans_with_params_test_scores], - all_labels=['base', 'random', 'omp', 'kmeans'], + file_path=output_path + os.sep + f"losses_{'-'.join(all_labels)}.png", + all_experiment_scores=all_scores, + all_labels=all_labels, x_value=with_params_extracted_forest_sizes, xlabel='Number of trees extracted', - ylabel=experiments_score_metric, + ylabel=base_with_params_experiment_score_metric, title='Loss values of {}\nusing best params of previous stages'.format(args.dataset_name)) else: raise
ValueError('This stage number is not supported yet, but it will be!') diff --git a/code/train.py b/code/train.py index 1d75e98b9044165abb075a346761a910d8479a83..8e48e14009dff51ed92d7baba7b49760146347a9 100644 --- a/code/train.py +++ b/code/train.py @@ -2,6 +2,7 @@ from bolsonaro.data.dataset_parameters import DatasetParameters from bolsonaro.data.dataset_loader import DatasetLoader from bolsonaro.models.model_factory import ModelFactory from bolsonaro.models.model_parameters import ModelParameters +from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor from bolsonaro.trainer import Trainer from bolsonaro.utils import resolve_experiment_id, tqdm_joblib from bolsonaro import LOG_PATH @@ -9,6 +10,7 @@ from bolsonaro.error_handling.logger_factory import LoggerFactory from dotenv import find_dotenv, load_dotenv import argparse +import copy import json import pathlib import random @@ -53,10 +55,37 @@ def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verb trainer = Trainer(dataset) + if parameters['extraction_strategy'] == 'ensemble': + library = EnsembleSelectionForestRegressor.generate_library(dataset.X_train, dataset.y_train, random_state=seed) + else: + library = None + + if parameters['extraction_strategy'] == 'random': + pretrained_model_parameters = ModelParameters( + extracted_forest_size=parameters['forest_size'], + normalize_D=parameters['normalize_D'], + subsets_used=parameters['subsets_used'], + normalize_weights=parameters['normalize_weights'], + seed=seed, + hyperparameters=hyperparameters, + extraction_strategy=parameters['extraction_strategy'] + ) + pretrained_estimator = ModelFactory.build(dataset.task, pretrained_model_parameters, library=library) + pretrained_trainer = Trainer(dataset) + pretrained_trainer.init(pretrained_estimator, subsets_used=parameters['subsets_used']) + pretrained_estimator.fit( + X=pretrained_trainer._X_forest, + y=pretrained_trainer._y_forest + ) + else: + pretrained_estimator = None + pretrained_model_parameters = None + if parameters['extraction_strategy'] != 'none': with tqdm_joblib(tqdm(total=len(parameters['extracted_forest_size']), disable=not verbose)) as extracted_forest_size_job_pb: Parallel(n_jobs=-1)(delayed(extracted_forest_size_job)(extracted_forest_size_job_pb, parameters['extracted_forest_size'][i], - models_dir, seed, parameters, dataset, hyperparameters, experiment_id, trainer) + models_dir, seed, parameters, dataset, hyperparameters, experiment_id, trainer, library, + pretrained_estimator=pretrained_estimator, pretrained_model_parameters=pretrained_model_parameters) for i in range(len(parameters['extracted_forest_size']))) else: forest_size = hyperparameters['n_estimators'] @@ -88,7 +117,7 @@ ) model_parameters.save(sub_models_dir, experiment_id) - model = ModelFactory.build(dataset.task, model_parameters) + model = ModelFactory.build(dataset.task, model_parameters, library=library) trainer.init(model, subsets_used=parameters['subsets_used']) trainer.train(model) @@ -97,7 +126,8 @@ seed_job_pb.update(1) def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_size, models_dir, - seed, parameters, dataset, hyperparameters, experiment_id, trainer): + seed, parameters, dataset, hyperparameters, experiment_id, trainer, library, + pretrained_estimator=None, pretrained_model_parameters=None): logger
= LoggerFactory.create(LOG_PATH, 'training_seed{}_extracted_forest_size{}_ti{}'.format( seed, extracted_forest_size, threading.get_ident())) @@ -121,21 +151,24 @@ def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_siz pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True) - model_parameters = ModelParameters( - extracted_forest_size=extracted_forest_size, - normalize_D=parameters['normalize_D'], - subsets_used=parameters['subsets_used'], - normalize_weights=parameters['normalize_weights'], - seed=seed, - hyperparameters=hyperparameters, - extraction_strategy=parameters['extraction_strategy'] - ) - model_parameters.save(sub_models_dir, experiment_id) - - model = ModelFactory.build(dataset.task, model_parameters) + if not pretrained_estimator: + model_parameters = ModelParameters( + extracted_forest_size=extracted_forest_size, + normalize_D=parameters['normalize_D'], + subsets_used=parameters['subsets_used'], + normalize_weights=parameters['normalize_weights'], + seed=seed, + hyperparameters=hyperparameters, + extraction_strategy=parameters['extraction_strategy'] + ) + model_parameters.save(sub_models_dir, experiment_id) + model = ModelFactory.build(dataset.task, model_parameters, library=library) + else: + model = copy.deepcopy(pretrained_estimator) + pretrained_model_parameters.save(sub_models_dir, experiment_id) trainer.init(model, subsets_used=parameters['subsets_used']) - trainer.train(model) + trainer.train(model, extracted_forest_size=extracted_forest_size) trainer.compute_results(model, sub_models_dir) """ @@ -202,7 +235,7 @@ if __name__ == "__main__": parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters if there exist.') parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line in a file. Args: {{stage_num}} {{name}}') parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of job used during the parallelisation across seeds.') - parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none, similarity, kmeans.') + parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. 
Either omp, random, none, similarity, kmeans, ensemble.') parser.add_argument('--overwrite', action='store_true', default=DEFAULT_OVERWRITE, help='Overwrite the experiment id') args = parser.parse_args() @@ -213,7 +246,7 @@ if __name__ == "__main__": else: parameters = args.__dict__ - if parameters['extraction_strategy'] not in ['omp', 'random', 'none', 'similarity', 'kmeans']: + if parameters['extraction_strategy'] not in ['omp', 'random', 'none', 'similarity', 'kmeans', 'ensemble']: raise ValueError('Specified extraction strategy {} is not supported.'.format(parameters.extraction_strategy)) pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True) diff --git a/experiments/boston/stage4/none_with_params.json b/experiments/boston/stage4/none_with_params.json index ba056e193480a5888e1e41dac30f5cbcf4cc4870..b0c35cde9b71156dbbcd15c17292317bd99ce228 100644 --- a/experiments/boston/stage4/none_with_params.json +++ b/experiments/boston/stage4/none_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "none", + "overwrite": true, "extracted_forest_size": [ 33, 67, diff --git a/experiments/boston/stage4/omp_with_params.json b/experiments/boston/stage4/omp_with_params.json index 0c393f6eb57537b70b7fac49d903d14e85ab8426..196f0357f3dc26117bc104742bc12d1a5a370982 100644 --- a/experiments/boston/stage4/omp_with_params.json +++ b/experiments/boston/stage4/omp_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "omp", + "overwrite": true, "extracted_forest_size": [ 33, 67, diff --git a/experiments/boston/stage4/random_with_params.json b/experiments/boston/stage4/random_with_params.json index 0258396e63720a5bc32ddcf0884b0f6e93f03a0f..d9c83086386da7ee1de79f142f8ffaefabf9757b 100644 --- a/experiments/boston/stage4/random_with_params.json +++ b/experiments/boston/stage4/random_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "random", + "overwrite": true, "extracted_forest_size": [ 33, 67, diff --git a/experiments/breast_cancer/stage4/none_with_params.json b/experiments/breast_cancer/stage4/none_with_params.json index 4b4d46e79ce17838f0944879be812a314db056ea..03fb21ff9af2428ec39c7ba4f718210ff3806b62 100644 --- a/experiments/breast_cancer/stage4/none_with_params.json +++ b/experiments/breast_cancer/stage4/none_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "none", + "overwrite": true, "extracted_forest_size": [ 33, 67, diff --git a/experiments/breast_cancer/stage4/omp_with_params.json b/experiments/breast_cancer/stage4/omp_with_params.json index 287b55e77bed5521f49792397e37ed2d3c25c81f..28a56abac7a603b39ec78f45a2125b2c6869caa7 100644 --- a/experiments/breast_cancer/stage4/omp_with_params.json +++ b/experiments/breast_cancer/stage4/omp_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "omp", + "overwrite": true, "extracted_forest_size": [ 33, 67, diff --git 
a/experiments/breast_cancer/stage4/random_with_params.json b/experiments/breast_cancer/stage4/random_with_params.json index 34f70d8e4f2a3648226a0fefe330751ad585ea90..597798d60fb344ca8ac3765f38d58d3e66d8df84 100644 --- a/experiments/breast_cancer/stage4/random_with_params.json +++ b/experiments/breast_cancer/stage4/random_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "random", + "overwrite": true, "extracted_forest_size": [ 33, 67, diff --git a/experiments/california_housing/stage4/none_with_params.json b/experiments/california_housing/stage4/none_with_params.json index e3549c1bebe94975ee7b774968e9f98f3d346625..83eb6dd0c9df8827550488e2e9f11ed7a6052a48 100644 --- a/experiments/california_housing/stage4/none_with_params.json +++ b/experiments/california_housing/stage4/none_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "none", + "overwrite": true, "extracted_forest_size": [ 33, 67, diff --git a/experiments/california_housing/stage4/omp_with_params.json b/experiments/california_housing/stage4/omp_with_params.json index e01103a853e04a4775321e7ede4130f4b82aa44f..5ae422948af9317be83afb75a85a6e0ba70220d9 100644 --- a/experiments/california_housing/stage4/omp_with_params.json +++ b/experiments/california_housing/stage4/omp_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "omp", + "overwrite": true, "extracted_forest_size": [ 33, 67, diff --git a/experiments/california_housing/stage4/random_with_params.json b/experiments/california_housing/stage4/random_with_params.json index 85dbd63e39fc8d885fef3d994335601beb584c45..ae6b06b65a0f6dedb9197c6ede2bccc52bc40d93 100644 --- a/experiments/california_housing/stage4/random_with_params.json +++ b/experiments/california_housing/stage4/random_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "random", + "overwrite": true, "extracted_forest_size": [ 33, 67, diff --git a/experiments/diabetes/stage4/none_with_params.json b/experiments/diabetes/stage4/none_with_params.json index 4f89e2e6edb5a8af7dfc02cf42067e386c6d6c11..fc22d5649b4647a4ff5d900bb5f19c83df2386f7 100644 --- a/experiments/diabetes/stage4/none_with_params.json +++ b/experiments/diabetes/stage4/none_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "none", + "overwrite": true, "extracted_forest_size": [ 4, 7, diff --git a/experiments/diabetes/stage4/omp_with_params.json b/experiments/diabetes/stage4/omp_with_params.json index c0a85a56e816c536ddf870a8dc19955df383cf5c..1cd7c3ba25dd4d921a8673e31286c44e367084c6 100644 --- a/experiments/diabetes/stage4/omp_with_params.json +++ b/experiments/diabetes/stage4/omp_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "omp", + "overwrite": true, "extracted_forest_size": [ 4, 7, diff --git 
a/experiments/diabetes/stage4/random_with_params.json b/experiments/diabetes/stage4/random_with_params.json index 6684ed251b8ecc4ef36c460d6f4b7568fb80b4a2..84bd44ea6927913c59c90d05e3bd36455168d7fe 100644 --- a/experiments/diabetes/stage4/random_with_params.json +++ b/experiments/diabetes/stage4/random_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "random", + "overwrite": true, "extracted_forest_size": [ 4, 7, diff --git a/experiments/diamonds/stage4/none_with_params.json b/experiments/diamonds/stage4/none_with_params.json index fedf40d3bc0f2c8a15ab352f656181031f2a1ac2..53df0124fd7cc4429063d6ef7c4a382604c1dfc1 100644 --- a/experiments/diamonds/stage4/none_with_params.json +++ b/experiments/diamonds/stage4/none_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "none", + "overwrite": true, "extracted_forest_size": [ 14, 29, diff --git a/experiments/diamonds/stage4/omp_with_params.json b/experiments/diamonds/stage4/omp_with_params.json index 52d8cf794c2d8972c64a052a854f89e50e3ef8fd..002ca265903adf0a7b6d4a5e91415ca85c1eef09 100644 --- a/experiments/diamonds/stage4/omp_with_params.json +++ b/experiments/diamonds/stage4/omp_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "omp", + "overwrite": true, "extracted_forest_size": [ 14, 29, diff --git a/experiments/diamonds/stage4/random_with_params.json b/experiments/diamonds/stage4/random_with_params.json index 9089cbb45ac3b8684cebc5498f77df8551067cb7..c83a685bdd40d959c3b9323e6bff35dd9062242c 100644 --- a/experiments/diamonds/stage4/random_with_params.json +++ b/experiments/diamonds/stage4/random_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "random", + "overwrite": true, "extracted_forest_size": [ 14, 29, diff --git a/experiments/digits/stage4/none_with_params.json b/experiments/digits/stage4/none_with_params.json index 599a191e530fe89d207cc865467b6d9d0266703b..76846a396a8c63d39dff3fd311e755bc58caa6f4 100644 --- a/experiments/digits/stage4/none_with_params.json +++ b/experiments/digits/stage4/none_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "none", + "overwrite": true, "extracted_forest_size": [ 33, 67, diff --git a/experiments/digits/stage4/omp_with_params.json b/experiments/digits/stage4/omp_with_params.json index 45d968b70746ffe967924dbf3f1bc6e4f7b1a174..db869b37dbfc39afbc0979ff4680e3ea77d7798e 100644 --- a/experiments/digits/stage4/omp_with_params.json +++ b/experiments/digits/stage4/omp_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "omp", + "overwrite": true, "extracted_forest_size": [ 33, 67, diff --git a/experiments/digits/stage4/random_with_params.json b/experiments/digits/stage4/random_with_params.json index 
e96b311ccbd5562e2e5f69a3bfe13cf9e361cd15..48b67e40fbc139a42e52b7b79b612bcf2a93cde4 100644 --- a/experiments/digits/stage4/random_with_params.json +++ b/experiments/digits/stage4/random_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "random", + "overwrite": true, "extracted_forest_size": [ 33, 67, diff --git a/experiments/iris/stage4/none_with_params.json b/experiments/iris/stage4/none_with_params.json index 2df5617ab81106f92221d1693631e734ccfd27b5..fbe10241ba87651a44f28f65a5b12b30a5924a88 100644 --- a/experiments/iris/stage4/none_with_params.json +++ b/experiments/iris/stage4/none_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "none", + "overwrite": true, "extracted_forest_size": [ 33, 67, diff --git a/experiments/iris/stage4/omp_with_params.json b/experiments/iris/stage4/omp_with_params.json index ffddfac5bef952c2f1f79dad260952ec317505a8..ad1e34385529c56a7e15d42cf8c9595156ddc35b 100644 --- a/experiments/iris/stage4/omp_with_params.json +++ b/experiments/iris/stage4/omp_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "omp", + "overwrite": true, "extracted_forest_size": [ 33, 67, diff --git a/experiments/iris/stage4/random_with_params.json b/experiments/iris/stage4/random_with_params.json index c50daa86d3880ffd0cdc022de37060d32319cc4b..31acb3927fa9e6a68476ce1c96406a885f9c4f60 100644 --- a/experiments/iris/stage4/random_with_params.json +++ b/experiments/iris/stage4/random_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "random", + "overwrite": true, "extracted_forest_size": [ 33, 67, diff --git a/experiments/kin8nm/stage1/params.json b/experiments/kin8nm/stage1/params.json new file mode 100644 index 0000000000000000000000000000000000000000..aff3e00221518c26e5cb8860c5928acc5aaaa774 --- /dev/null +++ b/experiments/kin8nm/stage1/params.json @@ -0,0 +1,28 @@ +{ + "scorer": "neg_mean_squared_error", + "best_score_train": -0.022941874369141916, + "best_score_test": -0.020444215502079355, + "best_parameters": { + "min_samples_leaf": 1, + "n_estimators": 1000, + "max_depth": 20, + "max_features": "auto" + }, + "random_seed": [ + 486, + 138, + 137, + 54, + 1132, + 1528, + 1857, + 1528, + 13, + 1826, + 647, + 1704, + 469, + 101, + 804 + ] +} \ No newline at end of file diff --git a/experiments/kin8nm/stage4/none_with_params.json b/experiments/kin8nm/stage4/none_with_params.json index 95f0f54bf549c1433fa61adc0e22677220b8cbbc..db480f954910c3ac6b3f353a1c12459dd3b674ab 100644 --- a/experiments/kin8nm/stage4/none_with_params.json +++ b/experiments/kin8nm/stage4/none_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "none", + "overwrite": true, "extracted_forest_size": [ 33, 67, diff --git a/experiments/kin8nm/stage4/omp_with_params.json b/experiments/kin8nm/stage4/omp_with_params.json index 03136cb595d5ca173af14d8cebaf5b652dd56691..d365b23f2a8310f5fc9c5194c88abecaf07d0de5 100644 --- 
a/experiments/kin8nm/stage4/omp_with_params.json +++ b/experiments/kin8nm/stage4/omp_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "omp", + "overwrite": true, "extracted_forest_size": [ 33, 67, diff --git a/experiments/kin8nm/stage4/random_with_params.json b/experiments/kin8nm/stage4/random_with_params.json index 18a553f400a587b85aaacf89ecd61df6a39efb74..15034117953dfee37b74ed6030ae8cd3b042c832 100644 --- a/experiments/kin8nm/stage4/random_with_params.json +++ b/experiments/kin8nm/stage4/random_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "random", + "overwrite": true, "extracted_forest_size": [ 33, 67, diff --git a/experiments/kr-vs-kp/stage4/none_with_params.json b/experiments/kr-vs-kp/stage4/none_with_params.json index cd0bbd7ef063a4bdf94659eb5957851d2c03a3bc..c3ed6c0c88e8380bd9d1cc777074c8fd94d9ac03 100644 --- a/experiments/kr-vs-kp/stage4/none_with_params.json +++ b/experiments/kr-vs-kp/stage4/none_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "none", + "overwrite": true, "extracted_forest_size": [ 33, 67, diff --git a/experiments/kr-vs-kp/stage4/omp_with_params.json b/experiments/kr-vs-kp/stage4/omp_with_params.json index ac4d8c6cbf437224a1477c1c507cf1c9d93bba67..1be3d0d8989f196827fd607b85c7dcbfe6a1c6c7 100644 --- a/experiments/kr-vs-kp/stage4/omp_with_params.json +++ b/experiments/kr-vs-kp/stage4/omp_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "omp", + "overwrite": true, "extracted_forest_size": [ 33, 67, diff --git a/experiments/kr-vs-kp/stage4/random_with_params.json b/experiments/kr-vs-kp/stage4/random_with_params.json index f471871756111a14772935b93ae0de9d4500ceeb..08fec7e1b62961b48593706efe7593c56622151c 100644 --- a/experiments/kr-vs-kp/stage4/random_with_params.json +++ b/experiments/kr-vs-kp/stage4/random_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "random", + "overwrite": true, "extracted_forest_size": [ 33, 67, diff --git a/experiments/olivetti_faces/stage4/none_with_params.json b/experiments/olivetti_faces/stage4/none_with_params.json index 3fca3b0e932ec0e544fa30ed34655668c9613c8f..e7faf2ba86f8da193b094868d6612db03a5438e9 100644 --- a/experiments/olivetti_faces/stage4/none_with_params.json +++ b/experiments/olivetti_faces/stage4/none_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "none", + "overwrite": true, "extracted_forest_size": [ 33, 67, diff --git a/experiments/olivetti_faces/stage4/omp_with_params.json b/experiments/olivetti_faces/stage4/omp_with_params.json index 136133d47e627cf3b3122ba4e8bf3e55e0d8bd26..7f576fffd0c784097b62d65eaba78f68d09db04d 100644 --- a/experiments/olivetti_faces/stage4/omp_with_params.json +++ b/experiments/olivetti_faces/stage4/omp_with_params.json @@ -17,9 +17,7 @@ 
2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "omp", + "overwrite": true, "extracted_forest_size": [ 33, 67, diff --git a/experiments/olivetti_faces/stage4/random_with_params.json b/experiments/olivetti_faces/stage4/random_with_params.json index 2a2ab9aab2a566494b2bebf82af433e734691049..7ced47f845b970c8a04bb997f77d2aa784815508 100644 --- a/experiments/olivetti_faces/stage4/random_with_params.json +++ b/experiments/olivetti_faces/stage4/random_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "random", + "overwrite": true, "extracted_forest_size": [ 33, 67, diff --git a/experiments/steel-plates/stage4/none_with_params.json b/experiments/steel-plates/stage4/none_with_params.json index 8f4b9a01ee10ff46ea4d7a23cf5a9525abfb734d..33db06f965e5e03ceec17370530094532a87f6af 100644 --- a/experiments/steel-plates/stage4/none_with_params.json +++ b/experiments/steel-plates/stage4/none_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "none", + "overwrite": true, "extracted_forest_size": [ 33, 67, diff --git a/experiments/steel-plates/stage4/omp_with_params.json b/experiments/steel-plates/stage4/omp_with_params.json index ffaa72fc741b2b6b841bde18786b677ba37766a7..70882f749e0d3814ec32fe39ac26a8ad98d8a930 100644 --- a/experiments/steel-plates/stage4/omp_with_params.json +++ b/experiments/steel-plates/stage4/omp_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "omp", + "overwrite": true, "extracted_forest_size": [ 33, 67, diff --git a/experiments/steel-plates/stage4/random_with_params.json b/experiments/steel-plates/stage4/random_with_params.json index a86d432a4156624749dca5f90d2d016fb4c45549..4838e4f26d35366a0fa8a1eee71b8a165800222e 100644 --- a/experiments/steel-plates/stage4/random_with_params.json +++ b/experiments/steel-plates/stage4/random_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "random", + "overwrite": true, "extracted_forest_size": [ 33, 67, diff --git a/experiments/wine/stage4/none_with_params.json b/experiments/wine/stage4/none_with_params.json index 2621fec400b6069c5d253e99505c96c6f9bd102e..e5a882f07f4190ec7306e956a99322f423d89fd4 100644 --- a/experiments/wine/stage4/none_with_params.json +++ b/experiments/wine/stage4/none_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "none", + "overwrite": true, "extracted_forest_size": [ 33, 67, diff --git a/experiments/wine/stage4/omp_with_params.json b/experiments/wine/stage4/omp_with_params.json index a5ff2576ac94b36a51e3501bf83cb47b0a88a483..94f22fc1c25568797128035dbd5539cf780b3f75 100644 --- a/experiments/wine/stage4/omp_with_params.json +++ b/experiments/wine/stage4/omp_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], 
"job_number": -1, "extraction_strategy": "omp", + "overwrite": true, "extracted_forest_size": [ 33, 67, diff --git a/experiments/wine/stage4/random_with_params.json b/experiments/wine/stage4/random_with_params.json index f344a1059e6ce3bfd458efee55b3809d93788458..43442b15fc8c0188a2c2410768f546093d4dee1a 100644 --- a/experiments/wine/stage4/random_with_params.json +++ b/experiments/wine/stage4/random_with_params.json @@ -17,9 +17,7 @@ 2, 3, 4, - 5, - 6, - 7 + 5 ], "subsets_used": "train+dev,train+dev", "normalize_weights": false, @@ -31,6 +29,7 @@ ], "job_number": -1, "extraction_strategy": "random", + "overwrite": true, "extracted_forest_size": [ 33, 67, diff --git a/results/boston/stage4_fix/losses.png b/results/boston/stage4_fix/losses.png index aa5577274c1a5522a05d0cf025de25ef2e027aa4..4affb9e65aaaf73f61c9d8467e00b0a07ce44ead 100644 Binary files a/results/boston/stage4_fix/losses.png and b/results/boston/stage4_fix/losses.png differ diff --git a/results/breast_cancer/stage4_fix/losses.png b/results/breast_cancer/stage4_fix/losses.png index f0062906b6b0ec3007a207c89ac3610166bfc429..90751761668263c2d33b20c319755f565f40b57e 100644 Binary files a/results/breast_cancer/stage4_fix/losses.png and b/results/breast_cancer/stage4_fix/losses.png differ diff --git a/results/california_housing/stage4_fix/losses.png b/results/california_housing/stage4_fix/losses.png index dccb6d4883f3f3f56ecab982cdd61451ee0d3f70..7f7d7d89864bb61384dc18c9ff97d9ebd433146c 100644 Binary files a/results/california_housing/stage4_fix/losses.png and b/results/california_housing/stage4_fix/losses.png differ diff --git a/results/digits/stage4_fix/losses.png b/results/digits/stage4_fix/losses.png index ab54aa24b5347434be31130026c4ab451f3f850b..bb2745c8511a4a0ceaa25b6d090d82fa07e1c3eb 100644 Binary files a/results/digits/stage4_fix/losses.png and b/results/digits/stage4_fix/losses.png differ diff --git a/results/iris/stage4_fix/losses.png b/results/iris/stage4_fix/losses.png index 79d5af7f9b398493e3f6fb8b9bde135f7cd0eceb..13d3efea902654699b23f049214d060109b0fe45 100644 Binary files a/results/iris/stage4_fix/losses.png and b/results/iris/stage4_fix/losses.png differ diff --git a/results/kin8nm/stage4_fix/losses.png b/results/kin8nm/stage4_fix/losses.png index 82b96dc80d46d04fbfc7f56b4b18e3b999af2dc9..2671dda61a4ce85e129315a017b89000ffc6da2f 100644 Binary files a/results/kin8nm/stage4_fix/losses.png and b/results/kin8nm/stage4_fix/losses.png differ diff --git a/results/kr-vs-kp/stage4_fix/losses.png b/results/kr-vs-kp/stage4_fix/losses.png index 295409f42f566b7cd213f4ee132462ffcd82c2b6..af21a247a3c578952241fe72434537966247210f 100644 Binary files a/results/kr-vs-kp/stage4_fix/losses.png and b/results/kr-vs-kp/stage4_fix/losses.png differ diff --git a/results/olivetti_faces/stage4_fix/losses.png b/results/olivetti_faces/stage4_fix/losses.png index 76d53a46b023f3e109511fd0f0db6e4eebe1d16b..8cb42056500fbc7f10cae168fb8deecb7320dcf6 100644 Binary files a/results/olivetti_faces/stage4_fix/losses.png and b/results/olivetti_faces/stage4_fix/losses.png differ diff --git a/results/spambase/stage4_fix/losses.png b/results/spambase/stage4_fix/losses.png index 42201f37548eb0869d2e25e63e1bd2db6a9df0f1..5aee0e0268e9cd48c8e596af92424f661fe1cef8 100644 Binary files a/results/spambase/stage4_fix/losses.png and b/results/spambase/stage4_fix/losses.png differ diff --git a/results/steel-plates/stage4_fix/losses.png b/results/steel-plates/stage4_fix/losses.png index 7a7b181d7b537b447831a26e397df49c3f6dc3c2..7c91887015d74e04d3469c603074ec4953124763 
diff --git a/results/boston/stage4_fix/losses.png b/results/boston/stage4_fix/losses.png
index aa5577274c1a5522a05d0cf025de25ef2e027aa4..4affb9e65aaaf73f61c9d8467e00b0a07ce44ead 100644
Binary files a/results/boston/stage4_fix/losses.png and b/results/boston/stage4_fix/losses.png differ
diff --git a/results/breast_cancer/stage4_fix/losses.png b/results/breast_cancer/stage4_fix/losses.png
index f0062906b6b0ec3007a207c89ac3610166bfc429..90751761668263c2d33b20c319755f565f40b57e 100644
Binary files a/results/breast_cancer/stage4_fix/losses.png and b/results/breast_cancer/stage4_fix/losses.png differ
diff --git a/results/california_housing/stage4_fix/losses.png b/results/california_housing/stage4_fix/losses.png
index dccb6d4883f3f3f56ecab982cdd61451ee0d3f70..7f7d7d89864bb61384dc18c9ff97d9ebd433146c 100644
Binary files a/results/california_housing/stage4_fix/losses.png and b/results/california_housing/stage4_fix/losses.png differ
diff --git a/results/digits/stage4_fix/losses.png b/results/digits/stage4_fix/losses.png
index ab54aa24b5347434be31130026c4ab451f3f850b..bb2745c8511a4a0ceaa25b6d090d82fa07e1c3eb 100644
Binary files a/results/digits/stage4_fix/losses.png and b/results/digits/stage4_fix/losses.png differ
diff --git a/results/iris/stage4_fix/losses.png b/results/iris/stage4_fix/losses.png
index 79d5af7f9b398493e3f6fb8b9bde135f7cd0eceb..13d3efea902654699b23f049214d060109b0fe45 100644
Binary files a/results/iris/stage4_fix/losses.png and b/results/iris/stage4_fix/losses.png differ
diff --git a/results/kin8nm/stage4_fix/losses.png b/results/kin8nm/stage4_fix/losses.png
index 82b96dc80d46d04fbfc7f56b4b18e3b999af2dc9..2671dda61a4ce85e129315a017b89000ffc6da2f 100644
Binary files a/results/kin8nm/stage4_fix/losses.png and b/results/kin8nm/stage4_fix/losses.png differ
diff --git a/results/kr-vs-kp/stage4_fix/losses.png b/results/kr-vs-kp/stage4_fix/losses.png
index 295409f42f566b7cd213f4ee132462ffcd82c2b6..af21a247a3c578952241fe72434537966247210f 100644
Binary files a/results/kr-vs-kp/stage4_fix/losses.png and b/results/kr-vs-kp/stage4_fix/losses.png differ
diff --git a/results/olivetti_faces/stage4_fix/losses.png b/results/olivetti_faces/stage4_fix/losses.png
index 76d53a46b023f3e109511fd0f0db6e4eebe1d16b..8cb42056500fbc7f10cae168fb8deecb7320dcf6 100644
Binary files a/results/olivetti_faces/stage4_fix/losses.png and b/results/olivetti_faces/stage4_fix/losses.png differ
diff --git a/results/spambase/stage4_fix/losses.png b/results/spambase/stage4_fix/losses.png
index 42201f37548eb0869d2e25e63e1bd2db6a9df0f1..5aee0e0268e9cd48c8e596af92424f661fe1cef8 100644
Binary files a/results/spambase/stage4_fix/losses.png and b/results/spambase/stage4_fix/losses.png differ
diff --git a/results/steel-plates/stage4_fix/losses.png b/results/steel-plates/stage4_fix/losses.png
index 7a7b181d7b537b447831a26e397df49c3f6dc3c2..7c91887015d74e04d3469c603074ec4953124763 100644
Binary files a/results/steel-plates/stage4_fix/losses.png and b/results/steel-plates/stage4_fix/losses.png differ
diff --git a/results/wine/stage4_fix/losses.png b/results/wine/stage4_fix/losses.png
index 31286863e58fdc3fe40f5b77cf07df3c21a572fd..4231a030e79e1741b0fe205e95cc4c1927e1ad4f 100644
Binary files a/results/wine/stage4_fix/losses.png and b/results/wine/stage4_fix/losses.png differ
diff --git a/run_stage5_experiments.sh b/run_stage5_experiments.sh
new file mode 100755
index 0000000000000000000000000000000000000000..a28cdf0f8b2f447eb5fcfcf0e7b0ee914ec0877b
--- /dev/null
+++ b/run_stage5_experiments.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+core_number=1
+walltime=5:00
+seeds='1 2 3'
+
+for dataset in kin8nm kr-vs-kp spambase steel-plates diabetes diamonds boston california_housing
+do
+    #oarsub -p "(gpu is null)" -l /core=5,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=none --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=30 --experiment_id=1 --models_dir=models/$dataset/stage5 --subsets_used train+dev,train+dev"
+    #oarsub -p "(gpu is null)" -l /core=5,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=random --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=30 --experiment_id=2 --models_dir=models/$dataset/stage5 --subsets_used train+dev,train+dev"
+    #oarsub -p "(gpu is null)" -l /core=5,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=omp --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=30 --experiment_id=3 --models_dir=models/$dataset/stage5 --subsets_used train+dev,train+dev"
+    oarsub -p "(gpu is null)" -l /core=50,walltime=5:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=similarity --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=30 --experiment_id=4 --models_dir=models/$dataset/stage5 --subsets_used train+dev,train+dev"
+    #oarsub -p "(gpu is null)" -l /core=50,walltime=5:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=kmeans --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=30 --experiment_id=5 --models_dir=models/$dataset/stage5 --subsets_used train+dev,train+dev"
+    #oarsub -p "(gpu is null)" -l /core=50,walltime=5:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=ensemble --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=30 --experiment_id=6 --models_dir=models/$dataset/stage5 --subsets_used train+dev,train+dev"
+done
diff --git a/scripts/run_compute_results.sh b/scripts/run_compute_results.sh
index 1093c02c120a61354b946077ecc8ea9b93c93319..d82e2638f910066d07fa1b87d065c804c7e8eb20 100644
--- a/scripts/run_compute_results.sh
+++ b/scripts/run_compute_results.sh
@@ -1,4 +1,5 @@
-for dataset in california_housing
+for dataset in breast_cancer diabetes diamonds boston steel-plates kr-vs-kp digits iris kin8nm olivetti_faces wine spambase
+#for dataset in california_housing
 #for dataset in breast_cancer diabetes diamonds california_housing boston linnerud steel-plates kr-vs-kp digits iris kin8nm olivetti_faces wine spambase
 do
     python code/compute_results.py --stage=4 --experiment_ids 1 2 3 --dataset_name=$dataset --models_dir=models/$dataset/stage4
diff --git a/scripts/run_stage4_experiments.sh b/scripts/run_stage4_experiments.sh
index 71987fbf9005bc4cd60f32eb8645b8c26e702705..751bf24d68ceb5d42c02d2c5965596af7022b032 100644
--- a/scripts/run_stage4_experiments.sh
+++ b/scripts/run_stage4_experiments.sh
@@ -1,14 +1,14 @@
 #!/bin/bash
-core_number=14
+core_number=5
 walltime=1:00
-seeds='1 2 3 4 5 6 7'
+seeds='1 2 3 4 5'
 
-for dataset in california_housing
+#for dataset in california_housing
 #for dataset in kin8nm
 #for dataset in gamma
-#for dataset in breast_cancer diabetes diamonds california_housing boston linnerud steel-plates kr-vs-kp digits iris kin8nm lfw_pairs linnerud olivetti_faces wine spambase gamma
+for dataset in breast_cancer diabetes diamonds boston steel-plates kr-vs-kp digits iris kin8nm olivetti_faces wine spambase
 do
-    oarsub -p "(gpu is null)" -l /core=$core_number,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=none --save_experiment_configuration 4 none_with_params --extracted_forest_size_stop=1 --extracted_forest_size_samples=30 --experiment_id=1 --models_dir=models/$dataset/stage4 --subsets_used train+dev,train+dev"
-    oarsub -p "(gpu is null)" -l /core=$core_number,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=random --save_experiment_configuration 4 random_with_params --extracted_forest_size_stop=1 --extracted_forest_size_samples=30 --experiment_id=2 --models_dir=models/$dataset/stage4 --subsets_used train+dev,train+dev"
-    oarsub -p "(gpu is null)" -l /core=$core_number,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=omp --save_experiment_configuration 4 omp_with_params --extracted_forest_size_stop=1 --extracted_forest_size_samples=30 --experiment_id=3 --models_dir=models/$dataset/stage4 --subsets_used train+dev,train+dev"
+    oarsub -p "(gpu is null)" -n "$dataset base" -l /core=1,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=none --save_experiment_configuration 4 none_with_params --extracted_forest_size_stop=1 --extracted_forest_size_samples=30 --experiment_id=1 --models_dir=models/$dataset/stage4 --subsets_used train+dev,train+dev --overwrite"
+    oarsub -p "(gpu is null)" -n "$dataset random" -l /core=$core_number,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=random --save_experiment_configuration 4 random_with_params --extracted_forest_size_stop=1 --extracted_forest_size_samples=30 --experiment_id=2 --models_dir=models/$dataset/stage4 --subsets_used train+dev,train+dev --overwrite"
+    oarsub -p "(gpu is null)" -n "$dataset omp" -l /core=$core_number,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=omp --save_experiment_configuration 4 omp_with_params --extracted_forest_size_stop=1 --extracted_forest_size_samples=30 --experiment_id=3 --models_dir=models/$dataset/stage4 --subsets_used train+dev,train+dev --overwrite"
 done
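The reworked stage-4 submissions name each OAR job (-n "$dataset omp"), give the unpruned baseline a single core, and append a bare --overwrite switch next to the space-separated --seeds 1 2 3 4 5 list. A plausible argparse declaration for those two options, sketched here rather than copied from code/train.py:

import argparse

parser = argparse.ArgumentParser()
# --seeds 1 2 3 4 5 arrives as one space-separated list of ints
parser.add_argument('--seeds', nargs='+', type=int)
# bare switch: when present, existing models/results are recomputed
parser.add_argument('--overwrite', action='store_true')

args = parser.parse_args(['--seeds', '1', '2', '3', '4', '5', '--overwrite'])
assert args.seeds == [1, 2, 3, 4, 5] and args.overwrite is True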
diff --git a/scripts/run_stage5_experiments_ensemble.sh b/scripts/run_stage5_experiments_ensemble.sh
new file mode 100755
index 0000000000000000000000000000000000000000..7387a97abae531c34a47a038ccc964a787eebcfe
--- /dev/null
+++ b/scripts/run_stage5_experiments_ensemble.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+core_number=1
+walltime=5:00
+seeds='1 2 3'
+
+for dataset in california_housing # kin8nm kr-vs-kp spambase steel-plates diabetes diamonds boston california_housing
+do
+    #oarsub -p "(gpu is null)" -l /core=5,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=none --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=30 --experiment_id=1 --models_dir=models/$dataset/stage5_ensemble --subsets_used train+dev,train+dev"
+    #oarsub -p "(gpu is null)" -l /core=5,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=random --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=30 --experiment_id=2 --models_dir=models/$dataset/stage5_ensemble --subsets_used train+dev,train+dev"
+    #oarsub -p "(gpu is null)" -l /core=5,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=omp --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=30 --experiment_id=3 --models_dir=models/$dataset/stage5_ensemble --subsets_used train+dev,train+dev"
+    oarsub -p "(gpu is null)" -l /core=5,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=ensemble --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=30 --experiment_id=4 --models_dir=models/$dataset/stage5_ensemble --subsets_used train+dev,train+dev"
+done
diff --git a/scripts/run_stage5_experiments_kmeans.sh b/scripts/run_stage5_experiments_kmeans.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b5beae69ef14b4b75bffdb9fcd196a96b1488c8e
--- /dev/null
+++ b/scripts/run_stage5_experiments_kmeans.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+core_number=50
+walltime=5:00
+seeds='1 2 3'
+
+for dataset in diabetes diamonds boston california_housing
+do
+    oarsub -p "(gpu is null)" -l /core=5,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=none --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=30 --experiment_id=1 --models_dir=models/$dataset/stage5_kmeans --subsets_used train+dev,train+dev"
+    oarsub -p "(gpu is null)" -l /core=5,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=random --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=30 --experiment_id=2 --models_dir=models/$dataset/stage5_kmeans --subsets_used train+dev,train+dev"
+    oarsub -p "(gpu is null)" -l /core=5,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=omp --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=30 --experiment_id=3 --models_dir=models/$dataset/stage5_kmeans --subsets_used train+dev,train+dev"
+    oarsub -p "(gpu is null)" -l /core=50,walltime=5:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=kmeans --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=30 --experiment_id=4 --models_dir=models/$dataset/stage5_kmeans --subsets_used train+dev,train+dev"
+done
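The stage-5 launchers repeat one oarsub shape per strategy, varying only the dataset list, the core/walltime budget, and the models directory. Purely as an illustration of that shared pattern (this generator is not part of the changeset, and baseline runs reuse the strategy-named directory here for simplicity), the submission line can be assembled programmatically:

def oarsub_command(dataset, strategy, experiment_id, cores, walltime, seeds='1 2 3'):
    """Build one stage-5 submission line in the style of the scripts above."""
    train = ('python code/train.py --dataset_name={d} --seeds {s} '
             '--extraction_strategy={e} --extracted_forest_size_stop=0.40 '
             '--extracted_forest_size_samples=30 --experiment_id={i} '
             '--models_dir=models/{d}/stage5_{e} '
             '--subsets_used train+dev,train+dev').format(
                 d=dataset, s=seeds, e=strategy, i=experiment_id)
    return ('oarsub -p "(gpu is null)" -l /core={c},walltime={w} '
            '"conda activate test_env && {t}"').format(c=cores, w=walltime, t=train)


print(oarsub_command('boston', 'kmeans', 4, cores=50, walltime='5:00'))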
diff --git a/scripts/run_stage5_experiments_similarity.sh b/scripts/run_stage5_experiments_similarity.sh
new file mode 100755
index 0000000000000000000000000000000000000000..d7be4e984dec0a216cb7c2db539cc6595cfdb110
--- /dev/null
+++ b/scripts/run_stage5_experiments_similarity.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+core_number=1
+walltime=5:00
+seeds='1 2 3'
+
+for dataset in diabetes diamonds boston california_housing
+do
+    oarsub -p "(gpu is null)" -l /core=5,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=none --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=30 --experiment_id=1 --models_dir=models/$dataset/stage5_similarity --subsets_used train+dev,train+dev"
+    oarsub -p "(gpu is null)" -l /core=5,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=random --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=30 --experiment_id=2 --models_dir=models/$dataset/stage5_similarity --subsets_used train+dev,train+dev"
+    oarsub -p "(gpu is null)" -l /core=5,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=omp --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=30 --experiment_id=3 --models_dir=models/$dataset/stage5_similarity --subsets_used train+dev,train+dev"
+    oarsub -p "(gpu is null)" -l /core=50,walltime=5:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=similarity --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=30 --experiment_id=4 --models_dir=models/$dataset/stage5_similarity --subsets_used train+dev,train+dev"
+done
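The --extraction_strategy=ensemble runs above are what exercise the ensemble-selection regressor added in this changeset. A minimal end-to-end sketch of driving that pathway directly in Python; the dataset, splits, and the SimpleNamespace stand-in for ModelParameters are illustrative, and the bolsonaro package must be importable:

from types import SimpleNamespace

from sklearn.datasets import load_diabetes
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=1)

# Library of decision trees that the greedy selection draws from
library = EnsembleSelectionForestRegressor.generate_library(X_train, y_train, random_state=1)

params = SimpleNamespace(extracted_forest_size=10)  # stand-in for ModelParameters
model = EnsembleSelectionForestRegressor(params, library=library)
model.fit(X_train, y_train, X_val, y_val)

predictions = model.predict_base_estimator(X_test)
print(mean_squared_error(y_test, predictions))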