diff --git a/TODO.md b/TODO.md
index 5ea6cc5cf2c933eed2e7ffbf2567d4fe812412cf..b94e576024a5294b6eaffaaf1af5003f0e034313 100644
--- a/TODO.md
+++ b/TODO.md
@@ -1,7 +1,3 @@
-* Fix pickle loading of ModelRawResults, because saving the model_object leads import issues.
-* Fix ModelFactory.load function.
 * Fix model results loading in compute_results.py.
 * Check that omp multiclasses classifier is working as expected.
-* In the bayesian search computation, output a different file name depending on the task of the trained model.
-* Check the best params scores of the regressors (neg_mean_squared_error leads to huge negative values).
-* Prepare the json experiment files to run.
\ No newline at end of file
+* Fix the dataset fetcher error when job_number > 1.
\ No newline at end of file
diff --git a/code/bolsonaro/data/dataset_loader.py b/code/bolsonaro/data/dataset_loader.py
index 01d71e2d650aff597b7cb6edb8de6d62f6b419cb..8ffbc76b145ffddcf7e63871864471efc76c9737 100644
--- a/code/bolsonaro/data/dataset_loader.py
+++ b/code/bolsonaro/data/dataset_loader.py
@@ -19,7 +19,7 @@ class DatasetLoader(object):
     DEFAULT_NORMALIZE_D = False
     DEFAULT_DATASET_NORMALIZER = 'standard'
     DEFAULT_FOREST_SIZE = 100
-    DEFAULT_EXTRACTED_FOREST_SIZE = 10
+    DEFAULT_EXTRACTED_FOREST_SIZE_SAMPLES = 4
     DEFAULT_DEV_SIZE = 0.2
     DEFAULT_TEST_SIZE = 0.2
     DEFAULT_RANDOM_SEED_NUMBER = 1
diff --git a/code/bolsonaro/utils.py b/code/bolsonaro/utils.py
index 9dff06ac65a726642cc5efe2e6ed8b8f78f40b29..797f3005c97099cb88ae81f59178310f6c078685 100644
--- a/code/bolsonaro/utils.py
+++ b/code/bolsonaro/utils.py
@@ -79,7 +79,6 @@ def change_binary_func_load(base_load_function):
         return X, y
     return func_load
 
-
 @contextlib.contextmanager
 def tqdm_joblib(tqdm_object):
     """Context manager to patch joblib to report into tqdm progress bar given as argument"""
@@ -100,3 +99,17 @@ def tqdm_joblib(tqdm_object):
     finally:
         joblib.parallel.BatchCompletionCallBack = old_batch_callback
         tqdm_object.close()
+
+def is_int(value):
+    try:
+        int(value)
+        return True
+    except ValueError:
+        return False
+
+def is_float(value):
+    try:
+        float(value)
+        return True
+    except ValueError:
+        return False
diff --git a/code/compute_hyperparameters.py b/code/compute_hyperparameters.py
index 548a1a4e82c720b711859abd57b0ddcf295e5835..9135c00d1ec0f86256881952d67c8cc86a62fe5a 100644
--- a/code/compute_hyperparameters.py
+++ b/code/compute_hyperparameters.py
@@ -4,7 +4,7 @@ from bolsonaro.data.dataset_parameters import DatasetParameters
 from bolsonaro.data.task import Task
 from bolsonaro.error_handling.logger_factory import LoggerFactory
 from bolsonaro.hyperparameter_searcher import HyperparameterSearcher
-from bolsonaro.utils import save_obj_to_json, tqdm_joblib
+from bolsonaro.utils import save_obj_to_json, tqdm_joblib, is_int, is_float
 
 import argparse
 import os
@@ -68,7 +68,7 @@ def process_job(dataset_name, seed, param_space, args):
 def run_hyperparameter_search_jobs(seeds, dataset_name, param_space, args):
     # Run one hyperparameter search job per seed
     with tqdm_joblib(tqdm(total=len(seeds), disable=not args.verbose)) as progress_bar:
-        opt_results = Parallel(n_jobs=-1)(delayed(process_job)(
+        opt_results = Parallel(n_jobs=args.job_number)(delayed(process_job)(
             dataset_name, seeds[i], param_space, args) for i in range(len(seeds)))
     return opt_results
@@ -108,6 +108,10 @@ def compute_best_params_over_seeds(seeds, dataset_name, param_space, args):
             split = element.split('_')
             param, value = '_'.join(split[:-1]), split[-1]
             if param not in best_params:
+                if is_int(value):
+                    value = int(value)
+                elif is_float(value):
+                    value = float(value)
                 best_params[param] = value
         if len(best_params) == len(all_param_names):
             break
@@ -128,6 +132,7 @@ if __name__ == "__main__":
     DEFAULT_CV = 3
     DEFAULT_N_ITER = 50
     DEFAULT_VERBOSE = False
+    DEFAULT_JOB_NUMBER = -1
     DICT_PARAM_SPACE = {'n_estimators': Integer(10, 1000),
         'min_samples_leaf': Integer(1, 1000),
         'max_depth': Integer(1, 20),
@@ -144,6 +149,7 @@ if __name__ == "__main__":
     parser.add_argument('--use_variable_seed_number', action='store_true', default=DEFAULT_USE_VARIABLE_SEED_NUMBER, help='Compute the amount of random seeds depending on the dataset.')
     parser.add_argument('--datasets', nargs='+', type=str, default=DatasetLoader.dataset_names, help='Specify the dataset used by the estimator.')
    parser.add_argument('--verbose', action='store_true', default=DEFAULT_VERBOSE, help='Print tqdm progress bar.')
+    parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of jobs used during the parallelization across seeds.')
     args = parser.parse_args()
 
     logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))
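
A note on the new `is_int`/`is_float` helpers: the hyperparameter search reports each candidate as an underscore-joined `{param}_{value}` string, so every parsed value comes back as a string. The added branch coerces numeric values before they are stored in `best_params`, which is why the `params.json` hunks further down change entries like `"n_estimators": "1000"` into `"n_estimators": 1000`. A minimal, self-contained sketch of the coercion (illustrative, not the project's exact code; the float-valued key below is hypothetical):

```python
def is_int(value):
    try:
        int(value)
        return True
    except ValueError:
        return False

def is_float(value):
    try:
        float(value)
        return True
    except ValueError:
        return False

def coerce(value):
    # Try int first (is_float('1000') is also True), then float;
    # anything else (e.g. 'log2') stays a string.
    if is_int(value):
        return int(value)
    if is_float(value):
        return float(value)
    return value

# '{param}_{value}' keys, split on the last underscore as in
# compute_best_params_over_seeds:
for element in ('n_estimators_1000', 'some_ratio_0.2', 'max_features_log2'):
    split = element.split('_')
    param, value = '_'.join(split[:-1]), split[-1]
    print(param, repr(coerce(value)))
# n_estimators 1000
# some_ratio 0.2
# max_features 'log2'
```
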
diff --git a/code/compute_results.py b/code/compute_results.py
index 64124af70954cc6af6a923f03f5a122a75f453fb..5e65a775cac80b8b2262a69431312a278dad9340 100644
--- a/code/compute_results.py
+++ b/code/compute_results.py
@@ -16,147 +16,31 @@ if __name__ == "__main__":
     DEFAULT_RESULTS_DIR = os.environ["project_dir"] + os.sep + 'results'
     DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models'
-    DEFAULT_EXPERIMENT_IDS = None
 
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--stage_number', nargs='?', type=int, required=True, help='Specify the stage number among [1, 4].')
+    parser.add_argument('--experiment_ids', nargs='+', type=int, required=True, help='Compute the results of the specified experiment id(s).')
     parser.add_argument('--results_dir', nargs='?', type=str, default=DEFAULT_RESULTS_DIR, help='The output directory of the results.')
     parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
-    parser.add_argument('--experiment_ids', nargs='+', type=int, default=DEFAULT_EXPERIMENT_IDS, help='Compute the results of the specified experiment id(s)')
     args = parser.parse_args()
 
+    if int(args.stage_number) not in list(range(1, 5)):
+        raise ValueError('stage_number must be a supported stage id (i.e. [1, 4]).')
+
     # Create recursively the results dir tree
     pathlib.Path(args.results_dir).mkdir(parents=True, exist_ok=True)
 
-    """
-    Use specified list of experiments ids if availabe.
-    Otherwise, list all existing experiment ids from
-    the specified models directory.
-    """
-    experiments_ids = [str(experiment_id) for experiment_id in args.experiment_ids] \
-        if args.experiment_ids is not None \
-        else os.listdir(args.models_dir)
+
     """
-    Raise an error if there's no experiments ids found both
-    in parameter or in models directory.
+    TODO:
+    For each dataset:
+        Stage 1) A figure for the selection of the best base forest model hyperparameters (best vs default/random hyperparams)
+        Stage 2) A figure for the selection of the best combination of normalization: D normalization vs weights normalization (4 combinations)
+        Stage 3) A figure for the selection of the most relevant subsets combination: train,dev vs train+dev,train+dev vs train,train+dev
+        Stage 4) A figure to finally compare the perf of our approach using the previously selected
+        parameters vs the baseline vs other papers, using different extracted forest sizes
+        (percentages of the tree size found previously in the best hyperparams search) on the abscissa.
+
+    IMPORTANT: Compare experiments that used the same seeds among them (except for stage 1).
     """
-    if experiments_ids is None or len(experiments_ids) == 0:
-        raise ValueError("No experiment id was found or specified.")
-
-    # Compute the plots for each experiment id
-    for experiment_id in experiments_ids:
-        experiment_id_path = args.models_dir + os.sep + experiment_id  # models/{experiment_id}
-        # Create recursively the tree results/{experiment_id}
-        pathlib.Path(args.results_dir + os.sep + experiment_id).mkdir(parents=True, exist_ok=True)
-        experiment_seed_root_path = experiment_id_path + os.sep + 'seeds'  # models/{experiment_id}/seeds
-
-        """
-        Dictionaries to temporarly store the scalar results with the following structure:
-        {seed_1: [score_1, ..., score_m], ... seed_n: [score_1, ..., score_k]}
-        TODO: to complete to retreive more results
-        """
-        experiment_train_scores = dict()
-        experiment_dev_scores = dict()
-        experiment_test_scores = dict()
-
-        experiment_weights = dict()
-
-        # Used to check if all losses were computed using the same metric (it should be the case)
-        experiment_score_metrics = list()
-
-        # For each seed results stored in models/{experiment_id}/seeds
-        for seed in os.listdir(experiment_seed_root_path):
-            experiment_seed_path = experiment_seed_root_path + os.sep + seed  # models/{experiment_id}/seeds/{seed}
-            dataset_parameters = DatasetParameters.load(experiment_seed_path, experiment_id)  # Load the dataset parameters of this experiment, with this specific seed
-            dataset = DatasetLoader.load(dataset_parameters)  # Load the dataset using the previously loaded dataset parameters
-            extracted_forest_size_root_path = experiment_seed_path + os.sep + 'extracted_forest_size'  # models/{experiment_id}/seeds/{seed}/extracted_forest_size
-
-            # {{seed}:[]}
-            experiment_train_scores[seed] = list()
-            experiment_dev_scores[seed] = list()
-            experiment_test_scores[seed] = list()
-
-            experiment_weights[seed] = list()
-
-            # List the forest sizes in models/{experiment_id}/seeds/{seed}/extracted_forest_size
-            extracted_forest_sizes = os.listdir(extracted_forest_size_root_path)
-            for extracted_forest_size in extracted_forest_sizes:
-                # models/{experiment_id}/seeds/{seed}/extracted_forest_size/{extracted_forest_size}
-                extracted_forest_size_path = extracted_forest_size_root_path + os.sep + extracted_forest_size
-                # Load models/{experiment_id}/seeds/{seed}/extracted_forest_size/{extracted_forest_size}/model_raw_results.pickle file
-                model_raw_results = ModelRawResults.load(extracted_forest_size_path)
-                # Load [...]/model_parameters.json file and build the model using these parameters and the weights and forest from model_raw_results.pickle
-                model = ModelFactory.load(dataset.task, extracted_forest_size_path, experiment_id, model_raw_results)
-                # Save temporarly some raw results (TODO: to complete to retreive more results)
-                # Save the scores
-                experiment_train_scores[seed].append(model_raw_results.train_score)
-                experiment_dev_scores[seed].append(model_raw_results.dev_score)
-                experiment_test_scores[seed].append(model_raw_results.test_score)
-                # Save the weights
-                experiment_weights[seed].append(model_raw_results.weights)
-                # Save the metric
-                experiment_score_metrics.append(model_raw_results.score_metric)
-
-        if len(set(experiment_score_metrics)) > 1:
-            raise ValueError("The metrics used to compute the dev score aren't the same everytime")
-
-        """
-        Example of plot that just plots the losses computed
-        on the train, dev and test subsets using a trained
-        model, with the CI, and depending on the extracted
-        forest size.
-        """
-        Plotter.plot_losses(
-            file_path=args.results_dir + os.sep + experiment_id + os.sep + 'losses.png',
-            all_experiment_scores=[experiment_train_scores, experiment_dev_scores, experiment_test_scores],
-            x_value=extracted_forest_sizes,
-            xlabel='Number of trees extracted',
-            ylabel=experiment_score_metrics[0],
-            all_labels=['train', 'dev', 'test'],
-            title='Loss values of the trained model'
-        )
-
-        """
-        TODO:
-        For each dataset:
-            Stage 1) A figure for the selection of the best base forest model hyperparameters (best vs default/random hyperparams)
-            Stage 2) A figure for the selection of the best dataset normalization method
-            Stage 3) A figure for the selection of the best combination of dataset: normalization vs D normalization vs weights normalization
-            Stage 4) A figure for the selection of the most relevant subsets combination: train,dev vs train+dev,train+dev vs train,train+dev
-            Stage 5) A figure for the selection of the best extracted forest size?
-            Stage 6) A figure to finally compare the perf of our approach using the previous selected parameters vs the baseline vs other papers
-
-        Stage 3)
-        In all axis:
-            - untrained forest
-            - trained base forest (straight line cause it doesn't depend on the number of extracted trees)
-
-        Axis 1:
-            - test with forest on train+dev and OMP on train+dev
-            - test with forest on train+dev and OMP on train+dev with dataset normalization
-            - test with forest on train+dev and OMP on train+dev with dataset normalization + D normalization
-            - test with forest on train+dev and OMP on train+dev with dataset normalization + weights normalization
-            - test with forest on train+dev and OMP on train+dev with dataset normalization + D normalization + weights normalization
-
-        Axis 2:
-            - test with forest on train and OMP on dev
-            - test with forest on train and OMP on dev with dataset normalization
-            - test with forest on train and OMP on dev with dataset normalization + D normalization
-            - test with forest on train and OMP on dev with dataset normalization + weights normalization
-            - test with forest on train and OMP on dev with dataset normalization + D normalization + weights normalization
-
-        Axis 3:
-            - test with forest on train and OMP train+dev
-            - test with forest on train and OMP train+dev with dataset normalization
-            - test with forest on train and OMP train+dev with dataset normalization + D normalization
-            - test with forest on train and OMP train+dev with dataset normalization + weights normalization
-            - test with forest on train and OMP train+dev with dataset normalization + D normalization + weights normalization
-
-        IMPORTANT: Same seeds used in all axis.
-        """
-
-        # Plot the density of the weights
-        Plotter.weight_density(
-            file_path=args.results_dir + os.sep + experiment_id + os.sep + 'density_weight.png',
-            all_experiment_weights=experiment_weights
-        )
- """ - - # Plot the density of the weights - Plotter.weight_density( - file_path=args.results_dir + os.sep + experiment_id + os.sep + 'density_weight.png', - all_experiment_weights=experiment_weights - ) diff --git a/code/train.py b/code/train.py index 38f2887e16de5a0967500a304034b5934dcd4a5d..13216acda3a88b3718d765ac3a6f424efd28de76 100644 --- a/code/train.py +++ b/code/train.py @@ -17,6 +17,7 @@ from joblib import Parallel, delayed import threading import json from tqdm import tqdm +import numpy as np def process_job(seed, parameters, experiment_id, hyperparameters): @@ -82,6 +83,8 @@ if __name__ == "__main__": # the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_size/{nb_extracted_trees} DEFAULT_MODELS_DIR = os.environ['project_dir'] + os.sep + 'models' DEFAULT_VERBOSE = False + DEFAULT_SKIP_BEST_HYPERPARAMS = False + DEFAULT_JOB_NUMBER = -1 begin_random_seed_range = 1 end_random_seed_range = 2000 @@ -92,8 +95,8 @@ if __name__ == "__main__": parser.add_argument('--dataset_name', nargs='?', type=str, default=DatasetLoader.DEFAULT_DATASET_NAME, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.') parser.add_argument('--normalize_D', action='store_true', default=DatasetLoader.DEFAULT_NORMALIZE_D, help='Specify if we want to normalize the prediction of the forest by doing the L2 division of the pred vectors.') parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=DatasetLoader.DEFAULT_DATASET_NORMALIZER, help='Specify which dataset normalizer use (either standard, minmax, robust or normalizer).') - parser.add_argument('--forest_size', nargs='?', type=int, default=DatasetLoader.DEFAULT_FOREST_SIZE, help='The number of trees of the random forest.') - parser.add_argument('--extracted_forest_size', nargs='+', type=int, default=DatasetLoader.DEFAULT_EXTRACTED_FOREST_SIZE, help='The number of trees selected by OMP.') + parser.add_argument('--forest_size', nargs='?', type=int, default=None, help='The number of trees of the random forest.') + parser.add_argument('--extracted_forest_size_samples', nargs='?', type=int, default=DatasetLoader.DEFAULT_EXTRACTED_FOREST_SIZE_SAMPLES, help='The number of extracted forest sizes (proportional to the forest size) selected by OMP.') parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.') parser.add_argument('--dev_size', nargs='?', type=float, default=DatasetLoader.DEFAULT_DEV_SIZE, help='Dev subset ratio.') parser.add_argument('--test_size', nargs='?', type=float, default=DatasetLoader.DEFAULT_TEST_SIZE, help='Test subset ratio.') @@ -102,6 +105,9 @@ if __name__ == "__main__": parser.add_argument('--subsets_used', nargs='+', type=str, default=DatasetLoader.DEFAULT_SUBSETS_USED, help='train,dev: forest on train, OMP on dev. train+dev,train+dev: both forest and OMP on train+dev. 
train,train+dev: forest on train+dev and OMP on dev.') parser.add_argument('--normalize_weights', action='store_true', default=DatasetLoader.DEFAULT_NORMALIZE_WEIGHTS, help='Divide the predictions by the weights sum.') parser.add_argument('--verbose', action='store_true', default=DEFAULT_VERBOSE, help='Print tqdm progress bar.') + parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters if there exist.') + parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line in a file. Args: {{stage_num}} {{name}}') + parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of job used during the parallelisation across seeds.') args = parser.parse_args() if args.experiment_configuration: @@ -115,22 +121,31 @@ if __name__ == "__main__": logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__)) - # The number of tree to extract from forest (K) - parameters['extracted_forest_size'] = parameters['extracted_forest_size'] \ - if type(parameters['extracted_forest_size']) == list \ - else [parameters['extracted_forest_size']] - hyperparameters_path = os.path.join('experiments', args.dataset_name, 'stage1', 'params.json') - if os.path.exists(hyperparameters_path): + if os.path.exists(hyperparameters_path) and not args.skip_best_hyperparams: logger.info("Hyperparameters found for this dataset at '{}'".format(hyperparameters_path)) with open(hyperparameters_path, 'r+') as file_hyperparameter: hyperparameters = json.load(file_hyperparameter)['best_parameters'] else: hyperparameters = {} - if parameters['forest_size'] is not None: + """ + First case: no best hyperparameters are specified and no forest_size parameter + specified in argument, so use the DEFAULT_FOREST_SIZE. + Second case: no matter if hyperparameters are specified, the forest_size parameter + will override it. + Third implicit case: use the number of estimators found in specified hyperparameters. + """ + if len(hyperparameters) == 0 and parameters['forest_size'] is None: + hyperparameters['n_estimators'] = DatasetLoader.DEFAULT_FOREST_SIZE + elif parameters['forest_size'] is not None: hyperparameters['n_estimators'] = parameters['forest_size'] + # The number of tree to extract from forest (K) + parameters['extracted_forest_size'] = [int(hyperparameters['n_estimators'] * coeff) \ + for coeff in np.linspace(0, 1, parameters['extracted_forest_size_samples'] + 1, + endpoint=False)[1:]] + if parameters['seeds'] != None and parameters['random_seed_number'] > 1: logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.') @@ -142,15 +157,29 @@ if __name__ == "__main__": # Resolve the next experiment id number (last id + 1) experiment_id = resolve_experiment_id(parameters['models_dir']) logger.info('Experiment id: {}'.format(experiment_id)) + parameters['experiment_id'] = experiment_id """ If the experiment configuration isn't coming from an already existing file, save it to a json file to - keep trace of it. + keep trace of it (either a specified path, either in 'unnamed' dir.). 
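
The new `extracted_forest_size` computation derives the OMP target sizes from the forest size instead of taking an explicit list: `np.linspace(0, 1, samples + 1, endpoint=False)[1:]` yields `samples` coefficients evenly spaced strictly inside (0, 1), since `endpoint=False` drops 1.0 and the `[1:]` slice drops 0.0. A minimal worked sketch (a hypothetical helper mirroring the train.py expression above, not the project's code):

```python
import numpy as np

def extracted_forest_sizes(n_estimators, samples):
    # 'samples' coefficients strictly inside (0, 1): neither the empty
    # forest (coeff 0.0) nor the full forest (coeff 1.0) is requested.
    coeffs = np.linspace(0, 1, samples + 1, endpoint=False)[1:]
    return [int(n_estimators * coeff) for coeff in coeffs]

print(extracted_forest_sizes(1000, 4))  # [200, 400, 600, 800]
print(extracted_forest_sizes(100, 4))   # [20, 40, 60, 80]
```

The two outputs match the `extracted_forest_size` lists in the new iris configuration files below: 1000 estimators come from the stage-1 best hyperparameters, and DEFAULT_FOREST_SIZE = 100 applies when `--skip_best_hyperparams` is set.
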
""" if args.experiment_configuration is None: - with open(args.experiment_configuration_path + os.sep + 'unnamed' + os.sep + 'unnamed_{}.json'.format( - experiment_id), 'w') as output_file: + if args.save_experiment_configuration: + if len(args.save_experiment_configuration) != 2: + raise ValueError('save_experiment_configuration must have two parameters.') + elif int(args.save_experiment_configuration[0]) not in list(range(1, 5)): + raise ValueError('save_experiment_configuration first parameter must be a supported stage id (i.e. [1, 4]).') + output_experiment_configuration_path = os.path.join(args.experiment_configuration_path, + args.dataset_name, 'stage' + args.save_experiment_configuration[0], + args.save_experiment_configuration[1] + '_{}.json'.format( + experiment_id)) + else: + pathlib.Path(os.path.join(args.experiment_configuration_path, 'unnamed')).mkdir(parents=True, exist_ok=True) + output_experiment_configuration_path = os.path.join( + args.experiment_configuration_path, 'unnamed', 'unnamed_{}.json'.format( + experiment_id)) + with open(output_experiment_configuration_path, 'w') as output_file: json.dump( parameters, output_file, @@ -159,5 +188,5 @@ if __name__ == "__main__": # Run as much job as there are seeds with tqdm_joblib(tqdm(total=len(seeds), disable=not args.verbose)) as progress_bar: - Parallel(n_jobs=-1)(delayed(process_job)(seeds[i], + Parallel(n_jobs=args.job_number)(delayed(process_job)(seeds[i], parameters, experiment_id, hyperparameters) for i in range(len(seeds))) diff --git a/experiments/boston/stage1/params.json b/experiments/boston/stage1/params.json index f2f3abbe37d05ef6123ce819ebee4dfac2b254a7..8b530a1131567dc32b0e6274dbda45f143ce4cab 100644 --- a/experiments/boston/stage1/params.json +++ b/experiments/boston/stage1/params.json @@ -4,9 +4,9 @@ "best_score_test": -13.650326577972058, "best_parameters": { "max_features": "auto", - "min_samples_leaf": "1", - "max_depth": "20", - "n_estimators": "1000" + "min_samples_leaf": 1, + "max_depth": 20, + "n_estimators": 1000 }, "random_seed": [ 1812, diff --git a/experiments/breast_cancer/stage1/params.json b/experiments/breast_cancer/stage1/params.json index 43739c2db86556421d35a2064aa554b532a5b413..d2bca84adc1377209cdc03c17d8a3d965bf233da 100644 --- a/experiments/breast_cancer/stage1/params.json +++ b/experiments/breast_cancer/stage1/params.json @@ -3,9 +3,9 @@ "best_score_train": 0.9562271062271059, "best_score_test": 0.9514619883040936, "best_parameters": { - "max_depth": "20", - "min_samples_leaf": "1", - "n_estimators": "1000", + "max_depth": 20, + "min_samples_leaf": 1, + "n_estimators": 1000, "max_features": "log2" }, "random_seed": [ diff --git a/experiments/california_housing/stage1/params.json b/experiments/california_housing/stage1/params.json new file mode 100644 index 0000000000000000000000000000000000000000..617c93c1725070111e2ac5a0292ef450c389df3c --- /dev/null +++ b/experiments/california_housing/stage1/params.json @@ -0,0 +1,16 @@ +{ + "scorer": "neg_mean_squared_error", + "best_score_train": -0.2535049905518054, + "best_score_test": -0.24128661227361273, + "best_parameters": { + "max_features": "log2", + "min_samples_leaf": 1, + "n_estimators": 1000, + "max_depth": 18 + }, + "random_seed": [ + 1012, + 529, + 42 + ] +} \ No newline at end of file diff --git a/experiments/diabetes/stage1/params.json b/experiments/diabetes/stage1/params.json index 472e7382583edd3ce381470a7ba9902aff443f5d..2ade87a85a4ae7ca8a30eeea974173163e576848 100644 --- a/experiments/diabetes/stage1/params.json +++ 
b/experiments/diabetes/stage1/params.json @@ -4,9 +4,9 @@ "best_score_test": -3305.635542701523, "best_parameters": { "max_features": "auto", - "min_samples_leaf": "1", - "max_depth": "15", - "n_estimators": "108" + "min_samples_leaf": 1, + "max_depth": 15, + "n_estimators": 108 }, "random_seed": [ 661, diff --git a/experiments/digits/stage1/params.json b/experiments/digits/stage1/params.json index 2cf1c4b1f5cc91bda737fe2f68c81d37d2682f2a..845a1aec4ffdb77db0d9eda6ca712dbb80c5bfb9 100644 --- a/experiments/digits/stage1/params.json +++ b/experiments/digits/stage1/params.json @@ -4,9 +4,9 @@ "best_score_test": 0.9738888888888889, "best_parameters": { "max_features": "sqrt", - "min_samples_leaf": "1", - "n_estimators": "1000", - "max_depth": "20" + "min_samples_leaf": 1, + "n_estimators": 1000, + "max_depth": 20 }, "random_seed": [ 1, diff --git a/experiments/iris/stage1/params.json b/experiments/iris/stage1/params.json index a91c658dfdf58b7338807194dbbe1f01b70aa431..e3eacac482d4747992a15987eaaabc54ac23c13b 100644 --- a/experiments/iris/stage1/params.json +++ b/experiments/iris/stage1/params.json @@ -4,9 +4,9 @@ "best_score_test": 0.9155555555555556, "best_parameters": { "max_features": "sqrt", - "min_samples_leaf": "1", - "max_depth": "1", - "n_estimators": "1000" + "min_samples_leaf": 1, + "max_depth": 1, + "n_estimators": 1000 }, "random_seed": [ 771, diff --git a/experiments/iris/stage1/with_best_params_16.json b/experiments/iris/stage1/with_best_params_16.json new file mode 100644 index 0000000000000000000000000000000000000000..102999a98070b312cdb6c17d7767423fe4ab75b7 --- /dev/null +++ b/experiments/iris/stage1/with_best_params_16.json @@ -0,0 +1,36 @@ +{ + "experiment_configuration": null, + "experiment_configuration_path": "experiments", + "dataset_name": "iris", + "normalize_D": false, + "dataset_normalizer": "standard", + "forest_size": null, + "extracted_forest_size_samples": 4, + "models_dir": ".\\models", + "dev_size": 0.2, + "test_size": 0.2, + "random_seed_number": 1, + "seeds": [ + 1, + 2, + 3, + 4, + 5 + ], + "subsets_used": "train,dev", + "normalize_weights": false, + "verbose": false, + "skip_best_hyperparams": false, + "save_experiment_configuration": [ + "1", + "with_best_params" + ], + "job_number": -1, + "extracted_forest_size": [ + 200, + 400, + 600, + 800 + ], + "experiment_id": 16 +} \ No newline at end of file diff --git a/experiments/iris/stage1/wo_best_params_17.json b/experiments/iris/stage1/wo_best_params_17.json new file mode 100644 index 0000000000000000000000000000000000000000..0294d64c0f98e3399e1b76b544204b821196c2fb --- /dev/null +++ b/experiments/iris/stage1/wo_best_params_17.json @@ -0,0 +1,36 @@ +{ + "experiment_configuration": null, + "experiment_configuration_path": "experiments", + "dataset_name": "iris", + "normalize_D": false, + "dataset_normalizer": "standard", + "forest_size": null, + "extracted_forest_size_samples": 4, + "models_dir": ".\\models", + "dev_size": 0.2, + "test_size": 0.2, + "random_seed_number": 1, + "seeds": [ + 1, + 2, + 3, + 4, + 5 + ], + "subsets_used": "train,dev", + "normalize_weights": false, + "verbose": false, + "skip_best_hyperparams": true, + "save_experiment_configuration": [ + "1", + "wo_best_params" + ], + "job_number": -1, + "extracted_forest_size": [ + 20, + 40, + 60, + 80 + ], + "experiment_id": 17 +} \ No newline at end of file diff --git a/experiments/linnerud/stage1/params.json b/experiments/linnerud/stage1/params.json index a1573d389eb93336ada91ee31ac66a7de166cd33..7e45dc06e3b617f4fb98c6645b9d3a193ba9d44b 100644 
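
The two iris files above exercise both sides of the new `--save_experiment_configuration` logic in train.py: a named configuration is written under `experiments/{dataset_name}/stage{N}/{name}_{experiment_id}.json`, while runs without the flag fall back to the `unnamed` directory. A small sketch of the path resolution (a hypothetical `output_config_path` helper, simplified from the train.py hunk; not the project's exact code):

```python
import os

def output_config_path(config_root, dataset_name, save_experiment_configuration, experiment_id):
    if save_experiment_configuration:
        # Expects exactly two values: a stage id in [1, 4] and a name.
        stage, name = save_experiment_configuration
        return os.path.join(config_root, dataset_name, 'stage' + stage,
                            '{}_{}.json'.format(name, experiment_id))
    # Fallback: unnamed configurations are collected in an 'unnamed' directory.
    return os.path.join(config_root, 'unnamed', 'unnamed_{}.json'.format(experiment_id))

print(output_config_path('experiments', 'iris', ['1', 'with_best_params'], 16))
# experiments/iris/stage1/with_best_params_16.json  (the file added above)
print(output_config_path('experiments', 'iris', None, 18))
# experiments/unnamed/unnamed_18.json  (illustrative id)
```
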
diff --git a/experiments/linnerud/stage1/params.json b/experiments/linnerud/stage1/params.json
index a1573d389eb93336ada91ee31ac66a7de166cd33..7e45dc06e3b617f4fb98c6645b9d3a193ba9d44b 100644
--- a/experiments/linnerud/stage1/params.json
+++ b/experiments/linnerud/stage1/params.json
@@ -3,10 +3,10 @@
     "best_score_train": -223.81438159498393,
     "best_score_test": -262.4415311793658,
     "best_parameters": {
-        "max_depth": "1",
-        "min_samples_leaf": "1",
+        "max_depth": 1,
+        "min_samples_leaf": 1,
         "max_features": "sqrt",
-        "n_estimators": "1000"
+        "n_estimators": 1000
     },
     "random_seed": [
         1109,
diff --git a/experiments/olivetti_faces/stage1/params.json b/experiments/olivetti_faces/stage1/params.json
new file mode 100644
index 0000000000000000000000000000000000000000..c9d83bfd06cf6a9f18bb319538626b322f5b5b7b
--- /dev/null
+++ b/experiments/olivetti_faces/stage1/params.json
@@ -0,0 +1,28 @@
+{
+    "scorer": "accuracy",
+    "best_score_train": 0.8890625,
+    "best_score_test": 0.89,
+    "best_parameters": {
+        "max_features": "log2",
+        "min_samples_leaf": 1,
+        "n_estimators": 1000,
+        "max_depth": 18
+    },
+    "random_seed": [
+        899,
+        249,
+        1367,
+        942,
+        846,
+        1576,
+        285,
+        839,
+        1974,
+        1216,
+        540,
+        1292,
+        1642,
+        712,
+        1511
+    ]
+}
\ No newline at end of file
diff --git a/experiments/wine/stage1/params.json b/experiments/wine/stage1/params.json
index 99795a6caf3b2c0639df4d0bc3306f0906193309..2ede59c2abaa9ee572d84cd256eedb226ac80773 100644
--- a/experiments/wine/stage1/params.json
+++ b/experiments/wine/stage1/params.json
@@ -3,9 +3,9 @@
     "best_score_train": 0.9846607669616517,
     "best_score_test": 0.9796296296296295,
     "best_parameters": {
-        "max_depth": "20",
-        "min_samples_leaf": "1",
-        "n_estimators": "1000",
+        "max_depth": 20,
+        "min_samples_leaf": 1,
+        "n_estimators": 1000,
         "max_features": "log2"
     },
     "random_seed": [