diff --git a/.gitignore b/.gitignore index ed07278aa03dbf293f143b22d927fa9f08876edb..8b7c046e4145459df5142ca6463e9ff8e9ab3848 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ models/* results/* +experiments/unnamed/ */.kile/* *.kilepr diff --git a/code/bolsonaro/data/dataset_loader.py b/code/bolsonaro/data/dataset_loader.py index bac38444f24d5ab67158789ed52d475bec0b985e..f7b6d92de841cff0571c48c069494d968c443103 100644 --- a/code/bolsonaro/data/dataset_loader.py +++ b/code/bolsonaro/data/dataset_loader.py @@ -1,4 +1,5 @@ from bolsonaro.data.dataset import Dataset +from bolsonaro.data.dataset_parameters import DatasetParameters from bolsonaro.data.task import Task from bolsonaro.utils import change_binary_func_load @@ -9,10 +10,22 @@ from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \ fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing from sklearn.model_selection import train_test_split from sklearn import preprocessing +import random class DatasetLoader(object): + DEFAULT_DATASET_NAME = 'boston' + DEFAULT_NORMALIZE_D = False + DEFAULT_DATASET_NORMALIZER = 'standard' + DEFAULT_FOREST_SIZE = 100 + DEFAULT_EXTRACTED_FOREST_SIZE = 10 + DEFAULT_DEV_SIZE = 0.2 + DEFAULT_TEST_SIZE = 0.2 + DEFAULT_RANDOM_SEED_NUMBER = 1 + DEFAULT_SUBSETS_USED = 'train,dev' + DEFAULT_NORMALIZE_WEIGHTS = False + @staticmethod def load(dataset_parameters): name = dataset_parameters.name @@ -67,7 +80,7 @@ class DatasetLoader(object): raise ValueError("Unsupported dataset '{}'".format(name)) if X is None: - X, y = dataset_loading_func() + X, y = dataset_loading_func(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=dataset_parameters.test_size, @@ -93,3 +106,20 @@ class DatasetLoader(object): return Dataset(task, X_train, X_dev, X_test, y_train, y_dev, y_test) + + @staticmethod + def load_default(dataset_name, seed): + begin_random_seed_range = 1 + end_random_seed_range = 2000 + + seed = seed if seed else random.randint(begin_random_seed_range, end_random_seed_range) + + dataset_parameters = DatasetParameters( + name=dataset_name, + test_size=DatasetLoader.DEFAULT_TEST_SIZE, + dev_size=DatasetLoader.DEFAULT_DEV_SIZE, + random_state=seed, + dataset_normalizer=DatasetLoader.DEFAULT_DATASET_NORMALIZER + ) + + return DatasetLoader.load(dataset_parameters) diff --git a/code/compute_hyperparameters.py b/code/compute_hyperparameters.py index 0f7aa3a666d61618a3a5d50b1de8e996c235034c..510fd9891f4f986e0ec7e10ea0bc72a18fcf53a6 100644 --- a/code/compute_hyperparameters.py +++ b/code/compute_hyperparameters.py @@ -54,29 +54,18 @@ if __name__ == "__main__": parser.add_argument('--seed', nargs='?', type=int, default=None, help='Specify a seed instead of generate it randomly.') parser.add_argument('--datasets', nargs='+', type=str, default=DATASET_LIST, help='Specify the dataset used by the estimator.') parser.add_argument('--verbose', action='store_true', default=False, help='Print information during the bayesian search.') - args = parser.parse_args() logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__)) - - begin_random_seed_range = 1 - end_random_seed_range = 2000 - - if args.seed is None: - random_seed = random.randint(begin_random_seed_range, end_random_seed_range) - else: - random_seed = args.seed + random_seed = args.seed for dataset_name in args.datasets: - dataset_dir = os.path.join('experiments', dataset_name, 'stage1') - pathlib.Path(dataset_dir).mkdir(parents=True, exist_ok=True) logger.info('Bayesian search on dataset {}'.format(dataset_name)) - dataset_parameters = DatasetParameters(dataset_name, test_size=0.2, dev_size=0.01, random_state=random_seed, dataset_normalizer=None) - dataset = DatasetLoader.load(dataset_parameters) + dataset = DatasetLoader.load_default(dataset_name, random_seed) if dataset.task == Task.REGRESSION: scorer = 'neg_mean_squared_error' diff --git a/code/train.py b/code/train.py index 0d9713252b0e5e2345331952edaca6adfa5424c0..507c7ddf69df819caa746dfa0ec106f183625c16 100644 --- a/code/train.py +++ b/code/train.py @@ -76,20 +76,9 @@ def process_job(seed, parameters, experiment_id, hyperparameters): if __name__ == "__main__": load_dotenv(find_dotenv('.env')) - DEFAULT_EXPERIMENT_CONFIGURATION_PATH = 'experiments' - DEFAULT_DATASET_NAME = 'boston' - DEFAULT_NORMALIZE_D = False - DEFAULT_DATASET_NORMALIZER = None - DEFAULT_FOREST_SIZE = 100 - DEFAULT_EXTRACTED_FOREST_SIZE = 10 # the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_size/{nb_extracted_trees} - DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models' - DEFAULT_DEV_SIZE = 0.2 - DEFAULT_TEST_SIZE = 0.2 - DEFAULT_RANDOM_SEED_NUMBER = 1 - DEFAULT_SUBSETS_USED = 'train,dev' - DEFAULT_NORMALIZE_WEIGHTS = False + DEFAULT_MODELS_DIR = os.environ['project_dir'] + os.sep + 'models' begin_random_seed_range = 1 end_random_seed_range = 2000 @@ -97,18 +86,18 @@ if __name__ == "__main__": parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--experiment_configuration', nargs='?', type=str, default=None, help='Specify an experiment configuration file name. Overload all other parameters.') parser.add_argument('--experiment_configuration_path', nargs='?', type=str, default=DEFAULT_EXPERIMENT_CONFIGURATION_PATH, help='Specify the experiment configuration directory path.') - parser.add_argument('--dataset_name', nargs='?', type=str, default=DEFAULT_DATASET_NAME, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.') - parser.add_argument('--normalize_D', action='store_true', default=DEFAULT_NORMALIZE_D, help='Specify if we want to normalize the prediction of the forest by doing the L2 division of the pred vectors.') - parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=DEFAULT_DATASET_NORMALIZER, help='Specify which dataset normalizer use (either standard, minmax, robust or normalizer).') - parser.add_argument('--forest_size', nargs='?', type=int, default=DEFAULT_FOREST_SIZE, help='The number of trees of the random forest.') - parser.add_argument('--extracted_forest_size', nargs='+', type=int, default=DEFAULT_EXTRACTED_FOREST_SIZE, help='The number of trees selected by OMP.') + parser.add_argument('--dataset_name', nargs='?', type=str, default=DatasetLoader.DEFAULT_DATASET_NAME, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.') + parser.add_argument('--normalize_D', action='store_true', default=DatasetLoader.DEFAULT_NORMALIZE_D, help='Specify if we want to normalize the prediction of the forest by doing the L2 division of the pred vectors.') + parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=DatasetLoader.DEFAULT_DATASET_NORMALIZER, help='Specify which dataset normalizer use (either standard, minmax, robust or normalizer).') + parser.add_argument('--forest_size', nargs='?', type=int, default=DatasetLoader.DEFAULT_FOREST_SIZE, help='The number of trees of the random forest.') + parser.add_argument('--extracted_forest_size', nargs='+', type=int, default=DatasetLoader.DEFAULT_EXTRACTED_FOREST_SIZE, help='The number of trees selected by OMP.') parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.') - parser.add_argument('--dev_size', nargs='?', type=float, default=DEFAULT_DEV_SIZE, help='Dev subset ratio.') - parser.add_argument('--test_size', nargs='?', type=float, default=DEFAULT_TEST_SIZE, help='Test subset ratio.') - parser.add_argument('--random_seed_number', nargs='?', type=int, default=DEFAULT_RANDOM_SEED_NUMBER, help='Number of random seeds used.') + parser.add_argument('--dev_size', nargs='?', type=float, default=DatasetLoader.DEFAULT_DEV_SIZE, help='Dev subset ratio.') + parser.add_argument('--test_size', nargs='?', type=float, default=DatasetLoader.DEFAULT_TEST_SIZE, help='Test subset ratio.') + parser.add_argument('--random_seed_number', nargs='?', type=int, default=DatasetLoader.DEFAULT_RANDOM_SEED_NUMBER, help='Number of random seeds used.') parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specific a list of seeds instead of generate them randomly') - parser.add_argument('--subsets_used', nargs='+', type=str, default=DEFAULT_SUBSETS_USED, help='train,dev: forest on train, OMP on dev. train+dev,train+dev: both forest and OMP on train+dev. train,train+dev: forest on train+dev and OMP on dev.') - parser.add_argument('--normalize_weights', action='store_true', default=DEFAULT_NORMALIZE_WEIGHTS, help='Divide the predictions by the weights sum.') + parser.add_argument('--subsets_used', nargs='+', type=str, default=DatasetLoader.DEFAULT_SUBSETS_USED, help='train,dev: forest on train, OMP on dev. train+dev,train+dev: both forest and OMP on train+dev. train,train+dev: forest on train+dev and OMP on dev.') + parser.add_argument('--normalize_weights', action='store_true', default=DatasetLoader.DEFAULT_NORMALIZE_WEIGHTS, help='Divide the predictions by the weights sum.') args = parser.parse_args() if args.experiment_configuration: @@ -156,7 +145,7 @@ if __name__ == "__main__": keep trace of it. """ if args.experiment_configuration is None: - with open(args.experiment_configuration_path + os.sep + 'unnamed_{}.json'.format( + with open(args.experiment_configuration_path + os.sep + 'unnamed' + os.sep + 'unnamed_{}.json'.format( experiment_id), 'w') as output_file: json.dump( parameters,