diff --git a/code/bolsonaro/data/dataset_loader.py b/code/bolsonaro/data/dataset_loader.py
index b76f7408fe8c0b7937bc5d84d0bab397e441ff09..d706b7a07751f715b6398b1f451ec9f337d00f60 100644
--- a/code/bolsonaro/data/dataset_loader.py
+++ b/code/bolsonaro/data/dataset_loader.py
@@ -6,6 +6,7 @@ from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \
     fetch_20newsgroups_vectorized, fetch_lfw_people, fetch_lfw_pairs, \
     fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing
 from sklearn.model_selection import train_test_split
+from sklearn import preprocessing
 
 
 class DatasetLoader(object):
@@ -71,9 +72,20 @@ class DatasetLoader(object):
             test_size=dataset_parameters.dev_size,
             random_state=dataset_parameters.random_state)
 
-        # TODO?
-        if dataset_parameters.normalize:
-            pass
+        if dataset_parameters.dataset_normalizer is not None:
+            if dataset_parameters.dataset_normalizer == 'standard':
+                scaler = preprocessing.StandardScaler()
+            elif dataset_parameters.dataset_normalizer == 'minmax':
+                scaler = preprocessing.MinMaxScaler()
+            elif dataset_parameters.dataset_normalizer == 'robust':
+                scaler = preprocessing.RobustScaler()
+            elif dataset_parameters.dataset_normalizer == 'normalizer':
+                scaler = preprocessing.Normalizer()
+            else:
+                raise ValueError("Unsupported normalizer '{}'".format(dataset_parameters.dataset_normalizer))
+            X_train = scaler.fit_transform(X_train)
+            X_dev = scaler.transform(X_dev)
+            X_test = scaler.transform(X_test)
 
         return Dataset(task, dataset_parameters, X_train, X_dev, X_test,
                        y_train, y_dev, y_test)
diff --git a/code/bolsonaro/data/dataset_parameters.py b/code/bolsonaro/data/dataset_parameters.py
index d5a1145d3dc4a3796dbdd1b38aa50325262b7e3e..9e7e7def56f205e342257bcb8849ece952177042 100644
--- a/code/bolsonaro/data/dataset_parameters.py
+++ b/code/bolsonaro/data/dataset_parameters.py
@@ -4,13 +4,13 @@ import os
 
 class DatasetParameters(object):
 
-    def __init__(self, name, test_size, dev_size, random_state, normalize, train_on_subset):
+    def __init__(self, name, test_size, dev_size, random_state, train_on_subset, dataset_normalizer):
         self._name = name
         self._test_size = test_size
         self._dev_size = dev_size
         self._random_state = random_state
-        self._normalize = normalize
         self._train_on_subset = train_on_subset
+        self._dataset_normalizer = dataset_normalizer
 
     @property
     def name(self):
@@ -28,14 +28,14 @@ class DatasetParameters(object):
     def random_state(self):
         return self._random_state
 
-    @property
-    def normalize(self):
-        return self._normalize
-
     @property
     def train_on_subset(self):
         return self._train_on_subset
 
+    @property
+    def dataset_normalizer(self):
+        return self._dataset_normalizer
+
     def save(self, directory_path, experiment_id):
         with open(directory_path + os.sep + 'dataset_parameters_{}.json'.format(experiment_id), 'w') as output_file:
             json.dump({
@@ -43,7 +43,7 @@ class DatasetParameters(object):
                 'test_size': self._test_size,
                 'dev_size': self._dev_size,
                 'random_state': self._random_state,
-                'normalize': self._normalize,
+                'dataset_normalizer': self._dataset_normalizer,
                 'train_on_subset': self._train_on_subset
             },
             output_file,
@@ -58,6 +58,6 @@ class DatasetParameters(object):
             test_size=parameters['test_size'],
             dev_size=parameters['dev_size'],
             random_state=parameters['random_state'],
-            normalize=parameters['normalize'],
+            dataset_normalizer=parameters['dataset_normalizer'],
             train_on_subset=parameters['train_on_subset']
         )
diff --git a/code/bolsonaro/models/model_parameters.py b/code/bolsonaro/models/model_parameters.py
index 838253255e2a9a71290633e026090b70e680e201..93bb177270bb1d261234664a5b9ee06898e00165 100644
--- a/code/bolsonaro/models/model_parameters.py
+++ b/code/bolsonaro/models/model_parameters.py
@@ -4,11 +4,11 @@ import os
 
 class ModelParameters(object):
 
-    def __init__(self, forest_size, extracted_forest_size, normalize, seed=None):
+    def __init__(self, forest_size, extracted_forest_size, normalize_D, seed=None):
         self._forest_size = forest_size
         self._extracted_forest_size = extracted_forest_size
         self._seed = seed
-        self._normalize = normalize
+        self._normalize_D = normalize_D
 
     @property
     def forest_size(self):
@@ -23,8 +23,8 @@
         return self._seed
 
     @property
-    def normalize(self):
-        return self._normalize
+    def normalize_D(self):
+        return self._normalize_D
 
     def save(self, directory_path, experiment_id):
         with open(directory_path + os.sep + 'model_parameters_{}.json'.format(experiment_id), 'w') as output_file:
@@ -32,7 +32,7 @@
                 'forest_size': self._forest_size,
                 'extracted_forest_size': self._extracted_forest_size,
                 'seed': self._seed,
-                'normalize': self._normalize
+                'normalize_D': self._normalize_D
             },
             output_file,
             indent=4)
@@ -45,5 +45,5 @@ ModelParameters(
             forest_size=parameters['forest_size'],
             extracted_forest_size=parameters['extracted_forest_size'],
             seed=parameters['seed'],
-            normalize=parameters['normalize']
+            normalize_D=parameters['normalize_D']
         )
diff --git a/code/bolsonaro/models/omp_forest_regressor.py b/code/bolsonaro/models/omp_forest_regressor.py
index f5e2b35f23f931a06ca88ce181434f4ff98cce23..5c79c2a4623c0dabe2dd4bdaf47329174f605f3e 100644
--- a/code/bolsonaro/models/omp_forest_regressor.py
+++ b/code/bolsonaro/models/omp_forest_regressor.py
@@ -53,7 +53,7 @@ class OmpForestRegressor(BaseEstimator):
         """
         D = self._forest_prediction(X)
 
-        if self._models_parameters.normalize:
+        if self._models_parameters.normalize_D:
             D /= self._forest_norms
 
         predictions = D @ self.weights
@@ -97,7 +97,7 @@ class OmpForestRegressor(BaseEstimator):
         self._logger.debug("Forest make prediction on X")
         D = self._forest_prediction(X)
 
-        if self._models_parameters.normalize:
+        if self._models_parameters.normalize_D:
             # question: maybe consider other kinds of normalization
             self._logger.debug("Compute norm of predicted vectors on X")
             self._forest_norms = np.linalg.norm(D, axis=0)
diff --git a/code/train.py b/code/train.py
index a410e53bb4ccefc90f5f976617da37e8b709cb40..5f854b68afedea8574001d7c26ef4deaee46f248 100644
--- a/code/train.py
+++ b/code/train.py
@@ -21,7 +21,8 @@ if __name__ == "__main__":
 
     default_dataset_name = 'boston'
     default_normalize = True
-    default_wo_normalization = False
+    default_normalize_D = False
+    default_dataset_normalizer = None
     default_forest_size = 100
     default_extracted_forest_size = 10
     # the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_size/{nb_extracted_trees}
@@ -35,7 +36,8 @@ if __name__ == "__main__":
 
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument('--dataset_name', nargs='?', type=str, default=default_dataset_name, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
-    parser.add_argument('--wo_normalization', action='store_true', default=default_wo_normalization, help='Withouyt normalize the data by doing the L2 division of the pred vectors.')
+    parser.add_argument('--normalize_D', action='store_true', default=default_normalize_D, help='Specify whether to normalize the forest predictions by dividing each prediction vector by its L2 norm.')
+    parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=default_dataset_normalizer, help='Specify which dataset normalizer to use (either standard, minmax, robust or normalizer).')
     parser.add_argument('--forest_size', nargs='?', type=int, default=default_forest_size, help='The number of trees of the random forest.')
    parser.add_argument('--extracted_forest_size', nargs='+', type=int, default=default_extracted_forest_size, help='The number of trees selected by OMP.')
     parser.add_argument('--models_dir', nargs='?', type=str, default=default_models_dir, help='The output directory of the trained models.')
@@ -61,9 +63,6 @@ if __name__ == "__main__":
         else [random.randint(begin_random_seed_range, end_random_seed_range) \
         for i in range(args.random_seed_number)]
 
-    normalize = default_normalize and args.wo_normalization is False
-    logger.debug('normalize={}'.format(normalize))
-
     experiment_id = resolve_experiment_id(args.models_dir)
     experiment_id_str = str(experiment_id)
 
@@ -80,7 +79,7 @@ if __name__ == "__main__":
             test_size=args.test_size,
             dev_size=args.dev_size,
             random_state=seed,
-            normalize=normalize,
+            dataset_normalizer=args.dataset_normalizer,
             train_on_subset=args.train_on_subset
         )
         dataset_parameters.save(models_dir, experiment_id_str)
@@ -99,7 +98,7 @@ if __name__ == "__main__":
                 forest_size=args.forest_size,
                 extracted_forest_size=extracted_forest_size,
                 seed=seed,
-                normalize=normalize
+                normalize_D=args.normalize_D
             )
             model_parameters.save(sub_models_dir, experiment_id)
 
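For reference, a minimal usage sketch of the two renamed knobs introduced by this patch. It is illustrative only and not part of the diff: the dataset name, split sizes and seed are made-up example values, and it assumes the bolsonaro package under code/ is importable; train.py builds the same objects from the --dataset_normalizer and --normalize_D command-line flags.

# Illustrative sketch (not part of the patch): constructor signatures follow the
# classes changed above; the concrete values below are arbitrary examples.
from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.models.model_parameters import ModelParameters

# Input-feature scaling, equivalent to passing --dataset_normalizer standard to train.py.
dataset_parameters = DatasetParameters(
    name='boston',
    test_size=0.2,
    dev_size=0.2,
    random_state=42,
    train_on_subset=False,
    dataset_normalizer='standard'  # 'standard', 'minmax', 'robust', 'normalizer' or None
)

# Normalization of the forest prediction matrix D, equivalent to passing --normalize_D.
model_parameters = ModelParameters(
    forest_size=100,
    extracted_forest_size=10,
    seed=42,
    normalize_D=True  # divide each prediction vector (column of D) by its L2 norm
)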