Skip to content
Snippets Groups Projects
Commit 00ed2453 authored by Charly LAMOTHE
Browse files

Add dataset normalization, and distinguish between D normalization and dataset normalization in the options

parent 2cc31d8c
No related branches found
No related tags found
1 merge request: !3 clean scripts
......@@ -6,6 +6,7 @@ from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \
fetch_20newsgroups_vectorized, fetch_lfw_people, fetch_lfw_pairs, \
fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
class DatasetLoader(object):
......@@ -71,9 +72,20 @@ class DatasetLoader(object):
test_size=dataset_parameters.dev_size,
random_state=dataset_parameters.random_state)
# TODO?
if dataset_parameters.normalize:
pass
if dataset_parameters.dataset_normalizer is not None:
if dataset_parameters.dataset_normalizer == 'standard':
scaler = preprocessing.StandardScaler()
elif dataset_parameters.dataset_normalizer == 'minmax':
scaler = preprocessing.MinMaxScaler()
elif dataset_parameters.dataset_normalizer == 'robust':
scaler = preprocessing.RobustScaler()
elif dataset_parameters.dataset_normalizer == 'normalizer':
scaler = preprocessing.Normalizer()
else:
raise ValueError("Unsupported normalizer '{}'".format(dataset_parameters.dataset_normalizer))
X_train = scaler.fit_transform(X_train)
X_dev = scaler.transform(X_dev)
X_test = scaler.transform(X_test)
return Dataset(task, dataset_parameters, X_train,
X_dev, X_test, y_train, y_dev, y_test)
......@@ -4,13 +4,13 @@ import os
class DatasetParameters(object):
def __init__(self, name, test_size, dev_size, random_state, normalize, train_on_subset):
def __init__(self, name, test_size, dev_size, random_state, train_on_subset, dataset_normalizer):
self._name = name
self._test_size = test_size
self._dev_size = dev_size
self._random_state = random_state
self._normalize = normalize
self._train_on_subset = train_on_subset
self._dataset_normalizer = dataset_normalizer
@property
def name(self):
......@@ -28,14 +28,14 @@ class DatasetParameters(object):
def random_state(self):
return self._random_state
@property
def normalize(self):
return self._normalize
@property
def train_on_subset(self):
return self._train_on_subset
@property
def dataset_normalizer(self):
return self._dataset_normalizer
def save(self, directory_path, experiment_id):
with open(directory_path + os.sep + 'dataset_parameters_{}.json'.format(experiment_id), 'w') as output_file:
json.dump({
......@@ -43,7 +43,7 @@ class DatasetParameters(object):
'test_size': self._test_size,
'dev_size': self._dev_size,
'random_state': self._random_state,
'normalize': self._normalize,
'dataset_normalizer': self._dataset_normalizer,
'train_on_subset': self._train_on_subset
},
output_file,
......@@ -58,6 +58,6 @@ class DatasetParameters(object):
test_size=parameters['test_size'],
dev_size=parameters['dev_size'],
random_state=parameters['random_state'],
normalize=parameters['normalize'],
dataset_normalizer=parameters['dataset_normalizer'],
train_on_subset=parameters['train_on_subset']
)
......@@ -4,11 +4,11 @@ import os
class ModelParameters(object):
def __init__(self, forest_size, extracted_forest_size, normalize, seed=None):
def __init__(self, forest_size, extracted_forest_size, normalize_D, seed=None):
self._forest_size = forest_size
self._extracted_forest_size = extracted_forest_size
self._seed = seed
self._normalize = normalize
self._normalize_D = normalize_D
@property
def forest_size(self):
......@@ -23,8 +23,8 @@ class ModelParameters(object):
return self._seed
@property
def normalize(self):
return self._normalize
def normalize_D(self):
return self._normalize_D
def save(self, directory_path, experiment_id):
with open(directory_path + os.sep + 'model_parameters_{}.json'.format(experiment_id), 'w') as output_file:
......@@ -32,7 +32,7 @@ class ModelParameters(object):
'forest_size': self._forest_size,
'extracted_forest_size': self._extracted_forest_size,
'seed': self._seed,
'normalize': self._normalize
'normalize_D': self._normalize_D
},
output_file,
indent=4)
......@@ -45,5 +45,5 @@ class ModelParameters(object):
forest_size=parameters['forest_size'],
extracted_forest_size=parameters['extracted_forest_size'],
seed=parameters['seed'],
normalize=parameters['normalize']
normalize_D=parameters['normalize_D']
)
......@@ -53,7 +53,7 @@ class OmpForestRegressor(BaseEstimator):
"""
D = self._forest_prediction(X)
if self._models_parameters.normalize:
if self._models_parameters.normalize_D:
D /= self._forest_norms
predictions = D @ self.weights
......@@ -97,7 +97,7 @@ class OmpForestRegressor(BaseEstimator):
self._logger.debug("Forest make prediction on X")
D = self._forest_prediction(X)
if self._models_parameters.normalize:
if self._models_parameters.normalize_D:
# question: maybe consider other kinds of normalization
self._logger.debug("Compute norm of predicted vectors on X")
self._forest_norms = np.linalg.norm(D, axis=0)
......
......@@ -21,7 +21,8 @@ if __name__ == "__main__":
default_dataset_name = 'boston'
default_normalize = True
default_wo_normalization = False
default_normalize_D = False
default_dataset_normalizer = None
default_forest_size = 100
default_extracted_forest_size = 10
# the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_size/{nb_extracted_trees}
......@@ -35,7 +36,8 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--dataset_name', nargs='?', type=str, default=default_dataset_name, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
parser.add_argument('--wo_normalization', action='store_true', default=default_wo_normalization, help='Withouyt normalize the data by doing the L2 division of the pred vectors.')
parser.add_argument('--normalize_D', action='store_true', default=default_normalize_D, help='Specify if we want to normalize the prediction of the forest by doing the L2 division of the pred vectors.')
parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=default_dataset_normalizer, help='Specify which dataset normalizer use (either standard, minmax, robust or normalizer).')
parser.add_argument('--forest_size', nargs='?', type=int, default=default_forest_size, help='The number of trees of the random forest.')
parser.add_argument('--extracted_forest_size', nargs='+', type=int, default=default_extracted_forest_size, help='The number of trees selected by OMP.')
parser.add_argument('--models_dir', nargs='?', type=str, default=default_models_dir, help='The output directory of the trained models.')
......@@ -61,9 +63,6 @@ if __name__ == "__main__":
else [random.randint(begin_random_seed_range, end_random_seed_range) \
for i in range(args.random_seed_number)]
normalize = default_normalize and args.wo_normalization is False
logger.debug('normalize={}'.format(normalize))
experiment_id = resolve_experiment_id(args.models_dir)
experiment_id_str = str(experiment_id)
......@@ -80,7 +79,7 @@ if __name__ == "__main__":
test_size=args.test_size,
dev_size=args.dev_size,
random_state=seed,
normalize=normalize,
dataset_normalizer=args.dataset_normalizer,
train_on_subset=args.train_on_subset
)
dataset_parameters.save(models_dir, experiment_id_str)
......@@ -99,7 +98,7 @@ if __name__ == "__main__":
forest_size=args.forest_size,
extracted_forest_size=extracted_forest_size,
seed=seed,
normalize=normalize
normalize_D=args.normalize_D
)
model_parameters.save(sub_models_dir, experiment_id)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment