Skip to content
Snippets Groups Projects
Commit 00ed2453 authored by Charly LAMOTHE's avatar Charly LAMOTHE
Browse files

Add dataset normalization, and distinguish D normalization and dataset normalization in the options

parent 2cc31d8c
No related branches found
No related tags found
1 merge request !3: clean scripts
...@@ -6,6 +6,7 @@ from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \ ...@@ -6,6 +6,7 @@ from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \
fetch_20newsgroups_vectorized, fetch_lfw_people, fetch_lfw_pairs, \ fetch_20newsgroups_vectorized, fetch_lfw_people, fetch_lfw_pairs, \
fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
from sklearn import preprocessing
class DatasetLoader(object): class DatasetLoader(object):
...@@ -71,9 +72,20 @@ class DatasetLoader(object): ...@@ -71,9 +72,20 @@ class DatasetLoader(object):
test_size=dataset_parameters.dev_size, test_size=dataset_parameters.dev_size,
random_state=dataset_parameters.random_state) random_state=dataset_parameters.random_state)
# TODO? if dataset_parameters.dataset_normalizer is not None:
if dataset_parameters.normalize: if dataset_parameters.dataset_normalizer == 'standard':
pass scaler = preprocessing.StandardScaler()
elif dataset_parameters.dataset_normalizer == 'minmax':
scaler = preprocessing.MinMaxScaler()
elif dataset_parameters.dataset_normalizer == 'robust':
scaler = preprocessing.RobustScaler()
elif dataset_parameters.dataset_normalizer == 'normalizer':
scaler = preprocessing.Normalizer()
else:
raise ValueError("Unsupported normalizer '{}'".format(dataset_parameters.dataset_normalizer))
X_train = scaler.fit_transform(X_train)
X_dev = scaler.transform(X_dev)
X_test = scaler.transform(X_test)
return Dataset(task, dataset_parameters, X_train, return Dataset(task, dataset_parameters, X_train,
X_dev, X_test, y_train, y_dev, y_test) X_dev, X_test, y_train, y_dev, y_test)
...@@ -4,13 +4,13 @@ import os ...@@ -4,13 +4,13 @@ import os
class DatasetParameters(object): class DatasetParameters(object):
def __init__(self, name, test_size, dev_size, random_state, normalize, train_on_subset): def __init__(self, name, test_size, dev_size, random_state, train_on_subset, dataset_normalizer):
self._name = name self._name = name
self._test_size = test_size self._test_size = test_size
self._dev_size = dev_size self._dev_size = dev_size
self._random_state = random_state self._random_state = random_state
self._normalize = normalize
self._train_on_subset = train_on_subset self._train_on_subset = train_on_subset
self._dataset_normalizer = dataset_normalizer
@property @property
def name(self): def name(self):
...@@ -28,14 +28,14 @@ class DatasetParameters(object): ...@@ -28,14 +28,14 @@ class DatasetParameters(object):
def random_state(self): def random_state(self):
return self._random_state return self._random_state
@property
def normalize(self):
return self._normalize
@property @property
def train_on_subset(self): def train_on_subset(self):
return self._train_on_subset return self._train_on_subset
@property
def dataset_normalizer(self):
return self._dataset_normalizer
def save(self, directory_path, experiment_id): def save(self, directory_path, experiment_id):
with open(directory_path + os.sep + 'dataset_parameters_{}.json'.format(experiment_id), 'w') as output_file: with open(directory_path + os.sep + 'dataset_parameters_{}.json'.format(experiment_id), 'w') as output_file:
json.dump({ json.dump({
...@@ -43,7 +43,7 @@ class DatasetParameters(object): ...@@ -43,7 +43,7 @@ class DatasetParameters(object):
'test_size': self._test_size, 'test_size': self._test_size,
'dev_size': self._dev_size, 'dev_size': self._dev_size,
'random_state': self._random_state, 'random_state': self._random_state,
'normalize': self._normalize, 'dataset_normalizer': self._dataset_normalizer,
'train_on_subset': self._train_on_subset 'train_on_subset': self._train_on_subset
}, },
output_file, output_file,
...@@ -58,6 +58,6 @@ class DatasetParameters(object): ...@@ -58,6 +58,6 @@ class DatasetParameters(object):
test_size=parameters['test_size'], test_size=parameters['test_size'],
dev_size=parameters['dev_size'], dev_size=parameters['dev_size'],
random_state=parameters['random_state'], random_state=parameters['random_state'],
normalize=parameters['normalize'], dataset_normalizer=parameters['dataset_normalizer'],
train_on_subset=parameters['train_on_subset'] train_on_subset=parameters['train_on_subset']
) )
...@@ -4,11 +4,11 @@ import os ...@@ -4,11 +4,11 @@ import os
class ModelParameters(object): class ModelParameters(object):
def __init__(self, forest_size, extracted_forest_size, normalize, seed=None): def __init__(self, forest_size, extracted_forest_size, normalize_D, seed=None):
self._forest_size = forest_size self._forest_size = forest_size
self._extracted_forest_size = extracted_forest_size self._extracted_forest_size = extracted_forest_size
self._seed = seed self._seed = seed
self._normalize = normalize self._normalize_D = normalize_D
@property @property
def forest_size(self): def forest_size(self):
...@@ -23,8 +23,8 @@ class ModelParameters(object): ...@@ -23,8 +23,8 @@ class ModelParameters(object):
return self._seed return self._seed
@property @property
def normalize(self): def normalize_D(self):
return self._normalize return self._normalize_D
def save(self, directory_path, experiment_id): def save(self, directory_path, experiment_id):
with open(directory_path + os.sep + 'model_parameters_{}.json'.format(experiment_id), 'w') as output_file: with open(directory_path + os.sep + 'model_parameters_{}.json'.format(experiment_id), 'w') as output_file:
...@@ -32,7 +32,7 @@ class ModelParameters(object): ...@@ -32,7 +32,7 @@ class ModelParameters(object):
'forest_size': self._forest_size, 'forest_size': self._forest_size,
'extracted_forest_size': self._extracted_forest_size, 'extracted_forest_size': self._extracted_forest_size,
'seed': self._seed, 'seed': self._seed,
'normalize': self._normalize 'normalize_D': self._normalize_D
}, },
output_file, output_file,
indent=4) indent=4)
...@@ -45,5 +45,5 @@ class ModelParameters(object): ...@@ -45,5 +45,5 @@ class ModelParameters(object):
forest_size=parameters['forest_size'], forest_size=parameters['forest_size'],
extracted_forest_size=parameters['extracted_forest_size'], extracted_forest_size=parameters['extracted_forest_size'],
seed=parameters['seed'], seed=parameters['seed'],
normalize=parameters['normalize'] normalize_D=parameters['normalize_D']
) )
...@@ -53,7 +53,7 @@ class OmpForestRegressor(BaseEstimator): ...@@ -53,7 +53,7 @@ class OmpForestRegressor(BaseEstimator):
""" """
D = self._forest_prediction(X) D = self._forest_prediction(X)
if self._models_parameters.normalize: if self._models_parameters.normalize_D:
D /= self._forest_norms D /= self._forest_norms
predictions = D @ self.weights predictions = D @ self.weights
...@@ -97,7 +97,7 @@ class OmpForestRegressor(BaseEstimator): ...@@ -97,7 +97,7 @@ class OmpForestRegressor(BaseEstimator):
self._logger.debug("Forest make prediction on X") self._logger.debug("Forest make prediction on X")
D = self._forest_prediction(X) D = self._forest_prediction(X)
if self._models_parameters.normalize: if self._models_parameters.normalize_D:
# question: maybe consider other kinds of normalization # question: maybe consider other kinds of normalization
self._logger.debug("Compute norm of predicted vectors on X") self._logger.debug("Compute norm of predicted vectors on X")
self._forest_norms = np.linalg.norm(D, axis=0) self._forest_norms = np.linalg.norm(D, axis=0)
......
...@@ -21,7 +21,8 @@ if __name__ == "__main__": ...@@ -21,7 +21,8 @@ if __name__ == "__main__":
default_dataset_name = 'boston' default_dataset_name = 'boston'
default_normalize = True default_normalize = True
default_wo_normalization = False default_normalize_D = False
default_dataset_normalizer = None
default_forest_size = 100 default_forest_size = 100
default_extracted_forest_size = 10 default_extracted_forest_size = 10
# the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_size/{nb_extracted_trees} # the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_size/{nb_extracted_trees}
...@@ -35,7 +36,8 @@ if __name__ == "__main__": ...@@ -35,7 +36,8 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--dataset_name', nargs='?', type=str, default=default_dataset_name, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.') parser.add_argument('--dataset_name', nargs='?', type=str, default=default_dataset_name, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
parser.add_argument('--wo_normalization', action='store_true', default=default_wo_normalization, help='Without normalizing the data by doing the L2 division of the pred vectors.') parser.add_argument('--normalize_D', action='store_true', default=default_normalize_D, help='Specify if we want to normalize the prediction of the forest by doing the L2 division of the pred vectors.')
parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=default_dataset_normalizer, help='Specify which dataset normalizer to use (either standard, minmax, robust or normalizer).')
parser.add_argument('--forest_size', nargs='?', type=int, default=default_forest_size, help='The number of trees of the random forest.') parser.add_argument('--forest_size', nargs='?', type=int, default=default_forest_size, help='The number of trees of the random forest.')
parser.add_argument('--extracted_forest_size', nargs='+', type=int, default=default_extracted_forest_size, help='The number of trees selected by OMP.') parser.add_argument('--extracted_forest_size', nargs='+', type=int, default=default_extracted_forest_size, help='The number of trees selected by OMP.')
parser.add_argument('--models_dir', nargs='?', type=str, default=default_models_dir, help='The output directory of the trained models.') parser.add_argument('--models_dir', nargs='?', type=str, default=default_models_dir, help='The output directory of the trained models.')
...@@ -61,9 +63,6 @@ if __name__ == "__main__": ...@@ -61,9 +63,6 @@ if __name__ == "__main__":
else [random.randint(begin_random_seed_range, end_random_seed_range) \ else [random.randint(begin_random_seed_range, end_random_seed_range) \
for i in range(args.random_seed_number)] for i in range(args.random_seed_number)]
normalize = default_normalize and args.wo_normalization is False
logger.debug('normalize={}'.format(normalize))
experiment_id = resolve_experiment_id(args.models_dir) experiment_id = resolve_experiment_id(args.models_dir)
experiment_id_str = str(experiment_id) experiment_id_str = str(experiment_id)
...@@ -80,7 +79,7 @@ if __name__ == "__main__": ...@@ -80,7 +79,7 @@ if __name__ == "__main__":
test_size=args.test_size, test_size=args.test_size,
dev_size=args.dev_size, dev_size=args.dev_size,
random_state=seed, random_state=seed,
normalize=normalize, dataset_normalizer=args.dataset_normalizer,
train_on_subset=args.train_on_subset train_on_subset=args.train_on_subset
) )
dataset_parameters.save(models_dir, experiment_id_str) dataset_parameters.save(models_dir, experiment_id_str)
...@@ -99,7 +98,7 @@ if __name__ == "__main__": ...@@ -99,7 +98,7 @@ if __name__ == "__main__":
forest_size=args.forest_size, forest_size=args.forest_size,
extracted_forest_size=extracted_forest_size, extracted_forest_size=extracted_forest_size,
seed=seed, seed=seed,
normalize=normalize normalize_D=args.normalize_D
) )
model_parameters.save(sub_models_dir, experiment_id) model_parameters.save(sub_models_dir, experiment_id)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment