Skip to content
Snippets Groups Projects
Commit f61314f3 authored by Charly Lamothe's avatar Charly Lamothe
Browse files

- Add standard dataset scaling for dataset normalization;

- Ignore unnamed experiment configuration file backups;
- Factorize default dataset loading parameters;
- Add missing return_X_y in basic dataset loaders.
parent 967742a6
Branches
No related tags found
1 merge request!9Resolve "Experiment pipeline"
models/* models/*
results/* results/*
experiments/unnamed/
*/.kile/* */.kile/*
*.kilepr *.kilepr
......
from bolsonaro.data.dataset import Dataset from bolsonaro.data.dataset import Dataset
from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.task import Task from bolsonaro.data.task import Task
from bolsonaro.utils import change_binary_func_load from bolsonaro.utils import change_binary_func_load
...@@ -9,10 +10,22 @@ from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \ ...@@ -9,10 +10,22 @@ from sklearn.datasets import fetch_olivetti_faces, fetch_20newsgroups, \
fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing fetch_covtype, fetch_rcv1, fetch_kddcup99, fetch_california_housing
from sklearn.model_selection import train_test_split from sklearn.model_selection import train_test_split
from sklearn import preprocessing from sklearn import preprocessing
import random
class DatasetLoader(object): class DatasetLoader(object):
DEFAULT_DATASET_NAME = 'boston'
DEFAULT_NORMALIZE_D = False
DEFAULT_DATASET_NORMALIZER = 'standard'
DEFAULT_FOREST_SIZE = 100
DEFAULT_EXTRACTED_FOREST_SIZE = 10
DEFAULT_DEV_SIZE = 0.2
DEFAULT_TEST_SIZE = 0.2
DEFAULT_RANDOM_SEED_NUMBER = 1
DEFAULT_SUBSETS_USED = 'train,dev'
DEFAULT_NORMALIZE_WEIGHTS = False
@staticmethod @staticmethod
def load(dataset_parameters): def load(dataset_parameters):
name = dataset_parameters.name name = dataset_parameters.name
...@@ -67,7 +80,7 @@ class DatasetLoader(object): ...@@ -67,7 +80,7 @@ class DatasetLoader(object):
raise ValueError("Unsupported dataset '{}'".format(name)) raise ValueError("Unsupported dataset '{}'".format(name))
if X is None: if X is None:
X, y = dataset_loading_func() X, y = dataset_loading_func(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, X_train, X_test, y_train, y_test = train_test_split(X, y,
test_size=dataset_parameters.test_size, test_size=dataset_parameters.test_size,
...@@ -93,3 +106,20 @@ class DatasetLoader(object): ...@@ -93,3 +106,20 @@ class DatasetLoader(object):
return Dataset(task, X_train, return Dataset(task, X_train,
X_dev, X_test, y_train, y_dev, y_test) X_dev, X_test, y_train, y_dev, y_test)
@staticmethod
def load_default(dataset_name, seed):
    """Load a dataset using the project-wide default parameters.

    Builds a DatasetParameters object from the DEFAULT_* class constants
    (test/dev split sizes and the standard normalizer) and delegates to
    DatasetLoader.load.

    :param dataset_name: name of the dataset to load (e.g. 'boston').
    :param seed: random state for the train/dev/test split; when None,
        a seed is drawn uniformly from [1, 2000].
    :return: the Dataset produced by DatasetLoader.load.
    """
    begin_random_seed_range = 1
    end_random_seed_range = 2000
    # Use `is None` rather than truthiness: a caller-supplied seed of 0
    # is a valid seed and must not be silently replaced by a random one.
    if seed is None:
        seed = random.randint(begin_random_seed_range, end_random_seed_range)
    dataset_parameters = DatasetParameters(
        name=dataset_name,
        test_size=DatasetLoader.DEFAULT_TEST_SIZE,
        dev_size=DatasetLoader.DEFAULT_DEV_SIZE,
        random_state=seed,
        dataset_normalizer=DatasetLoader.DEFAULT_DATASET_NORMALIZER
    )
    return DatasetLoader.load(dataset_parameters)
...@@ -54,29 +54,18 @@ if __name__ == "__main__": ...@@ -54,29 +54,18 @@ if __name__ == "__main__":
parser.add_argument('--seed', nargs='?', type=int, default=None, help='Specify a seed instead of generate it randomly.') parser.add_argument('--seed', nargs='?', type=int, default=None, help='Specify a seed instead of generate it randomly.')
parser.add_argument('--datasets', nargs='+', type=str, default=DATASET_LIST, help='Specify the dataset used by the estimator.') parser.add_argument('--datasets', nargs='+', type=str, default=DATASET_LIST, help='Specify the dataset used by the estimator.')
parser.add_argument('--verbose', action='store_true', default=False, help='Print information during the bayesian search.') parser.add_argument('--verbose', action='store_true', default=False, help='Print information during the bayesian search.')
args = parser.parse_args() args = parser.parse_args()
logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__)) logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))
begin_random_seed_range = 1
end_random_seed_range = 2000
if args.seed is None:
random_seed = random.randint(begin_random_seed_range, end_random_seed_range)
else:
random_seed = args.seed random_seed = args.seed
for dataset_name in args.datasets: for dataset_name in args.datasets:
dataset_dir = os.path.join('experiments', dataset_name, 'stage1') dataset_dir = os.path.join('experiments', dataset_name, 'stage1')
pathlib.Path(dataset_dir).mkdir(parents=True, exist_ok=True) pathlib.Path(dataset_dir).mkdir(parents=True, exist_ok=True)
logger.info('Bayesian search on dataset {}'.format(dataset_name)) logger.info('Bayesian search on dataset {}'.format(dataset_name))
dataset_parameters = DatasetParameters(dataset_name, test_size=0.2, dev_size=0.01, random_state=random_seed, dataset_normalizer=None) dataset = DatasetLoader.load_default(dataset_name, random_seed)
dataset = DatasetLoader.load(dataset_parameters)
if dataset.task == Task.REGRESSION: if dataset.task == Task.REGRESSION:
scorer = 'neg_mean_squared_error' scorer = 'neg_mean_squared_error'
......
...@@ -76,20 +76,9 @@ def process_job(seed, parameters, experiment_id, hyperparameters): ...@@ -76,20 +76,9 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
if __name__ == "__main__": if __name__ == "__main__":
load_dotenv(find_dotenv('.env')) load_dotenv(find_dotenv('.env'))
DEFAULT_EXPERIMENT_CONFIGURATION_PATH = 'experiments' DEFAULT_EXPERIMENT_CONFIGURATION_PATH = 'experiments'
DEFAULT_DATASET_NAME = 'boston'
DEFAULT_NORMALIZE_D = False
DEFAULT_DATASET_NORMALIZER = None
DEFAULT_FOREST_SIZE = 100
DEFAULT_EXTRACTED_FOREST_SIZE = 10
# the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_size/{nb_extracted_trees} # the models will be stored in a directory structure like: models/{experiment_id}/seeds/{seed_nb}/extracted_forest_size/{nb_extracted_trees}
DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models' DEFAULT_MODELS_DIR = os.environ['project_dir'] + os.sep + 'models'
DEFAULT_DEV_SIZE = 0.2
DEFAULT_TEST_SIZE = 0.2
DEFAULT_RANDOM_SEED_NUMBER = 1
DEFAULT_SUBSETS_USED = 'train,dev'
DEFAULT_NORMALIZE_WEIGHTS = False
begin_random_seed_range = 1 begin_random_seed_range = 1
end_random_seed_range = 2000 end_random_seed_range = 2000
...@@ -97,18 +86,18 @@ if __name__ == "__main__": ...@@ -97,18 +86,18 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--experiment_configuration', nargs='?', type=str, default=None, help='Specify an experiment configuration file name. Overload all other parameters.') parser.add_argument('--experiment_configuration', nargs='?', type=str, default=None, help='Specify an experiment configuration file name. Overload all other parameters.')
parser.add_argument('--experiment_configuration_path', nargs='?', type=str, default=DEFAULT_EXPERIMENT_CONFIGURATION_PATH, help='Specify the experiment configuration directory path.') parser.add_argument('--experiment_configuration_path', nargs='?', type=str, default=DEFAULT_EXPERIMENT_CONFIGURATION_PATH, help='Specify the experiment configuration directory path.')
parser.add_argument('--dataset_name', nargs='?', type=str, default=DEFAULT_DATASET_NAME, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.') parser.add_argument('--dataset_name', nargs='?', type=str, default=DatasetLoader.DEFAULT_DATASET_NAME, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
parser.add_argument('--normalize_D', action='store_true', default=DEFAULT_NORMALIZE_D, help='Specify if we want to normalize the prediction of the forest by doing the L2 division of the pred vectors.') parser.add_argument('--normalize_D', action='store_true', default=DatasetLoader.DEFAULT_NORMALIZE_D, help='Specify if we want to normalize the prediction of the forest by doing the L2 division of the pred vectors.')
parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=DEFAULT_DATASET_NORMALIZER, help='Specify which dataset normalizer use (either standard, minmax, robust or normalizer).') parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=DatasetLoader.DEFAULT_DATASET_NORMALIZER, help='Specify which dataset normalizer use (either standard, minmax, robust or normalizer).')
parser.add_argument('--forest_size', nargs='?', type=int, default=DEFAULT_FOREST_SIZE, help='The number of trees of the random forest.') parser.add_argument('--forest_size', nargs='?', type=int, default=DatasetLoader.DEFAULT_FOREST_SIZE, help='The number of trees of the random forest.')
parser.add_argument('--extracted_forest_size', nargs='+', type=int, default=DEFAULT_EXTRACTED_FOREST_SIZE, help='The number of trees selected by OMP.') parser.add_argument('--extracted_forest_size', nargs='+', type=int, default=DatasetLoader.DEFAULT_EXTRACTED_FOREST_SIZE, help='The number of trees selected by OMP.')
parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.') parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
parser.add_argument('--dev_size', nargs='?', type=float, default=DEFAULT_DEV_SIZE, help='Dev subset ratio.') parser.add_argument('--dev_size', nargs='?', type=float, default=DatasetLoader.DEFAULT_DEV_SIZE, help='Dev subset ratio.')
parser.add_argument('--test_size', nargs='?', type=float, default=DEFAULT_TEST_SIZE, help='Test subset ratio.') parser.add_argument('--test_size', nargs='?', type=float, default=DatasetLoader.DEFAULT_TEST_SIZE, help='Test subset ratio.')
parser.add_argument('--random_seed_number', nargs='?', type=int, default=DEFAULT_RANDOM_SEED_NUMBER, help='Number of random seeds used.') parser.add_argument('--random_seed_number', nargs='?', type=int, default=DatasetLoader.DEFAULT_RANDOM_SEED_NUMBER, help='Number of random seeds used.')
parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specific a list of seeds instead of generate them randomly') parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specific a list of seeds instead of generate them randomly')
parser.add_argument('--subsets_used', nargs='+', type=str, default=DEFAULT_SUBSETS_USED, help='train,dev: forest on train, OMP on dev. train+dev,train+dev: both forest and OMP on train+dev. train,train+dev: forest on train+dev and OMP on dev.') parser.add_argument('--subsets_used', nargs='+', type=str, default=DatasetLoader.DEFAULT_SUBSETS_USED, help='train,dev: forest on train, OMP on dev. train+dev,train+dev: both forest and OMP on train+dev. train,train+dev: forest on train+dev and OMP on dev.')
parser.add_argument('--normalize_weights', action='store_true', default=DEFAULT_NORMALIZE_WEIGHTS, help='Divide the predictions by the weights sum.') parser.add_argument('--normalize_weights', action='store_true', default=DatasetLoader.DEFAULT_NORMALIZE_WEIGHTS, help='Divide the predictions by the weights sum.')
args = parser.parse_args() args = parser.parse_args()
if args.experiment_configuration: if args.experiment_configuration:
...@@ -156,7 +145,7 @@ if __name__ == "__main__": ...@@ -156,7 +145,7 @@ if __name__ == "__main__":
keep trace of it. keep trace of it.
""" """
if args.experiment_configuration is None: if args.experiment_configuration is None:
with open(args.experiment_configuration_path + os.sep + 'unnamed_{}.json'.format( with open(args.experiment_configuration_path + os.sep + 'unnamed' + os.sep + 'unnamed_{}.json'.format(
experiment_id), 'w') as output_file: experiment_id), 'w') as output_file:
json.dump( json.dump(
parameters, parameters,
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment