Skip to content
Snippets Groups Projects
Commit 6a6cf747 authored by Charly Lamothe's avatar Charly Lamothe
Browse files

- Reduce the extracted forest sizes upper bound and number because OMP seems...

- Reduce the extracted forest sizes upper bound and number because OMP seems to converge only with small forest sizes;
- Add extraction_strategy parameter in order to save base forest and the forests trained with the same size as the extracted forest sizes used in the experiment that used OMP.
parent fd6dbc7b
No related branches found
No related tags found
1 merge request!9Resolve "Experiment pipeline"
This commit is part of merge request !9. Comments created here will be created in the context of that merge request.
......@@ -19,8 +19,8 @@ class DatasetLoader(object):
# Default configuration values for dataset loading / experiment setup.
DEFAULT_NORMALIZE_D = False
DEFAULT_DATASET_NORMALIZER = 'standard'
DEFAULT_FOREST_SIZE = 100
# Kept small (5 samples, stop at 10% of the forest size): OMP was observed to
# converge only with small extracted forest sizes.
DEFAULT_EXTRACTED_FOREST_SIZE_SAMPLES = 5
DEFAULT_EXTRACTED_FOREST_SIZE_STOP = 0.1
DEFAULT_DEV_SIZE = 0.2
DEFAULT_TEST_SIZE = 0.2
DEFAULT_RANDOM_SEED_NUMBER = 1
......
......@@ -3,6 +3,7 @@ from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
from bolsonaro.data.task import Task
from bolsonaro.models.model_parameters import ModelParameters
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import os
import pickle
......@@ -11,12 +12,33 @@ class ModelFactory(object):
@staticmethod
def build(task, model_parameters):
    """Build a model for the given task according to the extraction strategy.

    :param task: a Task value among BINARYCLASSIFICATION, REGRESSION and
        MULTICLASSIFICATION.
    :param model_parameters: ModelParameters carrying the extraction strategy,
        forest sizes, seed and hyperparameters.
    :return: an OMP-based forest when extraction_strategy == 'omp'; a plain
        random forest with n_estimators == extracted_forest_size when
        'random'; otherwise a plain random forest sized by the
        'n_estimators' hyperparameter (base forest, strategy 'none').
    :raise ValueError: if task is not one of the supported Task values.
    """
    if task not in [Task.BINARYCLASSIFICATION, Task.REGRESSION, Task.MULTICLASSIFICATION]:
        raise ValueError("Unsupported task '{}'".format(task))

    if task == Task.BINARYCLASSIFICATION:
        if model_parameters.extraction_strategy == 'omp':
            return OmpForestBinaryClassifier(model_parameters)
        elif model_parameters.extraction_strategy == 'random':
            return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size,
                random_state=model_parameters.seed)
        else:
            return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'],
                random_state=model_parameters.seed)
    elif task == Task.REGRESSION:
        if model_parameters.extraction_strategy == 'omp':
            return OmpForestRegressor(model_parameters)
        elif model_parameters.extraction_strategy == 'random':
            return RandomForestRegressor(n_estimators=model_parameters.extracted_forest_size,
                random_state=model_parameters.seed)
        else:
            return RandomForestRegressor(n_estimators=model_parameters.hyperparameters['n_estimators'],
                random_state=model_parameters.seed)
    elif task == Task.MULTICLASSIFICATION:
        if model_parameters.extraction_strategy == 'omp':
            return OmpForestMulticlassClassifier(model_parameters)
        elif model_parameters.extraction_strategy == 'random':
            return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size,
                random_state=model_parameters.seed)
        else:
            return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'],
                random_state=model_parameters.seed)
......@@ -5,13 +5,15 @@ import os
class ModelParameters(object):
def __init__(self, extracted_forest_size, normalize_D, subsets_used,
    normalize_weights, seed, hyperparameters, extraction_strategy):
    """Value object holding the parameters used to build and train one model.

    :param extracted_forest_size: number of trees to keep in the extracted forest.
    :param normalize_D: whether to normalize the D matrix.
    :param subsets_used: which dataset subsets feed forest/OMP training
        (e.g. 'train,dev' — see Trainer.init).
    :param normalize_weights: whether to normalize the OMP weights.
    :param seed: random seed forwarded to the underlying estimators.
    :param hyperparameters: dict of estimator hyperparameters
        (e.g. 'n_estimators').
    :param extraction_strategy: tree-extraction strategy, one of
        'omp', 'random' or 'none'.
    """
    self._extracted_forest_size = extracted_forest_size
    self._normalize_D = normalize_D
    self._subsets_used = subsets_used
    self._normalize_weights = normalize_weights
    self._seed = seed
    self._hyperparameters = hyperparameters
    self._extraction_strategy = extraction_strategy
@property
def extracted_forest_size(self):
......@@ -37,6 +39,10 @@ class ModelParameters(object):
def hyperparameters(self):
return self._hyperparameters
@property
def extraction_strategy(self):
    # Read-only accessor: name of the tree-extraction strategy this model was
    # configured with ('omp', 'random' or 'none' — validated at train time).
    return self._extraction_strategy
def save(self, directory_path, experiment_id):
    """Serialize every attribute of this object as JSON into
    `directory_path`/model_parameters_{experiment_id}.json."""
    file_name = 'model_parameters_{}.json'.format(experiment_id)
    save_obj_to_json(directory_path + os.sep + file_name,
                     self.__dict__)
......
......@@ -2,6 +2,7 @@ from bolsonaro.models.model_raw_results import ModelRawResults
from bolsonaro.error_handling.logger_factory import LoggerFactory
from . import LOG_PATH
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import time
import datetime
import numpy as np
......@@ -21,7 +22,11 @@ class Trainer(object):
self._logger = LoggerFactory.create(LOG_PATH, __name__)
def init(self, model):
if model.models_parameters.subsets_used == 'train,dev':
if type(model) in [RandomForestRegressor, RandomForestClassifier]:
self._X_forest = self._dataset.X_train
self._y_forest = self._dataset.y_train
self._logger.debug('Fitting the forest on train subset')
elif model.models_parameters.subsets_used == 'train,dev':
self._X_forest = self._dataset.X_train
self._y_forest = self._dataset.y_train
self._X_omp = self._dataset.X_dev
......@@ -49,6 +54,12 @@ class Trainer(object):
self._logger.debug('Training model using train set...')
self._begin_time = time.time()
if type(model) in [RandomForestRegressor, RandomForestClassifier]:
model.fit(
X=self._X_forest,
y=self._y_forest
)
else:
model.fit(
X_forest=self._X_forest,
y_forest=self._y_forest,
......@@ -62,6 +73,8 @@ class Trainer(object):
:param model: Object with
:param models_dir: Where the results will be saved
"""
score_func = model.score if type(model) in [RandomForestRegressor, RandomForestClassifier] \
else model.score_base_estimator
results = ModelRawResults(
model_object=model,
training_time=self._end_time - self._begin_time,
......@@ -69,10 +82,11 @@ class Trainer(object):
train_score=model.score(self._dataset.X_train, self._dataset.y_train),
dev_score=model.score(self._dataset.X_dev, self._dataset.y_dev),
test_score=model.score(self._dataset.X_test, self._dataset.y_test),
score_metric=model.DEFAULT_SCORE_METRIC, # TODO: resolve the used metric in a proper way
train_score_regressor=model.score_base_estimator(self._dataset.X_train, self._dataset.y_train),
dev_score_regressor=model.score_base_estimator(self._dataset.X_dev, self._dataset.y_dev),
test_score_regressor=model.score_base_estimator(self._dataset.X_test, self._dataset.y_test)
score_metric='mse' if type(model) in [RandomForestRegressor, RandomForestClassifier] \
else model.DEFAULT_SCORE_METRIC, # TODO: resolve the used metric in a proper way
train_score_regressor=score_func(self._dataset.X_train, self._dataset.y_train),
dev_score_regressor=score_func(self._dataset.X_dev, self._dataset.y_dev),
test_score_regressor=score_func(self._dataset.X_test, self._dataset.y_test)
)
results.save(models_dir)
self._logger.info("Base performance on test: {}".format(results.test_score_regressor))
......
......@@ -18,19 +18,29 @@ if __name__ == "__main__":
DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models'

parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
# NOTE: the flag was renamed from --stage_number to --stage; the stale
# duplicate add_argument calls (which would raise argparse.ArgumentError
# for the repeated --experiment_ids option) are removed.
parser.add_argument('--stage', nargs='?', type=int, required=True, help='Specify the stage number among [1, 4].')
parser.add_argument('--experiment_ids', nargs='+', type=int, required=True, help='Compute the results of the specified experiment id(s).' + \
    'stage=1: {{base_with_params}} {{random_with_params}} {{omp_with_params}} {{base_wo_params}} {{random_wo_params}} {{omp_wo_params}}')
parser.add_argument('--results_dir', nargs='?', type=str, default=DEFAULT_RESULTS_DIR, help='The output directory of the results.')
parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
args = parser.parse_args()

if args.stage not in list(range(1, 5)):
    raise ValueError('stage must be a supported stage id (i.e. [1, 4]).')

# Create recursively the results dir tree
pathlib.Path(args.results_dir).mkdir(parents=True, exist_ok=True)

if args.stage == 1:
    # First axis:
    #     base_with_params
    #     random_with_params
    #     omp_with_params
    # Second axis:
    #     base_wo_params
    #     random_wo_params
    #     omp_wo_params
    for experiment_id in args.experiment_ids:
        experiment_id_path = args.models_dir + os.sep + str(experiment_id)  # models/{experiment_id}
        # Create recursively the tree results/{experiment_id}
......@@ -50,7 +60,9 @@ if __name__ == "__main__":
experiment_score_metrics = list()

# For each seed results stored in models/{experiment_id}/seeds.
# Sort numerically (not lexicographically) so seeds are processed in a
# deterministic, human-expected order; the stale unsorted os.listdir loop
# left over from the diff is removed.
seeds = os.listdir(experiment_seed_root_path)
seeds.sort(key=int)
for seed in seeds:
    experiment_seed_path = experiment_seed_root_path + os.sep + seed  # models/{experiment_id}/seeds/{seed}
    dataset_parameters = DatasetParameters.load(experiment_seed_path, experiment_id)  # Load the dataset parameters of this experiment, with this specific seed
    dataset = DatasetLoader.load(dataset_parameters)  # Load the dataset using the previously loaded dataset parameters
......
......@@ -53,8 +53,8 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
trainer = Trainer(dataset)
if parameters['extraction_strategy'] != 'none':
for extracted_forest_size in parameters['extracted_forest_size']:
# question if training is too long, one may also split experiments for different forest sizes into different workers
logger.info('extracted_forest_size={}'.format(extracted_forest_size))
sub_models_dir = models_dir + os.sep + 'extracted_forest_size' + os.sep + str(extracted_forest_size)
pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
......@@ -65,7 +65,8 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
subsets_used=parameters['subsets_used'],
normalize_weights=parameters['normalize_weights'],
seed=seed,
hyperparameters=hyperparameters
hyperparameters=hyperparameters,
extraction_strategy=parameters['extraction_strategy']
)
model_parameters.save(sub_models_dir, experiment_id)
......@@ -74,9 +75,40 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
trainer.init(model)
trainer.train(model)
trainer.compute_results(model, sub_models_dir)
logger.info('Training done')
else:
forest_size = hyperparameters['n_estimators']
logger.info('Base forest training with fixed forest size of {}'.format(forest_size))
sub_models_dir = models_dir + os.sep + str(forest_size)
pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
model_parameters = ModelParameters(
extracted_forest_size=forest_size,
normalize_D=parameters['normalize_D'],
subsets_used=parameters['subsets_used'],
normalize_weights=parameters['normalize_weights'],
seed=seed,
hyperparameters=hyperparameters,
extraction_strategy=parameters['extraction_strategy']
)
model_parameters.save(sub_models_dir, experiment_id)
model = ModelFactory.build(dataset.task, model_parameters)
trainer.init(model)
trainer.train(model)
trainer.compute_results(model, sub_models_dir)
logger.info('Training done')
"""
Example for stage 1:
python code/train.py --dataset_name=california_housing --seeds 1 2 3 --extraction_strategy=none --save_experiment_configuration 1 none_with_params
python code/train.py --dataset_name=california_housing --seeds 1 2 3 --extraction_strategy=random --save_experiment_configuration 1 random_with_params
python code/train.py --dataset_name=california_housing --seeds 1 2 3 --save_experiment_configuration 1 omp_with_params
python code/train.py --dataset_name=california_housing --seeds 1 2 3 --extraction_strategy=none --skip_best_hyperparams --save_experiment_configuration 1 none_wo_params
python code/train.py --dataset_name=california_housing --seeds 1 2 3 --extraction_strategy=random --skip_best_hyperparams --save_experiment_configuration 1 random_wo_params
python code/train.py --dataset_name=california_housing --seeds 1 2 3 --skip_best_hyperparams --save_experiment_configuration 1 omp_wo_params
python code/compute_results.py --stage 1 --experiment_ids 1 2 3 4 5 6
"""
if __name__ == "__main__":
load_dotenv(find_dotenv('.env'))
DEFAULT_EXPERIMENT_CONFIGURATION_PATH = 'experiments'
......@@ -85,6 +117,7 @@ if __name__ == "__main__":
DEFAULT_VERBOSE = False
DEFAULT_SKIP_BEST_HYPERPARAMS = False
DEFAULT_JOB_NUMBER = -1
DEFAULT_EXTRACTION_STRATEGY = 'omp'
begin_random_seed_range = 1
end_random_seed_range = 2000
......@@ -109,6 +142,7 @@ if __name__ == "__main__":
parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters if there exist.')
parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line in a file. Args: {{stage_num}} {{name}}')
parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of job used during the parallelisation across seeds.')
parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random or none.')
args = parser.parse_args()
if args.experiment_configuration:
......@@ -118,6 +152,9 @@ if __name__ == "__main__":
else:
parameters = args.__dict__
# Fail fast on an unknown strategy. Bug fix: `parameters` is a dict, so the
# message must use subscript access — the previous attribute access
# (parameters.extraction_strategy) raised AttributeError instead of the
# intended ValueError.
if parameters['extraction_strategy'] not in ['omp', 'random', 'none']:
    raise ValueError('Specified extraction strategy {} is not supported.'.format(parameters['extraction_strategy']))
pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True)
logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment