Skip to content
Snippets Groups Projects
Commit 6a6cf747 authored by Charly Lamothe's avatar Charly Lamothe
Browse files

- Reduce the extracted forest sizes upper bound and number because OMP seems...

- Reduce the extracted forest sizes upper bound and number because OMP seems to converge only with small forest sizes;
- Add an extraction_strategy parameter so we can also save the base forest, as well as forests trained directly at the same sizes as the extracted forest sizes used in the OMP experiment.
parent fd6dbc7b
No related branches found
No related tags found
1 merge request!9Resolve "Experiment pipeline"
...@@ -19,8 +19,8 @@ class DatasetLoader(object): ...@@ -19,8 +19,8 @@ class DatasetLoader(object):
DEFAULT_NORMALIZE_D = False DEFAULT_NORMALIZE_D = False
DEFAULT_DATASET_NORMALIZER = 'standard' DEFAULT_DATASET_NORMALIZER = 'standard'
DEFAULT_FOREST_SIZE = 100 DEFAULT_FOREST_SIZE = 100
DEFAULT_EXTRACTED_FOREST_SIZE_SAMPLES = 10 DEFAULT_EXTRACTED_FOREST_SIZE_SAMPLES = 5
DEFAULT_EXTRACTED_FOREST_SIZE_STOP = 0.3 DEFAULT_EXTRACTED_FOREST_SIZE_STOP = 0.1
DEFAULT_DEV_SIZE = 0.2 DEFAULT_DEV_SIZE = 0.2
DEFAULT_TEST_SIZE = 0.2 DEFAULT_TEST_SIZE = 0.2
DEFAULT_RANDOM_SEED_NUMBER = 1 DEFAULT_RANDOM_SEED_NUMBER = 1
......
...@@ -3,6 +3,7 @@ from bolsonaro.models.omp_forest_regressor import OmpForestRegressor ...@@ -3,6 +3,7 @@ from bolsonaro.models.omp_forest_regressor import OmpForestRegressor
from bolsonaro.data.task import Task from bolsonaro.data.task import Task
from bolsonaro.models.model_parameters import ModelParameters from bolsonaro.models.model_parameters import ModelParameters
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import os import os
import pickle import pickle
...@@ -11,12 +12,33 @@ class ModelFactory(object): ...@@ -11,12 +12,33 @@ class ModelFactory(object):
@staticmethod @staticmethod
def build(task, model_parameters): def build(task, model_parameters):
if task not in [Task.BINARYCLASSIFICATION, Task.REGRESSION, Task.MULTICLASSIFICATION]:
raise ValueError("Unsupported task '{}'".format(task))
if task == Task.BINARYCLASSIFICATION: if task == Task.BINARYCLASSIFICATION:
model_func = OmpForestBinaryClassifier if model_parameters.extraction_strategy == 'omp':
return OmpForestBinaryClassifier(model_parameters)
elif model_parameters.extraction_strategy == 'random':
return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size,
random_state=model_parameters.seed)
else:
return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'],
random_state=model_parameters.seed)
elif task == Task.REGRESSION: elif task == Task.REGRESSION:
model_func = OmpForestRegressor if model_parameters.extraction_strategy == 'omp':
return OmpForestRegressor(model_parameters)
elif model_parameters.extraction_strategy == 'random':
return RandomForestRegressor(n_estimators=model_parameters.extracted_forest_size,
random_state=model_parameters.seed)
else:
return RandomForestRegressor(n_estimators=model_parameters.hyperparameters['n_estimators'],
random_state=model_parameters.seed)
elif task == Task.MULTICLASSIFICATION: elif task == Task.MULTICLASSIFICATION:
model_func = OmpForestMulticlassClassifier if model_parameters.extraction_strategy == 'omp':
return OmpForestMulticlassClassifier(model_parameters)
elif model_parameters.extraction_strategy == 'random':
return RandomForestClassifier(n_estimators=model_parameters.extracted_forest_size,
random_state=model_parameters.seed)
else: else:
raise ValueError("Unsupported task '{}'".format(task)) return RandomForestClassifier(n_estimators=model_parameters.hyperparameters['n_estimators'],
return model_func(model_parameters) random_state=model_parameters.seed)
...@@ -5,13 +5,15 @@ import os ...@@ -5,13 +5,15 @@ import os
class ModelParameters(object): class ModelParameters(object):
def __init__(self, extracted_forest_size, normalize_D, subsets_used, normalize_weights, seed, hyperparameters): def __init__(self, extracted_forest_size, normalize_D, subsets_used,
normalize_weights, seed, hyperparameters, extraction_strategy):
self._extracted_forest_size = extracted_forest_size self._extracted_forest_size = extracted_forest_size
self._normalize_D = normalize_D self._normalize_D = normalize_D
self._subsets_used = subsets_used self._subsets_used = subsets_used
self._normalize_weights = normalize_weights self._normalize_weights = normalize_weights
self._seed = seed self._seed = seed
self._hyperparameters = hyperparameters self._hyperparameters = hyperparameters
self._extraction_strategy = extraction_strategy
@property @property
def extracted_forest_size(self): def extracted_forest_size(self):
...@@ -37,6 +39,10 @@ class ModelParameters(object): ...@@ -37,6 +39,10 @@ class ModelParameters(object):
def hyperparameters(self): def hyperparameters(self):
return self._hyperparameters return self._hyperparameters
@property
def extraction_strategy(self):
return self._extraction_strategy
def save(self, directory_path, experiment_id): def save(self, directory_path, experiment_id):
save_obj_to_json(directory_path + os.sep + 'model_parameters_{}.json'.format(experiment_id), save_obj_to_json(directory_path + os.sep + 'model_parameters_{}.json'.format(experiment_id),
self.__dict__) self.__dict__)
......
...@@ -2,6 +2,7 @@ from bolsonaro.models.model_raw_results import ModelRawResults ...@@ -2,6 +2,7 @@ from bolsonaro.models.model_raw_results import ModelRawResults
from bolsonaro.error_handling.logger_factory import LoggerFactory from bolsonaro.error_handling.logger_factory import LoggerFactory
from . import LOG_PATH from . import LOG_PATH
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import time import time
import datetime import datetime
import numpy as np import numpy as np
...@@ -21,7 +22,11 @@ class Trainer(object): ...@@ -21,7 +22,11 @@ class Trainer(object):
self._logger = LoggerFactory.create(LOG_PATH, __name__) self._logger = LoggerFactory.create(LOG_PATH, __name__)
def init(self, model): def init(self, model):
if model.models_parameters.subsets_used == 'train,dev': if type(model) in [RandomForestRegressor, RandomForestClassifier]:
self._X_forest = self._dataset.X_train
self._y_forest = self._dataset.y_train
self._logger.debug('Fitting the forest on train subset')
elif model.models_parameters.subsets_used == 'train,dev':
self._X_forest = self._dataset.X_train self._X_forest = self._dataset.X_train
self._y_forest = self._dataset.y_train self._y_forest = self._dataset.y_train
self._X_omp = self._dataset.X_dev self._X_omp = self._dataset.X_dev
...@@ -49,6 +54,12 @@ class Trainer(object): ...@@ -49,6 +54,12 @@ class Trainer(object):
self._logger.debug('Training model using train set...') self._logger.debug('Training model using train set...')
self._begin_time = time.time() self._begin_time = time.time()
if type(model) in [RandomForestRegressor, RandomForestClassifier]:
model.fit(
X=self._X_forest,
y=self._y_forest
)
else:
model.fit( model.fit(
X_forest=self._X_forest, X_forest=self._X_forest,
y_forest=self._y_forest, y_forest=self._y_forest,
...@@ -62,6 +73,8 @@ class Trainer(object): ...@@ -62,6 +73,8 @@ class Trainer(object):
:param model: Object with :param model: Object with
:param models_dir: Where the results will be saved :param models_dir: Where the results will be saved
""" """
score_func = model.score if type(model) in [RandomForestRegressor, RandomForestClassifier] \
else model.score_base_estimator
results = ModelRawResults( results = ModelRawResults(
model_object=model, model_object=model,
training_time=self._end_time - self._begin_time, training_time=self._end_time - self._begin_time,
...@@ -69,10 +82,11 @@ class Trainer(object): ...@@ -69,10 +82,11 @@ class Trainer(object):
train_score=model.score(self._dataset.X_train, self._dataset.y_train), train_score=model.score(self._dataset.X_train, self._dataset.y_train),
dev_score=model.score(self._dataset.X_dev, self._dataset.y_dev), dev_score=model.score(self._dataset.X_dev, self._dataset.y_dev),
test_score=model.score(self._dataset.X_test, self._dataset.y_test), test_score=model.score(self._dataset.X_test, self._dataset.y_test),
score_metric=model.DEFAULT_SCORE_METRIC, # TODO: resolve the used metric in a proper way score_metric='mse' if type(model) in [RandomForestRegressor, RandomForestClassifier] \
train_score_regressor=model.score_base_estimator(self._dataset.X_train, self._dataset.y_train), else model.DEFAULT_SCORE_METRIC, # TODO: resolve the used metric in a proper way
dev_score_regressor=model.score_base_estimator(self._dataset.X_dev, self._dataset.y_dev), train_score_regressor=score_func(self._dataset.X_train, self._dataset.y_train),
test_score_regressor=model.score_base_estimator(self._dataset.X_test, self._dataset.y_test) dev_score_regressor=score_func(self._dataset.X_dev, self._dataset.y_dev),
test_score_regressor=score_func(self._dataset.X_test, self._dataset.y_test)
) )
results.save(models_dir) results.save(models_dir)
self._logger.info("Base performance on test: {}".format(results.test_score_regressor)) self._logger.info("Base performance on test: {}".format(results.test_score_regressor))
......
...@@ -18,19 +18,29 @@ if __name__ == "__main__": ...@@ -18,19 +18,29 @@ if __name__ == "__main__":
DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models' DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models'
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--stage_number', nargs='?', type=int, required=True, help='Specify the stage number among [1, 4].') parser.add_argument('--stage', nargs='?', type=int, required=True, help='Specify the stage number among [1, 4].')
parser.add_argument('--experiment_ids', nargs='+', type=int, required=True, help='Compute the results of the specified experiment id(s).') parser.add_argument('--experiment_ids', nargs='+', type=int, required=True, help='Compute the results of the specified experiment id(s).' + \
'stage=1: {{base_with_params}} {{random_with_params}} {{omp_with_params}} {{base_wo_params}} {{random_wo_params}} {{omp_wo_params}}')
parser.add_argument('--results_dir', nargs='?', type=str, default=DEFAULT_RESULTS_DIR, help='The output directory of the results.') parser.add_argument('--results_dir', nargs='?', type=str, default=DEFAULT_RESULTS_DIR, help='The output directory of the results.')
parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.') parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
args = parser.parse_args() args = parser.parse_args()
if args.stage_number not in list(range(1, 5)): if args.stage not in list(range(1, 5)):
raise ValueError('stage_number must be a supported stage id (i.e. [1, 4]).') raise ValueError('stage must be a supported stage id (i.e. [1, 4]).')
# Create recursively the results dir tree # Create recursively the results dir tree
pathlib.Path(args.results_dir).mkdir(parents=True, exist_ok=True) pathlib.Path(args.results_dir).mkdir(parents=True, exist_ok=True)
if args.stage_number == 1: if args.stage == 1:
# First axis:
# base_with_params
# random_with_params
# omp_with_params
# Second axis:
# base_wo_params
# random_wo_params
# omp_wo_params
for experiment_id in args.experiment_ids: for experiment_id in args.experiment_ids:
experiment_id_path = args.models_dir + os.sep + str(experiment_id) # models/{experiment_id} experiment_id_path = args.models_dir + os.sep + str(experiment_id) # models/{experiment_id}
# Create recursively the tree results/{experiment_id} # Create recursively the tree results/{experiment_id}
...@@ -50,7 +60,9 @@ if __name__ == "__main__": ...@@ -50,7 +60,9 @@ if __name__ == "__main__":
experiment_score_metrics = list() experiment_score_metrics = list()
# For each seed results stored in models/{experiment_id}/seeds # For each seed results stored in models/{experiment_id}/seeds
for seed in os.listdir(experiment_seed_root_path): seeds = os.listdir(experiment_seed_root_path)
seeds.sort(key=int)
for seed in seeds:
experiment_seed_path = experiment_seed_root_path + os.sep + seed # models/{experiment_id}/seeds/{seed} experiment_seed_path = experiment_seed_root_path + os.sep + seed # models/{experiment_id}/seeds/{seed}
dataset_parameters = DatasetParameters.load(experiment_seed_path, experiment_id) # Load the dataset parameters of this experiment, with this specific seed dataset_parameters = DatasetParameters.load(experiment_seed_path, experiment_id) # Load the dataset parameters of this experiment, with this specific seed
dataset = DatasetLoader.load(dataset_parameters) # Load the dataset using the previously loaded dataset parameters dataset = DatasetLoader.load(dataset_parameters) # Load the dataset using the previously loaded dataset parameters
......
...@@ -53,8 +53,8 @@ def process_job(seed, parameters, experiment_id, hyperparameters): ...@@ -53,8 +53,8 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
trainer = Trainer(dataset) trainer = Trainer(dataset)
if parameters['extraction_strategy'] != 'none':
for extracted_forest_size in parameters['extracted_forest_size']: for extracted_forest_size in parameters['extracted_forest_size']:
# NOTE: if training is too long, one could also split the experiments for different forest sizes across different workers
logger.info('extracted_forest_size={}'.format(extracted_forest_size)) logger.info('extracted_forest_size={}'.format(extracted_forest_size))
sub_models_dir = models_dir + os.sep + 'extracted_forest_size' + os.sep + str(extracted_forest_size) sub_models_dir = models_dir + os.sep + 'extracted_forest_size' + os.sep + str(extracted_forest_size)
pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True) pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
...@@ -65,7 +65,8 @@ def process_job(seed, parameters, experiment_id, hyperparameters): ...@@ -65,7 +65,8 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
subsets_used=parameters['subsets_used'], subsets_used=parameters['subsets_used'],
normalize_weights=parameters['normalize_weights'], normalize_weights=parameters['normalize_weights'],
seed=seed, seed=seed,
hyperparameters=hyperparameters hyperparameters=hyperparameters,
extraction_strategy=parameters['extraction_strategy']
) )
model_parameters.save(sub_models_dir, experiment_id) model_parameters.save(sub_models_dir, experiment_id)
...@@ -74,9 +75,40 @@ def process_job(seed, parameters, experiment_id, hyperparameters): ...@@ -74,9 +75,40 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
trainer.init(model) trainer.init(model)
trainer.train(model) trainer.train(model)
trainer.compute_results(model, sub_models_dir) trainer.compute_results(model, sub_models_dir)
logger.info('Training done') else:
forest_size = hyperparameters['n_estimators']
logger.info('Base forest training with fixed forest size of {}'.format(forest_size))
sub_models_dir = models_dir + os.sep + str(forest_size)
pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
model_parameters = ModelParameters(
extracted_forest_size=forest_size,
normalize_D=parameters['normalize_D'],
subsets_used=parameters['subsets_used'],
normalize_weights=parameters['normalize_weights'],
seed=seed,
hyperparameters=hyperparameters,
extraction_strategy=parameters['extraction_strategy']
)
model_parameters.save(sub_models_dir, experiment_id)
model = ModelFactory.build(dataset.task, model_parameters)
trainer.init(model)
trainer.train(model)
trainer.compute_results(model, sub_models_dir)
logger.info('Training done')
"""
Example for stage 1:
python code/train.py --dataset_name=california_housing --seeds 1 2 3 --extraction_strategy=none --save_experiment_configuration 1 base_with_params
python code/train.py --dataset_name=california_housing --seeds 1 2 3 --extraction_strategy=random --save_experiment_configuration 1 random_with_params
python code/train.py --dataset_name=california_housing --seeds 1 2 3 --save_experiment_configuration 1 omp_with_params
python code/train.py --dataset_name=california_housing --seeds 1 2 3 --extraction_strategy=none --skip_best_hyperparams --save_experiment_configuration 1 base_wo_params
python code/train.py --dataset_name=california_housing --seeds 1 2 3 --extraction_strategy=random --skip_best_hyperparams --save_experiment_configuration 1 random_wo_params
python code/train.py --dataset_name=california_housing --seeds 1 2 3 --skip_best_hyperparams --save_experiment_configuration 1 omp_wo_params
python code/compute_results.py --stage 1 --experiment_ids 1 2 3 4 5 6
"""
if __name__ == "__main__": if __name__ == "__main__":
load_dotenv(find_dotenv('.env')) load_dotenv(find_dotenv('.env'))
DEFAULT_EXPERIMENT_CONFIGURATION_PATH = 'experiments' DEFAULT_EXPERIMENT_CONFIGURATION_PATH = 'experiments'
...@@ -85,6 +117,7 @@ if __name__ == "__main__": ...@@ -85,6 +117,7 @@ if __name__ == "__main__":
DEFAULT_VERBOSE = False DEFAULT_VERBOSE = False
DEFAULT_SKIP_BEST_HYPERPARAMS = False DEFAULT_SKIP_BEST_HYPERPARAMS = False
DEFAULT_JOB_NUMBER = -1 DEFAULT_JOB_NUMBER = -1
DEFAULT_EXTRACTION_STRATEGY = 'omp'
begin_random_seed_range = 1 begin_random_seed_range = 1
end_random_seed_range = 2000 end_random_seed_range = 2000
...@@ -109,6 +142,7 @@ if __name__ == "__main__": ...@@ -109,6 +142,7 @@ if __name__ == "__main__":
parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters if there exist.') parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters if there exist.')
parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line in a file. Args: {{stage_num}} {{name}}') parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line in a file. Args: {{stage_num}} {{name}}')
parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of job used during the parallelisation across seeds.') parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of job used during the parallelisation across seeds.')
parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random or none.')
args = parser.parse_args() args = parser.parse_args()
if args.experiment_configuration: if args.experiment_configuration:
...@@ -118,6 +152,9 @@ if __name__ == "__main__": ...@@ -118,6 +152,9 @@ if __name__ == "__main__":
else: else:
parameters = args.__dict__ parameters = args.__dict__
if parameters['extraction_strategy'] not in ['omp', 'random', 'none']:
raise ValueError('Specified extraction strategy {} is not supported.'.format(parameters.extraction_strategy))
pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True) pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True)
logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__)) logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment