Skip to content
Snippets Groups Projects
Commit 43486a13 authored by Léo Bouscarrat's avatar Léo Bouscarrat
Browse files

Merge'

(
parents dcff5e38 a0f7c96f
No related branches found
No related tags found
1 merge request!3clean scripts
...@@ -12,7 +12,7 @@ import os ...@@ -12,7 +12,7 @@ import os
if __name__ == "__main__": if __name__ == "__main__":
# get environment variables in .env # get environment variables in .env
load_dotenv(find_dotenv('.env.example')) load_dotenv(find_dotenv('.env'))
DEFAULT_RESULTS_DIR = os.environ["project_dir"] + os.sep + 'results' DEFAULT_RESULTS_DIR = os.environ["project_dir"] + os.sep + 'results'
DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models' DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models'
...@@ -59,6 +59,8 @@ if __name__ == "__main__": ...@@ -59,6 +59,8 @@ if __name__ == "__main__":
experiment_dev_scores = dict() experiment_dev_scores = dict()
experiment_test_scores = dict() experiment_test_scores = dict()
experiment_weights = dict()
# Used to check if all losses were computed using the same metric (it should be the case) # Used to check if all losses were computed using the same metric (it should be the case)
experiment_score_metrics = list() experiment_score_metrics = list()
...@@ -74,6 +76,8 @@ if __name__ == "__main__": ...@@ -74,6 +76,8 @@ if __name__ == "__main__":
experiment_dev_scores[seed] = list() experiment_dev_scores[seed] = list()
experiment_test_scores[seed] = list() experiment_test_scores[seed] = list()
experiment_weights[seed] = list()
# List the forest sizes in models/{experiment_id}/seeds/{seed}/extracted_forest_size # List the forest sizes in models/{experiment_id}/seeds/{seed}/extracted_forest_size
extracted_forest_sizes = os.listdir(extracted_forest_size_root_path) extracted_forest_sizes = os.listdir(extracted_forest_size_root_path)
for extracted_forest_size in extracted_forest_sizes: for extracted_forest_size in extracted_forest_sizes:
...@@ -84,9 +88,13 @@ if __name__ == "__main__": ...@@ -84,9 +88,13 @@ if __name__ == "__main__":
# Load [...]/model_parameters.json file and build the model using these parameters and the weights and forest from model_raw_results.pickle # Load [...]/model_parameters.json file and build the model using these parameters and the weights and forest from model_raw_results.pickle
model = ModelFactory.load(dataset.task, extracted_forest_size_path, experiment_id, model_raw_results) model = ModelFactory.load(dataset.task, extracted_forest_size_path, experiment_id, model_raw_results)
# Save temporarily some raw results (TODO: to complete to retrieve more results) # Save temporarily some raw results (TODO: to complete to retrieve more results)
# Save the scores
experiment_train_scores[seed].append(model_raw_results.train_score) experiment_train_scores[seed].append(model_raw_results.train_score)
experiment_dev_scores[seed].append(model_raw_results.dev_score) experiment_dev_scores[seed].append(model_raw_results.dev_score)
experiment_test_scores[seed].append(model_raw_results.test_score) experiment_test_scores[seed].append(model_raw_results.test_score)
# Save the weights
experiment_weights[seed].append(model_raw_results.weights)
# Save the metric
experiment_score_metrics.append(model_raw_results.score_metric) experiment_score_metrics.append(model_raw_results.score_metric)
if len(set(experiment_score_metrics)) > 1: if len(set(experiment_score_metrics)) > 1:
...@@ -107,3 +115,48 @@ if __name__ == "__main__": ...@@ -107,3 +115,48 @@ if __name__ == "__main__":
all_labels=['train', 'dev', 'test'], all_labels=['train', 'dev', 'test'],
title='Loss values of the trained model' title='Loss values of the trained model'
) )
"""
TODO:
For each dataset:
Stage 1) A figure for the selection of the best base forest model hyperparameters (best vs default/random hyperparams)
Stage 2) A figure for the selection of the best dataset normalization method
Stage 3) A figure for the selection of the best combination of dataset: normalization vs D normalization vs weights normalization
Stage 4) A figure for the selection of the most relevant subsets combination: train,dev vs train+dev,train+dev vs train,train+dev
Stage 5) A figure for the selection of the best extracted forest size?
Stage 6) A figure to finally compare the perf of our approach using the previous selected parameters vs the baseline vs other papers
Stage 3)
In all axes:
- untrained forest
- trained base forest (straight line because it doesn't depend on the number of extracted trees)
Axis 1:
- test with forest on train+dev and OMP on train+dev
- test with forest on train+dev and OMP on train+dev with dataset normalization
- test with forest on train+dev and OMP on train+dev with dataset normalization + D normalization
- test with forest on train+dev and OMP on train+dev with dataset normalization + weights normalization
- test with forest on train+dev and OMP on train+dev with dataset normalization + D normalization + weights normalization
Axis 2:
- test with forest on train and OMP on dev
- test with forest on train and OMP on dev with dataset normalization
- test with forest on train and OMP on dev with dataset normalization + D normalization
- test with forest on train and OMP on dev with dataset normalization + weights normalization
- test with forest on train and OMP on dev with dataset normalization + D normalization + weights normalization
Axis 3:
- test with forest on train and OMP train+dev
- test with forest on train and OMP train+dev with dataset normalization
- test with forest on train and OMP train+dev with dataset normalization + D normalization
- test with forest on train and OMP train+dev with dataset normalization + weights normalization
- test with forest on train and OMP train+dev with dataset normalization + D normalization + weights normalization
IMPORTANT: The same seeds are used in all axes.
"""
# Plot the density of the weights
Plotter.weight_density(
file_path=args.results_dir + os.sep + experiment_id + os.sep + 'density_weight.png',
all_experiment_weights=experiment_weights
)
...@@ -9,16 +9,75 @@ from bolsonaro.error_handling.logger_factory import LoggerFactory ...@@ -9,16 +9,75 @@ from bolsonaro.error_handling.logger_factory import LoggerFactory
from dotenv import find_dotenv, load_dotenv from dotenv import find_dotenv, load_dotenv
import argparse import argparse
import json
import pathlib import pathlib
import random import random
import os import os
from tqdm import tqdm from concurrent import futures
import threading
import json
def process_job(seed, parameters, experiment_id, hyperparameters):
    """
    Run one complete training experiment for a single seed.

    This is the worker entry point: the main script submits one call per
    seed to a process pool executor.

    :param seed: random seed of this job; used as the dataset random state,
        as the model seed and as the name of the per-seed output directory.
    :param parameters: dict of experiment settings. Keys read here:
        'models_dir', 'dataset_name', 'test_size', 'dev_size',
        'dataset_normalizer', 'extracted_forest_size' (an iterable of sizes),
        'normalize_D', 'subsets_used' and 'normalize_weights'.
    :param experiment_id: identifier of the experiment; names the directory
        under 'models_dir' and is passed to the parameter ``save`` calls.
    :param hyperparameters: forwarded unchanged to ``ModelParameters``.
    :return: None. Results are written under
        {models_dir}/{experiment_id}/seeds/{seed}.
    """
    job_logger = LoggerFactory.create(
        LOG_PATH,
        'training_seed{}_ti{}'.format(seed, threading.get_ident())
    )
    job_logger.info('seed={}'.format(seed))

    experiment_id_str = str(experiment_id)

    # {models_dir}/{experiment_id}/seeds/{seed}
    models_dir = os.sep.join(
        [parameters['models_dir'], experiment_id_str, 'seeds', str(seed)])
    pathlib.Path(models_dir).mkdir(parents=True, exist_ok=True)

    # Persist the dataset configuration before loading the dataset itself.
    dataset_parameters = DatasetParameters(
        name=parameters['dataset_name'],
        test_size=parameters['test_size'],
        dev_size=parameters['dev_size'],
        random_state=seed,
        dataset_normalizer=parameters['dataset_normalizer']
    )
    dataset_parameters.save(models_dir, experiment_id_str)
    dataset = DatasetLoader.load(dataset_parameters)

    # A single trainer is shared by every extracted forest size of this seed.
    trainer = Trainer(dataset)

    # Open question: if training is too long, the different forest sizes
    # could also be split across different workers.
    for forest_size in parameters['extracted_forest_size']:
        job_logger.info('extracted_forest_size={}'.format(forest_size))

        # {models_dir}/extracted_forest_size/{size}
        sub_models_dir = os.sep.join(
            [models_dir, 'extracted_forest_size', str(forest_size)])
        pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)

        model_parameters = ModelParameters(
            extracted_forest_size=forest_size,
            normalize_D=parameters['normalize_D'],
            subsets_used=parameters['subsets_used'],
            normalize_weights=parameters['normalize_weights'],
            seed=seed,
            hyperparameters=hyperparameters
        )
        model_parameters.save(sub_models_dir, experiment_id)

        model = ModelFactory.build(dataset.task, model_parameters)

        trainer.init(model)
        trainer.train(model)
        trainer.compute_results(model, sub_models_dir)

    # NOTE(review): assumed to run once, after all forest sizes are trained;
    # confirm against the repository version of this function.
    job_logger.info('Training done')
if __name__ == "__main__": if __name__ == "__main__":
# get environment variables in .env load_dotenv(find_dotenv('.env'))
load_dotenv(find_dotenv('.env.example'))
DEFAULT_EXPERIMENT_CONFIGURATION_PATH = 'experiments'
DEFAULT_DATASET_NAME = 'boston' DEFAULT_DATASET_NAME = 'boston'
DEFAULT_NORMALIZE_D = False DEFAULT_NORMALIZE_D = False
DEFAULT_DATASET_NORMALIZER = None DEFAULT_DATASET_NORMALIZER = None
...@@ -29,13 +88,15 @@ if __name__ == "__main__": ...@@ -29,13 +88,15 @@ if __name__ == "__main__":
DEFAULT_DEV_SIZE = 0.2 DEFAULT_DEV_SIZE = 0.2
DEFAULT_TEST_SIZE = 0.2 DEFAULT_TEST_SIZE = 0.2
DEFAULT_RANDOM_SEED_NUMBER = 1 DEFAULT_RANDOM_SEED_NUMBER = 1
DEFAULT_USE_DEV_SUBSET = False DEFAULT_SUBSETS_USED = 'train,dev'
DEFAULT_DISABLE_PROGRESS = False DEFAULT_NORMALIZE_WEIGHTS = False
begin_random_seed_range = 1 begin_random_seed_range = 1
end_random_seed_range = 2000 end_random_seed_range = 2000
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--experiment_configuration', nargs='?', type=str, default=None, help='Specify an experiment configuration file name. Overload all other parameters.')
parser.add_argument('--experiment_configuration_path', nargs='?', type=str, default=DEFAULT_EXPERIMENT_CONFIGURATION_PATH, help='Specify the experiment configuration directory path.')
parser.add_argument('--dataset_name', nargs='?', type=str, default=DEFAULT_DATASET_NAME, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.') parser.add_argument('--dataset_name', nargs='?', type=str, default=DEFAULT_DATASET_NAME, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
parser.add_argument('--normalize_D', action='store_true', default=DEFAULT_NORMALIZE_D, help='Specify if we want to normalize the prediction of the forest by doing the L2 division of the pred vectors.') parser.add_argument('--normalize_D', action='store_true', default=DEFAULT_NORMALIZE_D, help='Specify if we want to normalize the prediction of the forest by doing the L2 division of the pred vectors.')
parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=DEFAULT_DATASET_NORMALIZER, help='Specify which dataset normalizer use (either standard, minmax, robust or normalizer).') parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=DEFAULT_DATASET_NORMALIZER, help='Specify which dataset normalizer use (either standard, minmax, robust or normalizer).')
...@@ -46,66 +107,64 @@ if __name__ == "__main__": ...@@ -46,66 +107,64 @@ if __name__ == "__main__":
parser.add_argument('--test_size', nargs='?', type=float, default=DEFAULT_TEST_SIZE, help='Test subset ratio.') parser.add_argument('--test_size', nargs='?', type=float, default=DEFAULT_TEST_SIZE, help='Test subset ratio.')
parser.add_argument('--random_seed_number', nargs='?', type=int, default=DEFAULT_RANDOM_SEED_NUMBER, help='Number of random seeds used.') parser.add_argument('--random_seed_number', nargs='?', type=int, default=DEFAULT_RANDOM_SEED_NUMBER, help='Number of random seeds used.')
parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specific a list of seeds instead of generate them randomly') parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specific a list of seeds instead of generate them randomly')
parser.add_argument('--use_dev_subset', action='store_true', default=DEFAULT_USE_DEV_SUBSET, help='If specify the forest will be trained on train subset and OMP on dev subset. Otherwise both the forest and OMP will be trained on train+dev subsets.') parser.add_argument('--subsets_used', nargs='+', type=str, default=DEFAULT_SUBSETS_USED, help='train,dev: forest on train, OMP on dev. train+dev,train+dev: both forest and OMP on train+dev. train,train+dev: forest on train+dev and OMP on dev.')
parser.add_argument('--disable_progress', action='store_true', default=DEFAULT_DISABLE_PROGRESS, help='Disable the progress bars.') parser.add_argument('--normalize_weights', action='store_true', default=DEFAULT_NORMALIZE_WEIGHTS, help='Divide the predictions by the weights sum.')
args = parser.parse_args() args = parser.parse_args()
pathlib.Path(args.models_dir).mkdir(parents=True, exist_ok=True) if args.experiment_configuration:
with open(args.experiment_configuration_path + os.sep + \
args.experiment_configuration + '.json', 'r') as input_file:
parameters = json.load(input_file)
else:
parameters = args.__dict__
pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True)
logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__)) logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))
args.extracted_forest_size = args.extracted_forest_size \ # The number of tree to extract from forest (K)
if type(args.extracted_forest_size) == list \ parameters['extracted_forest_size'] = parameters['extracted_forest_size'] \
else [args.extracted_forest_size] if type(parameters['extracted_forest_size']) == list \
else [parameters['extracted_forest_size']]
if args.seeds != None and args.random_seed_number > 1: hyperparameters_path = os.path.join('experiments', args.dataset_name, 'stage1', 'params.json')
logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.') if os.path.exists(hyperparameters_path):
logger.info("Hyperparameters found for this dataset at '{}'".format(hyperparameters_path))
with open(hyperparameters_path, 'r+') as file_hyperparameter:
hyperparameters = json.load(file_hyperparameter)['best_parameters']
else:
hyperparameters = {}
seeds = args.seeds if args.seeds is not None \ if parameters['forest_size'] is not None:
else [random.randint(begin_random_seed_range, end_random_seed_range) \ hyperparameters['n_estimators'] = parameters['forest_size']
for i in range(args.random_seed_number)]
experiment_id = resolve_experiment_id(args.models_dir) if parameters['seeds'] != None and parameters['random_seed_number'] > 1:
experiment_id_str = str(experiment_id) logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')
logger.info('Experiment id: {}'.format(experiment_id_str)) # Seeds are either provided as parameters or generated at random
seeds = parameters['seeds'] if parameters['seeds'] is not None \
with tqdm(seeds, disable=args.disable_progress) as seed_bar: else [random.randint(begin_random_seed_range, end_random_seed_range) \
for seed in seed_bar: for i in range(parameters['random_seed_number'])]
seed_bar.set_description('seed={}'.format(seed))
seed_str = str(seed) # Resolve the next experiment id number (last id + 1)
models_dir = args.models_dir + os.sep + experiment_id_str + os.sep + 'seeds' + \ experiment_id = resolve_experiment_id(parameters['models_dir'])
os.sep + seed_str logger.info('Experiment id: {}'.format(experiment_id))
pathlib.Path(models_dir).mkdir(parents=True, exist_ok=True)
"""
dataset_parameters = DatasetParameters( If the experiment configuration isn't coming from
name=args.dataset_name, an already existing file, save it to a json file to
test_size=args.test_size, keep trace of it.
dev_size=args.dev_size, """
random_state=seed, if args.experiment_configuration is None:
dataset_normalizer=args.dataset_normalizer with open(args.experiment_configuration_path + os.sep + 'unnamed_{}.json'.format(
experiment_id), 'w') as output_file:
json.dump(
parameters,
output_file,
indent=4
) )
dataset_parameters.save(models_dir, experiment_id_str)
dataset = DatasetLoader.load(dataset_parameters)
trainer = Trainer(dataset)
with tqdm(args.extracted_forest_size, disable=args.disable_progress) as extracted_forest_size_bar:
for extracted_forest_size in extracted_forest_size_bar:
extracted_forest_size_bar.set_description('extracted_forest_size={}'.format(extracted_forest_size))
sub_models_dir = models_dir + os.sep + 'extracted_forest_size' + os.sep + str(extracted_forest_size)
pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
model_parameters = ModelParameters(
forest_size=args.forest_size,
extracted_forest_size=extracted_forest_size,
normalize_D=args.normalize_D,
use_dev_subset=args.use_dev_subset,
seed=seed
)
model_parameters.save(sub_models_dir, experiment_id)
model = ModelFactory.build(dataset.task, model_parameters)
trainer.train(model, sub_models_dir) # Train as much job as there are seeds
with futures.ProcessPoolExecutor(len(seeds)) as executor:
list(f.result() for f in futures.as_completed(executor.submit(process_job, seed,
parameters, experiment_id, hyperparameters) for seed in seeds))
{ {
"scorer": "neg_mean_squared_error", "scorer": "neg_mean_squared_error",
<<<<<<< HEAD
"best_score_train": -11.238253315624897, "best_score_train": -11.238253315624897,
"best_score_test": -7.312532120669678, "best_score_test": -7.312532120669678,
"best_parameters": { "best_parameters": {
...@@ -9,4 +10,15 @@ ...@@ -9,4 +10,15 @@
"n_estimators": 1000 "n_estimators": 1000
}, },
"random_seed": 289 "random_seed": 289
=======
"best_score_train": -12.900217003727361,
"best_score_test": -12.682938620298733,
"best_parameters": {
"max_depth": 18,
"max_features": "sqrt",
"min_samples_leaf": 1,
"n_estimators": 1000
},
"random_seed": 883
>>>>>>> a0f7c96f51b3e1575f6db1b704579b0cf1042c42
} }
\ No newline at end of file
{
"scorer": "accuracy",
"best_score_train": 0.9576271186440678,
"best_score_test": 1.0,
"best_parameters": {
"max_depth": 20,
"max_features": "log2",
"min_samples_leaf": 1,
"n_estimators": 1000
},
"random_seed": 883
}
\ No newline at end of file
...@@ -9,6 +9,7 @@ awscli ...@@ -9,6 +9,7 @@ awscli
flake8 flake8
python-dotenv>=0.5.1 python-dotenv>=0.5.1
scikit-learn scikit-learn
git+git://github.com/darenr/scikit-optimize@master
python-dotenv python-dotenv
tqdm matplotlib
matplotlib pandas
\ No newline at end of file \ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment