Skip to content
Snippets Groups Projects
Commit 789a11a6 authored by Charly LAMOTHE's avatar Charly LAMOTHE
Browse files

- Add experiment_configuration parameter to run an experiment from a json...

- Add experiment_configuration parameter to run an experiment from a json configuration file. If the experiment configuration is coming from the arguments, save it to a file to keep track of it;
- Add a few comments in train.py.
parent cb0030d8
No related branches found
No related tags found
1 merge request!3clean scripts
...@@ -14,24 +14,25 @@ import random ...@@ -14,24 +14,25 @@ import random
import os import os
from concurrent import futures from concurrent import futures
import threading import threading
import json
def process_job(seed, args, experiment_id): def process_job(seed, parameters, experiment_id):
logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_ti{}'.format( logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_ti{}'.format(
seed, threading.get_ident())) seed, threading.get_ident()))
logger.info('seed={}'.format(seed)) logger.info('seed={}'.format(seed))
seed_str = str(seed) seed_str = str(seed)
experiment_id_str = str(experiment_id) experiment_id_str = str(experiment_id)
models_dir = args.models_dir + os.sep + experiment_id_str + os.sep + 'seeds' + \ models_dir = parameters['models_dir'] + os.sep + experiment_id_str + os.sep + 'seeds' + \
os.sep + seed_str os.sep + seed_str
pathlib.Path(models_dir).mkdir(parents=True, exist_ok=True) pathlib.Path(models_dir).mkdir(parents=True, exist_ok=True)
dataset_parameters = DatasetParameters( dataset_parameters = DatasetParameters(
name=args.dataset_name, name=parameters['dataset_name'],
test_size=args.test_size, test_size=parameters['test_size'],
dev_size=args.dev_size, dev_size=parameters['dev_size'],
random_state=seed, random_state=seed,
dataset_normalizer=args.dataset_normalizer dataset_normalizer=parameters['dataset_normalizer']
) )
dataset_parameters.save(models_dir, experiment_id_str) dataset_parameters.save(models_dir, experiment_id_str)
...@@ -39,17 +40,17 @@ def process_job(seed, args, experiment_id): ...@@ -39,17 +40,17 @@ def process_job(seed, args, experiment_id):
trainer = Trainer(dataset) trainer = Trainer(dataset)
for extracted_forest_size in args.extracted_forest_size: for extracted_forest_size in parameters['extracted_forest_size']:
logger.info('extracted_forest_size={}'.format(extracted_forest_size)) logger.info('extracted_forest_size={}'.format(extracted_forest_size))
sub_models_dir = models_dir + os.sep + 'extracted_forest_size' + os.sep + str(extracted_forest_size) sub_models_dir = models_dir + os.sep + 'extracted_forest_size' + os.sep + str(extracted_forest_size)
pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True) pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
model_parameters = ModelParameters( model_parameters = ModelParameters(
forest_size=args.forest_size, forest_size=parameters['forest_size'],
extracted_forest_size=extracted_forest_size, extracted_forest_size=extracted_forest_size,
normalize_D=args.normalize_D, normalize_D=parameters['normalize_D'],
subsets_used=args.subsets_used, subsets_used=parameters['subsets_used'],
normalize_weights=args.normalize_weights, normalize_weights=parameters['normalize_weights'],
seed=seed seed=seed
) )
model_parameters.save(sub_models_dir, experiment_id) model_parameters.save(sub_models_dir, experiment_id)
...@@ -63,6 +64,7 @@ if __name__ == "__main__": ...@@ -63,6 +64,7 @@ if __name__ == "__main__":
# get environment variables in .env # get environment variables in .env
load_dotenv(find_dotenv('.env.example')) load_dotenv(find_dotenv('.env.example'))
DEFAULT_EXPERIMENT_CONFIGURATION_PATH = 'experiments'
DEFAULT_DATASET_NAME = 'boston' DEFAULT_DATASET_NAME = 'boston'
DEFAULT_NORMALIZE_D = False DEFAULT_NORMALIZE_D = False
DEFAULT_DATASET_NORMALIZER = None DEFAULT_DATASET_NORMALIZER = None
...@@ -74,12 +76,14 @@ if __name__ == "__main__": ...@@ -74,12 +76,14 @@ if __name__ == "__main__":
DEFAULT_TEST_SIZE = 0.2 DEFAULT_TEST_SIZE = 0.2
DEFAULT_RANDOM_SEED_NUMBER = 1 DEFAULT_RANDOM_SEED_NUMBER = 1
DEFAULT_SUBSETS_USED = 'train,dev' DEFAULT_SUBSETS_USED = 'train,dev'
DEFAULT_normalize_weights = False DEFAULT_NORMALIZE_WEIGHTS = False
begin_random_seed_range = 1 begin_random_seed_range = 1
end_random_seed_range = 2000 end_random_seed_range = 2000
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--experiment_configuration', nargs='?', type=str, default=None, help='Specify an experiment configuration file name. Overload all other parameters.')
parser.add_argument('--experiment_configuration_path', nargs='?', type=str, default=DEFAULT_EXPERIMENT_CONFIGURATION_PATH, help='Specify the experiment configuration directory path.')
parser.add_argument('--dataset_name', nargs='?', type=str, default=DEFAULT_DATASET_NAME, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.') parser.add_argument('--dataset_name', nargs='?', type=str, default=DEFAULT_DATASET_NAME, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
parser.add_argument('--normalize_D', action='store_true', default=DEFAULT_NORMALIZE_D, help='Specify if we want to normalize the prediction of the forest by doing the L2 division of the pred vectors.') parser.add_argument('--normalize_D', action='store_true', default=DEFAULT_NORMALIZE_D, help='Specify if we want to normalize the prediction of the forest by doing the L2 division of the pred vectors.')
parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=DEFAULT_DATASET_NORMALIZER, help='Specify which dataset normalizer use (either standard, minmax, robust or normalizer).') parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=DEFAULT_DATASET_NORMALIZER, help='Specify which dataset normalizer use (either standard, minmax, robust or normalizer).')
...@@ -91,28 +95,50 @@ if __name__ == "__main__": ...@@ -91,28 +95,50 @@ if __name__ == "__main__":
parser.add_argument('--random_seed_number', nargs='?', type=int, default=DEFAULT_RANDOM_SEED_NUMBER, help='Number of random seeds used.') parser.add_argument('--random_seed_number', nargs='?', type=int, default=DEFAULT_RANDOM_SEED_NUMBER, help='Number of random seeds used.')
parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specific a list of seeds instead of generate them randomly') parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specific a list of seeds instead of generate them randomly')
parser.add_argument('--subsets_used', nargs='+', type=str, default=DEFAULT_SUBSETS_USED, help='train,dev: forest on train, OMP on dev. train+dev,train+dev: both forest and OMP on train+dev. train,train+dev: forest on train+dev and OMP on dev.') parser.add_argument('--subsets_used', nargs='+', type=str, default=DEFAULT_SUBSETS_USED, help='train,dev: forest on train, OMP on dev. train+dev,train+dev: both forest and OMP on train+dev. train,train+dev: forest on train+dev and OMP on dev.')
parser.add_argument('--normalize_weights', action='store_true', default=DEFAULT_normalize_weights, help='Divide the predictions by the weights sum.') parser.add_argument('--normalize_weights', action='store_true', default=DEFAULT_NORMALIZE_WEIGHTS, help='Divide the predictions by the weights sum.')
args = parser.parse_args() args = parser.parse_args()
pathlib.Path(args.models_dir).mkdir(parents=True, exist_ok=True) if args.experiment_configuration:
with open(args.experiment_configuration_path + os.sep + \
args.experiment_configuration + '.json', 'r') as input_file:
parameters = json.load(input_file)
else:
parameters = args.__dict__
pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True)
logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__)) logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))
args.extracted_forest_size = args.extracted_forest_size \ parameters['extracted_forest_size'] = parameters['extracted_forest_size'] \
if type(args.extracted_forest_size) == list \ if type(parameters['extracted_forest_size']) == list \
else [args.extracted_forest_size] else [parameters['extracted_forest_size']]
if args.seeds != None and args.random_seed_number > 1: if parameters['seeds'] != None and parameters['random_seed_number'] > 1:
logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.') logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')
seeds = args.seeds if args.seeds is not None \ seeds = parameters['seeds'] if parameters['seeds'] is not None \
else [random.randint(begin_random_seed_range, end_random_seed_range) \ else [random.randint(begin_random_seed_range, end_random_seed_range) \
for i in range(args.random_seed_number)] for i in range(parameters['random_seed_number'])]
experiment_id = resolve_experiment_id(args.models_dir)
# Resolve the next experiment id number (last id + 1)
experiment_id = resolve_experiment_id(parameters['models_dir'])
logger.info('Experiment id: {}'.format(experiment_id)) logger.info('Experiment id: {}'.format(experiment_id))
"""
If the experiment configuration isn't comming from
an already existing file, save it to a json file to
keep trace of it.
"""
if args.experiment_configuration is None:
with open(args.experiment_configuration_path + os.sep + 'unnamed_{}.json'.format(
experiment_id), 'w') as output_file:
json.dump(
parameters,
output_file,
indent=4
)
# Train as much job as there are seeds
with futures.ProcessPoolExecutor(len(seeds)) as executor: with futures.ProcessPoolExecutor(len(seeds)) as executor:
list(f.result() for f in futures.as_completed(executor.submit(process_job, seed, list(f.result() for f in futures.as_completed(executor.submit(process_job, seed,
args, experiment_id) for seed in seeds)) parameters, experiment_id) for seed in seeds))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment