diff --git a/code/train.py b/code/train.py
index 2d3264d6755c64be3c7bdb1b09785fa02266a2a1..dca0b2156267e9108082bf7051f02981badfbe07 100644
--- a/code/train.py
+++ b/code/train.py
@@ -14,24 +14,25 @@ import random
 import os
 from concurrent import futures
 import threading
+import json
 
-def process_job(seed, args, experiment_id):
+def process_job(seed, parameters, experiment_id):
     logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_ti{}'.format(
         seed, threading.get_ident()))
     logger.info('seed={}'.format(seed))
 
     seed_str = str(seed)
     experiment_id_str = str(experiment_id)
-    models_dir = args.models_dir + os.sep + experiment_id_str + os.sep + 'seeds' + \
+    models_dir = parameters['models_dir'] + os.sep + experiment_id_str + os.sep + 'seeds' + \
         os.sep + seed_str
     pathlib.Path(models_dir).mkdir(parents=True, exist_ok=True)
 
     dataset_parameters = DatasetParameters(
-        name=args.dataset_name,
-        test_size=args.test_size,
-        dev_size=args.dev_size,
+        name=parameters['dataset_name'],
+        test_size=parameters['test_size'],
+        dev_size=parameters['dev_size'],
         random_state=seed,
-        dataset_normalizer=args.dataset_normalizer
+        dataset_normalizer=parameters['dataset_normalizer']
     )
     dataset_parameters.save(models_dir, experiment_id_str)
 
@@ -39,17 +40,17 @@ def process_job(seed, args, experiment_id):
 
     trainer = Trainer(dataset)
 
-    for extracted_forest_size in args.extracted_forest_size:
+    for extracted_forest_size in parameters['extracted_forest_size']:
         logger.info('extracted_forest_size={}'.format(extracted_forest_size))
         sub_models_dir = models_dir + os.sep + 'extracted_forest_size' + os.sep + str(extracted_forest_size)
         pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
 
         model_parameters = ModelParameters(
-            forest_size=args.forest_size,
+            forest_size=parameters['forest_size'],
             extracted_forest_size=extracted_forest_size,
-            normalize_D=args.normalize_D,
-            subsets_used=args.subsets_used,
-            normalize_weights=args.normalize_weights,
+            normalize_D=parameters['normalize_D'],
+            subsets_used=parameters['subsets_used'],
+            normalize_weights=parameters['normalize_weights'],
             seed=seed
         )
         model_parameters.save(sub_models_dir, experiment_id)
@@ -63,6 +64,7 @@ if __name__ == "__main__":
     # get environment variables in .env
     load_dotenv(find_dotenv('.env.example'))
 
+    DEFAULT_EXPERIMENT_CONFIGURATION_PATH = 'experiments'
     DEFAULT_DATASET_NAME = 'boston'
     DEFAULT_NORMALIZE_D = False
     DEFAULT_DATASET_NORMALIZER = None
@@ -74,12 +76,14 @@ if __name__ == "__main__":
     DEFAULT_TEST_SIZE = 0.2
     DEFAULT_RANDOM_SEED_NUMBER = 1
     DEFAULT_SUBSETS_USED = 'train,dev'
-    DEFAULT_normalize_weights = False
+    DEFAULT_NORMALIZE_WEIGHTS = False
 
     begin_random_seed_range = 1
     end_random_seed_range = 2000
 
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--experiment_configuration', nargs='?', type=str, default=None, help='Specify an experiment configuration file name. Overrides all other parameters.')
+    parser.add_argument('--experiment_configuration_path', nargs='?', type=str, default=DEFAULT_EXPERIMENT_CONFIGURATION_PATH, help='Specify the experiment configuration directory path.')
     parser.add_argument('--dataset_name', nargs='?', type=str, default=DEFAULT_DATASET_NAME, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
     parser.add_argument('--normalize_D', action='store_true', default=DEFAULT_NORMALIZE_D, help='Specify if we want to normalize the prediction of the forest by doing the L2 division of the pred vectors.')
     parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=DEFAULT_DATASET_NORMALIZER, help='Specify which dataset normalizer use (either standard, minmax, robust or normalizer).')
@@ -91,28 +95,50 @@ if __name__ == "__main__":
     parser.add_argument('--random_seed_number', nargs='?', type=int, default=DEFAULT_RANDOM_SEED_NUMBER, help='Number of random seeds used.')
     parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specific a list of seeds instead of generate them randomly')
     parser.add_argument('--subsets_used', nargs='+', type=str, default=DEFAULT_SUBSETS_USED, help='train,dev: forest on train, OMP on dev. train+dev,train+dev: both forest and OMP on train+dev. train,train+dev: forest on train+dev and OMP on dev.')
-    parser.add_argument('--normalize_weights', action='store_true', default=DEFAULT_normalize_weights, help='Divide the predictions by the weights sum.')
+    parser.add_argument('--normalize_weights', action='store_true', default=DEFAULT_NORMALIZE_WEIGHTS, help='Divide the predictions by the weights sum.')
     args = parser.parse_args()
 
-    pathlib.Path(args.models_dir).mkdir(parents=True, exist_ok=True)
+    if args.experiment_configuration:
+        with open(args.experiment_configuration_path + os.sep + \
+            args.experiment_configuration + '.json', 'r') as input_file:
+            parameters = json.load(input_file)
+    else:
+        parameters = args.__dict__
+
+    pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True)
 
     logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))
 
-    args.extracted_forest_size = args.extracted_forest_size \
-        if type(args.extracted_forest_size) == list \
-        else [args.extracted_forest_size]
+    parameters['extracted_forest_size'] = parameters['extracted_forest_size'] \
+        if type(parameters['extracted_forest_size']) == list \
+        else [parameters['extracted_forest_size']]
 
-    if args.seeds != None and args.random_seed_number > 1:
+    if parameters['seeds'] is not None and parameters['random_seed_number'] > 1:
         logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')
 
-    seeds = args.seeds if args.seeds is not None \
+    seeds = parameters['seeds'] if parameters['seeds'] is not None \
         else [random.randint(begin_random_seed_range, end_random_seed_range) \
-        for i in range(args.random_seed_number)]
-
-    experiment_id = resolve_experiment_id(args.models_dir)
+        for i in range(parameters['random_seed_number'])]
+    # Resolve the next experiment id number (last id + 1)
+    experiment_id = resolve_experiment_id(parameters['models_dir'])
     logger.info('Experiment id: {}'.format(experiment_id))
 
+    """
+    If the experiment configuration isn't coming from
+    an already existing file, save it to a JSON file
+    to keep track of it.
+    """
+    if args.experiment_configuration is None:
+        with open(args.experiment_configuration_path + os.sep + 'unnamed_{}.json'.format(
+            experiment_id), 'w') as output_file:
+            json.dump(
+                parameters,
+                output_file,
+                indent=4
+            )
+
+    # Train as many jobs as there are seeds
     with futures.ProcessPoolExecutor(len(seeds)) as executor:
         list(f.result() for f in futures.as_completed(executor.submit(process_job, seed,
-            args, experiment_id) for seed in seeds))
+            parameters, experiment_id) for seed in seeds))
 