from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.dataset_loader import DatasetLoader
from bolsonaro.models.model_factory import ModelFactory
from bolsonaro.models.model_parameters import ModelParameters
from bolsonaro.models.ensemble_selection_forest_regressor import EnsembleSelectionForestRegressor
from bolsonaro.trainer import Trainer
from bolsonaro.utils import resolve_experiment_id, tqdm_joblib
from bolsonaro import LOG_PATH
from bolsonaro.error_handling.logger_factory import LoggerFactory

from dotenv import find_dotenv, load_dotenv
import argparse
import json
import pathlib
import random
import os
from joblib import Parallel, delayed
import threading
from tqdm import tqdm
import numpy as np
import shutil


def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verbose):
    """
    Run the experiment for a single seed.

    Used as the base function for each worker when parallelising the
    experiment across seeds.

    :param seed_job_pb: shared tqdm progress bar, ticked once per completed seed job
    :param seed: random seed used for this job
    :param parameters: dictionary of experiment parameters
    :param experiment_id: id of the current experiment
    :param hyperparameters: model hyperparameters (best found ones or defaults)
    :param verbose: display tqdm progress bars if True
    :return: None
    """
    logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_ti{}'.format(
        seed, threading.get_ident()))

    seed_str = str(seed)
    experiment_id_str = str(experiment_id)
    models_dir = parameters['models_dir'] + os.sep + experiment_id_str + os.sep + 'seeds' + \
        os.sep + seed_str
    pathlib.Path(models_dir).mkdir(parents=True, exist_ok=True)

    dataset_parameters = DatasetParameters(
        name=parameters['dataset_name'],
        test_size=parameters['test_size'],
        dev_size=parameters['dev_size'],
        random_state=seed,
        dataset_normalizer=parameters['dataset_normalizer']
    )
    dataset_parameters.save(models_dir, experiment_id_str)
    dataset = DatasetLoader.load(dataset_parameters)

    trainer = Trainer(dataset)

    if parameters['extraction_strategy'] == 'ensemble':
        library = EnsembleSelectionForestRegressor.generate_library(dataset.X_train, dataset.y_train, random_state=seed)
    else:
        library = None
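    # The candidate-tree library is only built for the ensemble-selection
    # strategy; every other extraction strategy receives library=None.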

    if parameters['extraction_strategy'] != 'none':
        with tqdm_joblib(tqdm(total=len(parameters['extracted_forest_size']), disable=not verbose)) as extracted_forest_size_job_pb:
            Parallel(n_jobs=-1)(delayed(extracted_forest_size_job)(extracted_forest_size_job_pb, parameters['extracted_forest_size'][i],
                models_dir, seed, parameters, dataset, hyperparameters, experiment_id, trainer, library)
                for i in range(len(parameters['extracted_forest_size'])))
    else:
        forest_size = hyperparameters['n_estimators']
        logger.info('Base forest training with fixed forest size of {}'.format(forest_size))
        sub_models_dir = models_dir + os.sep + 'forest_size' + os.sep + str(forest_size)

        # Check whether a non-empty result file (.pickle) already exists
        already_exists = False
        if os.path.isdir(sub_models_dir):
            sub_models_dir_files = os.listdir(sub_models_dir)
            for file_name in sub_models_dir_files:
                if os.path.splitext(file_name)[1] != '.pickle':
                    continue
                already_exists = os.path.getsize(os.path.join(sub_models_dir, file_name)) > 0
                break
        if already_exists:
            logger.info('Base forest result already exists. Skipping...')
        else:
            pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
            model_parameters = ModelParameters(
                extracted_forest_size=forest_size,
                normalize_D=parameters['normalize_D'],
                subsets_used=parameters['subsets_used'],
                normalize_weights=parameters['normalize_weights'],
                seed=seed,
                hyperparameters=hyperparameters,
                extraction_strategy=parameters['extraction_strategy']
            )
            model_parameters.save(sub_models_dir, experiment_id)

            model = ModelFactory.build(dataset.task, model_parameters, library=library)

            trainer.init(model, subsets_used=parameters['subsets_used'])
            trainer.train(model)
            trainer.compute_results(model, sub_models_dir)
    logger.info(f'Training done for seed {seed_str}')
    seed_job_pb.update(1)

def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_size, models_dir,
    seed, parameters, dataset, hyperparameters, experiment_id, trainer, library):

    logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_extracted_forest_size{}_ti{}'.format(
        seed, extracted_forest_size, threading.get_ident()))
    logger.info('extracted_forest_size={}'.format(extracted_forest_size))

    sub_models_dir = models_dir + os.sep + 'extracted_forest_sizes' + os.sep + str(extracted_forest_size)

    # Check whether a non-empty result file (.pickle) already exists
    already_exists = False
    if os.path.isdir(sub_models_dir):
        sub_models_dir_files = os.listdir(sub_models_dir)
        for file_name in sub_models_dir_files:
            if os.path.splitext(file_name)[1] != '.pickle':
                continue
            already_exists = os.path.getsize(os.path.join(sub_models_dir, file_name)) > 0
            break
    if already_exists:
        logger.info(f'Extracted forest {extracted_forest_size} result already exists. Skipping...')
        return

    pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)

    model_parameters = ModelParameters(
        extracted_forest_size=extracted_forest_size,
        normalize_D=parameters['normalize_D'],
        subsets_used=parameters['subsets_used'],
        normalize_weights=parameters['normalize_weights'],
        seed=seed,
        hyperparameters=hyperparameters,
        extraction_strategy=parameters['extraction_strategy']
    )
    model_parameters.save(sub_models_dir, experiment_id)

    model = ModelFactory.build(dataset.task, model_parameters, library=library)

    trainer.init(model, subsets_used=parameters['subsets_used'])
    trainer.train(model)
    trainer.compute_results(model, sub_models_dir)

"""
148
Command lines example for stage 1:
149
150
151
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=none --save_experiment_configuration 1 none_with_params --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=random --save_experiment_configuration 1 random_with_params --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 1 omp_with_params --extracted_forest_size_stop=0.05
152
153
154
155
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=none --skip_best_hyperparams --save_experiment_configuration 1 none_wo_params --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=random --skip_best_hyperparams --save_experiment_configuration 1 random_wo_params --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --skip_best_hyperparams --save_experiment_configuration 1 omp_wo_params --extracted_forest_size_stop=0.05
python code/compute_results.py --stage 1 --experiment_ids 1 2 3 4 5 6 --dataset_name=california_housing
156

157
Command lines example for stage 2:
158
159
160
161
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 no_normalization --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 normalize_D --normalize_D --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 normalize_weights --normalize_weights --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 2 normalize_D_and_weights --normalize_D --normalize_weights --extracted_forest_size_stop=0.05
162
python code/compute_results.py --stage 2 --experiment_ids 7 8 9 10 --dataset_name=california_housing
163
164
165
166
167

Command lines example for stage 3:
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 3 train-dev_subset --extracted_forest_size_stop=0.05 --subsets_used train,dev
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 3 train-dev_train-dev_subset --extracted_forest_size_stop=0.05 --subsets_used train+dev,train+dev
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 3 train-train-dev_subset --extracted_forest_size_stop=0.05 --subsets_used train,train+dev
168
python code/compute_results.py --stage 3 --experiment_ids 11 12 13 --dataset_name=california_housing
Charly Lamothe's avatar
Charly Lamothe committed
169
170
171
172
173
174

Command lines example for stage 4:
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=none --save_experiment_configuration 4 none_with_params --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --extraction_strategy=random --save_experiment_configuration 4 random_with_params --extracted_forest_size_stop=0.05
python code/train.py --dataset_name=california_housing --seeds 1 2 3 4 5 --save_experiment_configuration 4 omp_with_params --extracted_forest_size_stop=0.05 --subsets_used train+dev,train+dev
python code/compute_results.py --stage 4 --experiment_ids 1 2 3 --dataset_name=california_housing
175
"""
if __name__ == "__main__":
    load_dotenv(find_dotenv('.env'))
    DEFAULT_EXPERIMENT_CONFIGURATION_PATH = 'experiments'
    # The models will be stored in a directory structure like:
    # models/{experiment_id}/seeds/{seed_nb}/extracted_forest_sizes/{extracted_forest_size}
    DEFAULT_MODELS_DIR = os.environ['project_dir'] + os.sep + 'models'
    DEFAULT_VERBOSE = False
    DEFAULT_SKIP_BEST_HYPERPARAMS = False
    DEFAULT_JOB_NUMBER = -1
    DEFAULT_EXTRACTION_STRATEGY = 'omp'
    DEFAULT_OVERWRITE = False

    begin_random_seed_range = 1
    end_random_seed_range = 2000

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--experiment_id', nargs='?', type=int, default=None, help='Specify an experiment id. Remove any already existing model with this experiment id.')
    parser.add_argument('--experiment_configuration', nargs='?', type=str, default=None, help='Specify an experiment configuration file name. Overrides all other parameters.')
    parser.add_argument('--experiment_configuration_path', nargs='?', type=str, default=DEFAULT_EXPERIMENT_CONFIGURATION_PATH, help='Specify the experiment configuration directory path.')
    parser.add_argument('--dataset_name', nargs='?', type=str, default=DatasetLoader.DEFAULT_DATASET_NAME, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
    parser.add_argument('--normalize_D', action='store_true', default=DatasetLoader.DEFAULT_NORMALIZE_D, help='Normalize the forest predictions by dividing each tree prediction vector by its L2 norm.')
    parser.add_argument('--dataset_normalizer', nargs='?', type=str, default=DatasetLoader.DEFAULT_DATASET_NORMALIZER, help='Specify which dataset normalizer to use (either standard, minmax, robust or normalizer).')
    parser.add_argument('--forest_size', nargs='?', type=int, default=None, help='The number of trees of the random forest.')
    parser.add_argument('--extracted_forest_size_samples', nargs='?', type=int, default=DatasetLoader.DEFAULT_EXTRACTED_FOREST_SIZE_SAMPLES, help='The number of extracted forest sizes (proportional to the forest size) selected by OMP.')
    parser.add_argument('--extracted_forest_size_stop', nargs='?', type=float, default=DatasetLoader.DEFAULT_EXTRACTED_FOREST_SIZE_STOP, help='Specify the upper bound of the extracted forest sizes linspace.')
    parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
    parser.add_argument('--dev_size', nargs='?', type=float, default=DatasetLoader.DEFAULT_DEV_SIZE, help='Dev subset ratio.')
    parser.add_argument('--test_size', nargs='?', type=float, default=DatasetLoader.DEFAULT_TEST_SIZE, help='Test subset ratio.')
    parser.add_argument('--random_seed_number', nargs='?', type=int, default=DatasetLoader.DEFAULT_RANDOM_SEED_NUMBER, help='Number of random seeds used.')
    parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specify a list of seeds instead of generating them randomly.')
    parser.add_argument('--subsets_used', nargs='?', type=str, default=DatasetLoader.DEFAULT_SUBSETS_USED, help='train,dev: forest on train, OMP on dev. train+dev,train+dev: both forest and OMP on train+dev. train,train+dev: forest on train, OMP on train+dev.')
    parser.add_argument('--normalize_weights', action='store_true', default=DatasetLoader.DEFAULT_NORMALIZE_WEIGHTS, help='Divide the predictions by the sum of the weights.')
    parser.add_argument('--verbose', action='store_true', default=DEFAULT_VERBOSE, help='Print tqdm progress bars.')
    parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters even if they exist.')
    parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line to a file. Args: {{stage_num}} {{name}}')
    parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of jobs used for the parallelisation across seeds.')
    parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy used to extract the trees from the forest. Either omp, random, none, similarity, kmeans or ensemble.')
    parser.add_argument('--overwrite', action='store_true', default=DEFAULT_OVERWRITE, help='Overwrite the experiment id.')
    args = parser.parse_args()

    if args.experiment_configuration:
        with open(args.experiment_configuration_path + os.sep + \
            args.experiment_configuration + '.json', 'r') as input_file:
            parameters = json.load(input_file)
    else:
        parameters = args.__dict__

    if parameters['extraction_strategy'] not in ['omp', 'random', 'none', 'similarity', 'kmeans', 'ensemble']:
        raise ValueError('Specified extraction strategy {} is not supported.'.format(parameters['extraction_strategy']))

    pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True)

    logger = LoggerFactory.create(LOG_PATH, os.path.basename(__file__))

    hyperparameters_path = os.path.join('experiments', args.dataset_name, 'stage1', 'params.json')
    if os.path.exists(hyperparameters_path):
        logger.info("Hyperparameters found for this dataset at '{}'".format(hyperparameters_path))
        with open(hyperparameters_path, 'r') as file_hyperparameter:
            loaded_hyperparameters = json.load(file_hyperparameter)['best_parameters']
            if args.skip_best_hyperparams:
                hyperparameters = {'n_estimators': loaded_hyperparameters['n_estimators']}
            else:
                hyperparameters = loaded_hyperparameters
    else:
        hyperparameters = {}

    """
    First case: no best hyperparameters are specified and no forest_size parameter
    is specified in argument, so use the DEFAULT_FOREST_SIZE.
    Second case: no matter if hyperparameters are specified, the forest_size parameter
    will override it.
    Third implicit case: use the number of estimators found in the specified hyperparameters.
    """
    if len(hyperparameters) == 0 and parameters['forest_size'] is None:
        hyperparameters['n_estimators'] = DatasetLoader.DEFAULT_FOREST_SIZE
    elif parameters['forest_size'] is not None:
        hyperparameters['n_estimators'] = parameters['forest_size']
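    # For instance, passing --forest_size=500 on the command line forces
    # n_estimators=500 even when best stage-1 hyperparameters were found.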

    # The numbers of trees to extract from the forest (K)
    parameters['extracted_forest_size'] = np.unique(np.around(hyperparameters['n_estimators'] *
        np.linspace(0, args.extracted_forest_size_stop,
        parameters['extracted_forest_size_samples'] + 1,
        endpoint=True)[1:]).astype(int)).tolist()
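    # e.g. with n_estimators=100, extracted_forest_size_stop=0.05 and
    # extracted_forest_size_samples=5, the linspace is [0, 0.01, ..., 0.05];
    # dropping the leading 0 and scaling by 100 gives K values [1, 2, 3, 4, 5].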

    if parameters['seeds'] is not None and parameters['random_seed_number'] > 1:
        logger.warning('seeds and random_seed_number parameters are both specified. Seeds will be used.')

    # Seeds are either provided as parameters or generated at random
    seeds = parameters['seeds'] if parameters['seeds'] is not None \
        else [random.randint(begin_random_seed_range, end_random_seed_range) \
        for i in range(parameters['random_seed_number'])]

    if args.experiment_id:
        experiment_id = args.experiment_id
        if args.overwrite:
            shutil.rmtree(os.path.join(parameters['models_dir'], str(experiment_id)), ignore_errors=True)
    else:
        # Resolve the next experiment id number (last id + 1)
        experiment_id = resolve_experiment_id(parameters['models_dir'])
    logger.info('Experiment id: {}'.format(experiment_id))

    """
    If the experiment configuration isn't coming from an already existing file,
    save it to a json file to keep track of it, either at a specified path or
    in the 'unnamed' directory.
    """
    if args.experiment_configuration is None:
        if args.save_experiment_configuration:
            if len(args.save_experiment_configuration) != 2:
                raise ValueError('save_experiment_configuration must have two parameters.')
            elif int(args.save_experiment_configuration[0]) not in list(range(1, 6)):
                raise ValueError('save_experiment_configuration first parameter must be a supported stage id (i.e. [1, 5]).')
            output_experiment_stage_path = os.path.join(args.experiment_configuration_path,
                args.dataset_name, 'stage' + args.save_experiment_configuration[0])
            pathlib.Path(output_experiment_stage_path).mkdir(parents=True, exist_ok=True)
            output_experiment_configuration_path = os.path.join(output_experiment_stage_path,
                args.save_experiment_configuration[1] + '.json')
        else:
            pathlib.Path(os.path.join(args.experiment_configuration_path, 'unnamed')).mkdir(parents=True, exist_ok=True)
            output_experiment_configuration_path = os.path.join(
                args.experiment_configuration_path, 'unnamed', 'unnamed_{}.json'.format(
                experiment_id))
        with open(output_experiment_configuration_path, 'w') as output_file:
            json.dump(
                parameters,
                output_file,
                indent=4
            )

    # Run as many jobs as there are seeds
    with tqdm_joblib(tqdm(total=len(seeds), disable=not args.verbose)) as seed_job_pb:
        Parallel(n_jobs=args.job_number)(delayed(seed_job)(seed_job_pb, seeds[i],
            parameters, experiment_id, hyperparameters, args.verbose) for i in range(len(seeds)))
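    # tqdm_joblib (from bolsonaro.utils) presumably makes the shared tqdm bar
    # safe to use alongside joblib workers; each seed_job ticks it once via
    # seed_job_pb.update(1) on completion.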