
Resolve "integration-sota"

Merged Charly Lamothe requested to merge 15-integration-sota into master
1 file changed
@@ -21,7 +21,7 @@ import numpy as np
+import shutil

-def process_job(seed, parameters, experiment_id, hyperparameters):
+def seed_job(seed_job_pb, seed, parameters, experiment_id, hyperparameters, verbose):
     """
     Experiment function.
@@ -34,7 +34,6 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
     """
     logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_ti{}'.format(
         seed, threading.get_ident()))
-    logger.info('seed={}'.format(seed))
     seed_str = str(seed)
     experiment_id_str = str(experiment_id)
@@ -55,13 +54,31 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
     trainer = Trainer(dataset)
     if parameters['extraction_strategy'] != 'none':
-        for extracted_forest_size in parameters['extracted_forest_size']:
-            logger.info('extracted_forest_size={}'.format(extracted_forest_size))
-            sub_models_dir = models_dir + os.sep + 'extracted_forest_sizes' + os.sep + str(extracted_forest_size)
-            pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
+        with tqdm_joblib(tqdm(total=len(parameters['extracted_forest_size']), disable=not verbose)) as extracted_forest_size_job_pb:
+            Parallel(n_jobs=-1)(delayed(extracted_forest_size_job)(extracted_forest_size_job_pb, parameters['extracted_forest_size'][i],
+                models_dir, seed, parameters, dataset, hyperparameters, experiment_id, trainer)
+                for i in range(len(parameters['extracted_forest_size'])))
+    else:
+        forest_size = hyperparameters['n_estimators']
+        logger.info('Base forest training with fixed forest size of {}'.format(forest_size))
+        sub_models_dir = models_dir + os.sep + 'forest_size' + os.sep + str(forest_size)
+        # Check if the result file already exists
+        already_exists = False
+        if os.path.isdir(sub_models_dir):
+            sub_models_dir_files = os.listdir(sub_models_dir)
+            for file_name in sub_models_dir_files:
+                if '.pickle' != os.path.splitext(file_name)[1]:
+                    continue
+                else:
+                    already_exists = os.path.getsize(os.path.join(sub_models_dir, file_name)) > 0
+                    break
+        if already_exists:
+            logger.info('Base forest result already exists. Skipping...')
+        else:
+            pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
             model_parameters = ModelParameters(
-                extracted_forest_size=extracted_forest_size,
+                extracted_forest_size=forest_size,
                 normalize_D=parameters['normalize_D'],
                 subsets_used=parameters['subsets_used'],
                 normalize_weights=parameters['normalize_weights'],
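
A note on the helper used above: tqdm_joblib is not part of this diff. Because each job receives the bar object and ticks it with update(1) itself, a minimal sketch consistent with that usage could look like this (an assumption about the helper, not necessarily this repository's implementation):

    import contextlib
    from tqdm import tqdm

    @contextlib.contextmanager
    def tqdm_joblib(tqdm_object):
        # Hand the bar to the caller so each parallel job can call update(1),
        # and make sure it is closed once the Parallel block exits.
        try:
            yield tqdm_object
        finally:
            tqdm_object.close()

Manual updates from inside the jobs behave well with joblib's threading backend; under the default process-based backend, a common alternative instead patches joblib.parallel.BatchCompletionCallBack so the bar advances in the parent process.
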
@@ -76,14 +93,36 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
             trainer.init(model, subsets_used=parameters['subsets_used'])
             trainer.train(model)
             trainer.compute_results(model, sub_models_dir)
+    logger.info(f'Training done for seed {seed_str}')
+    seed_job_pb.update(1)
+
+def extracted_forest_size_job(extracted_forest_size_job_pb, extracted_forest_size, models_dir,
+        seed, parameters, dataset, hyperparameters, experiment_id, trainer):
+    logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_extracted_forest_size{}_ti{}'.format(
+        seed, extracted_forest_size, threading.get_ident()))
+    logger.info('extracted_forest_size={}'.format(extracted_forest_size))
+    sub_models_dir = models_dir + os.sep + 'extracted_forest_sizes' + os.sep + str(extracted_forest_size)
+    # Check if the result file already exists
+    already_exists = False
+    if os.path.isdir(sub_models_dir):
+        sub_models_dir_files = os.listdir(sub_models_dir)
+        for file_name in sub_models_dir_files:
+            if '.pickle' != os.path.splitext(file_name)[1]:
+                continue
+            else:
+                already_exists = os.path.getsize(os.path.join(sub_models_dir, file_name)) > 0
+                break
+    if already_exists:
+        logger.info(f'Extracted forest {extracted_forest_size} result already exists. Skipping...')
+        return
+    pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
-    else:
-        forest_size = hyperparameters['n_estimators']
-        logger.info('Base forest training with fixed forest size of {}'.format(forest_size))
-        sub_models_dir = models_dir + os.sep + 'forest_size' + os.sep + str(forest_size)
     model_parameters = ModelParameters(
-        extracted_forest_size=forest_size,
+        extracted_forest_size=extracted_forest_size,
         normalize_D=parameters['normalize_D'],
         subsets_used=parameters['subsets_used'],
         normalize_weights=parameters['normalize_weights'],
@@ -98,7 +137,6 @@ def process_job(seed, parameters, experiment_id, hyperparameters):
     trainer.init(model, subsets_used=parameters['subsets_used'])
     trainer.train(model)
     trainer.compute_results(model, sub_models_dir)
-    logger.info('Training done')

 """
 Command lines example for stage 1:
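
The non-empty .pickle check is now duplicated verbatim in seed_job and extracted_forest_size_job. It could be factored into a small helper along these lines (results_already_exist is a hypothetical name, not something this MR introduces):

    import os

    def results_already_exist(sub_models_dir):
        # Same behaviour as the inline check: the first .pickle file found
        # decides the outcome, and it must be non-empty.
        if not os.path.isdir(sub_models_dir):
            return False
        for file_name in os.listdir(sub_models_dir):
            if os.path.splitext(file_name)[1] == '.pickle':
                return os.path.getsize(os.path.join(sub_models_dir, file_name)) > 0
        return False

As in the diff, only the first .pickle encountered is inspected, and os.listdir order is arbitrary, so a directory holding one empty and one complete result file may or may not be skipped.
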
@@ -138,6 +176,7 @@ if __name__ == "__main__":
     DEFAULT_SKIP_BEST_HYPERPARAMS = False
     DEFAULT_JOB_NUMBER = -1
     DEFAULT_EXTRACTION_STRATEGY = 'omp'
+    DEFAULT_OVERWRITE = False

     begin_random_seed_range = 1
     end_random_seed_range = 2000
@@ -163,7 +202,8 @@ if __name__ == "__main__":
     parser.add_argument('--skip_best_hyperparams', action='store_true', default=DEFAULT_SKIP_BEST_HYPERPARAMS, help='Do not use the best hyperparameters if they exist.')
     parser.add_argument('--save_experiment_configuration', nargs='+', default=None, help='Save the experiment parameters specified in the command line in a file. Args: {{stage_num}} {{name}}')
     parser.add_argument('--job_number', nargs='?', type=int, default=DEFAULT_JOB_NUMBER, help='Specify the number of jobs used for the parallelisation across the seeds.')
-    parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none or similarity.')
+    parser.add_argument('--extraction_strategy', nargs='?', type=str, default=DEFAULT_EXTRACTION_STRATEGY, help='Specify the strategy to apply to extract the trees from the forest. Either omp, random, none, similarity or kmeans.')
+    parser.add_argument('--overwrite', action='store_true', default=DEFAULT_OVERWRITE, help='Overwrite the models of the specified experiment id.')

     args = parser.parse_args()

     if args.experiment_configuration:
@@ -173,7 +213,7 @@ if __name__ == "__main__":
     else:
         parameters = args.__dict__

-    if parameters['extraction_strategy'] not in ['omp', 'random', 'none', 'similarity']:
+    if parameters['extraction_strategy'] not in ['omp', 'random', 'none', 'similarity', 'kmeans']:
         raise ValueError('Specified extraction strategy {} is not supported.'.format(parameters['extraction_strategy']))

     pathlib.Path(parameters['models_dir']).mkdir(parents=True, exist_ok=True)
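
The kmeans strategy itself is implemented elsewhere in the codebase and is not visible in this diff. Purely as an illustration of what such a strategy can do (a sketch under assumptions, not the MR's implementation), one plausible variant clusters the trees by their prediction vectors and keeps one representative per cluster:

    import numpy as np
    from sklearn.cluster import KMeans

    def kmeans_tree_selection(forest, X, extracted_forest_size, seed=None):
        # One row per tree: its predictions on a reference subset X.
        predictions = np.array([tree.predict(X) for tree in forest.estimators_])
        kmeans = KMeans(n_clusters=extracted_forest_size, random_state=seed).fit(predictions)
        selected = []
        for k in range(extracted_forest_size):
            members = np.flatnonzero(kmeans.labels_ == k)
            # Keep the member tree whose predictions lie closest to the centroid.
            distances = np.linalg.norm(predictions[members] - kmeans.cluster_centers_[k], axis=1)
            selected.append(members[np.argmin(distances)])
        return [forest.estimators_[i] for i in selected]
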
@@ -220,6 +260,7 @@ if __name__ == "__main__":
     if args.experiment_id:
         experiment_id = args.experiment_id
+        if args.overwrite:
+            shutil.rmtree(os.path.join(parameters['models_dir'], str(experiment_id)), ignore_errors=True)
     else:
         # Resolve the next experiment id number (last id + 1)
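
The body of this else branch is truncated by the hunk. For illustration, resolving "last id + 1" can be as simple as the following hypothetical sketch, assuming experiment ids are numeric directory names under models_dir:

    import os

    def next_experiment_id(models_dir):
        # Largest numeric directory name plus one; start at 1 when empty.
        ids = [int(name) for name in os.listdir(models_dir) if name.isdigit()]
        return max(ids) + 1 if ids else 1
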
@@ -255,6 +296,6 @@ if __name__ == "__main__":
     )

     # Run as many jobs as there are seeds
-    with tqdm_joblib(tqdm(total=len(seeds), disable=not args.verbose)) as progress_bar:
-        Parallel(n_jobs=args.job_number)(delayed(process_job)(seeds[i],
-            parameters, experiment_id, hyperparameters) for i in range(len(seeds)))
+    with tqdm_joblib(tqdm(total=len(seeds), disable=not args.verbose)) as seed_job_pb:
+        Parallel(n_jobs=args.job_number)(delayed(seed_job)(seed_job_pb, seeds[i],
+            parameters, experiment_id, hyperparameters, args.verbose) for i in range(len(seeds)))
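
An example invocation exercising the options touched by this MR (the script name train.py and all argument values are illustrative; only the flags themselves appear in the diff):

    python train.py --extraction_strategy kmeans --experiment_id 4 --overwrite --job_number 4 --verbose

With --overwrite, the models directory of experiment 4 is deleted before the seeds are retrained; without it, jobs whose non-empty .pickle results already exist are skipped.
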