Skip to content
Snippets Groups Projects
Commit cb0030d8 authored by Charly LAMOTHE's avatar Charly LAMOTHE
Browse files

- Run all the computations of a given seed in a dedicated job;

- Use as much CPU as possible when training a random forest regressor.
parent 7dd2aab5
No related branches found
No related tags found
1 merge request!3clean scripts
......@@ -13,7 +13,7 @@ class OmpForestRegressor(BaseEstimator):
def __init__(self, models_parameters):
    """Build the underlying random-forest regressor.

    Parameters
    ----------
    models_parameters :
        Expected to expose ``forest_size`` (number of trees in the
        forest) and ``seed`` (random state) attributes.
    """
    # n_jobs=-1: use every available CPU core when fitting the forest.
    self._regressor = RandomForestRegressor(
        n_estimators=models_parameters.forest_size,
        random_state=models_parameters.seed,
        n_jobs=-1)
    self._models_parameters = models_parameters
    self._logger = LoggerFactory.create(LOG_PATH, __name__)
......
......@@ -12,9 +12,53 @@ import argparse
import pathlib
import random
import os
from tqdm import tqdm
from concurrent import futures
import threading
def process_job(seed, args, experiment_id):
    """Run all the trainings of a single seed in a dedicated worker process.

    Creates a per-seed logger and output directory, loads the dataset
    split with ``seed``, then trains one model per value of
    ``args.extracted_forest_size``, saving parameters and models under
    ``{args.models_dir}/{experiment_id}/seeds/{seed}/``.

    :param seed: random seed driving the dataset split and the models.
    :param args: parsed command-line arguments of the training script.
    :param experiment_id: identifier of the current experiment run.
    """
    # One logger per (seed, thread) so concurrent workers don't interleave
    # their log records under the same name.
    logger = LoggerFactory.create(LOG_PATH, 'training_seed{}_ti{}'.format(
        seed, threading.get_ident()))
    logger.info('seed={}'.format(seed))

    experiment_id_str = str(experiment_id)
    models_dir = os.path.join(args.models_dir, experiment_id_str,
                              'seeds', str(seed))
    pathlib.Path(models_dir).mkdir(parents=True, exist_ok=True)

    dataset_parameters = DatasetParameters(
        name=args.dataset_name,
        test_size=args.test_size,
        dev_size=args.dev_size,
        random_state=seed,
        dataset_normalizer=args.dataset_normalizer
    )
    dataset_parameters.save(models_dir, experiment_id_str)

    dataset = DatasetLoader.load(dataset_parameters)
    trainer = Trainer(dataset)

    for extracted_forest_size in args.extracted_forest_size:
        logger.info('extracted_forest_size={}'.format(extracted_forest_size))
        sub_models_dir = os.path.join(models_dir, 'extracted_forest_size',
                                      str(extracted_forest_size))
        pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
        model_parameters = ModelParameters(
            forest_size=args.forest_size,
            extracted_forest_size=extracted_forest_size,
            normalize_D=args.normalize_D,
            subsets_used=args.subsets_used,
            normalize_weights=args.normalize_weights,
            seed=seed
        )
        # NOTE(review): originally passed the raw experiment_id here while
        # dataset_parameters.save above received the string form; pass the
        # string in both places for consistency — confirm save() accepts it.
        model_parameters.save(sub_models_dir, experiment_id_str)
        model = ModelFactory.build(dataset.task, model_parameters)
        trainer.train(model, sub_models_dir)
    logger.info('Training done')
if __name__ == "__main__":
# get environment variables in .env
load_dotenv(find_dotenv('.env.example'))
......@@ -30,7 +74,6 @@ if __name__ == "__main__":
DEFAULT_TEST_SIZE = 0.2
DEFAULT_RANDOM_SEED_NUMBER = 1
DEFAULT_SUBSETS_USED = 'train,dev'
DEFAULT_DISABLE_PROGRESS = False
DEFAULT_normalize_weights = False
begin_random_seed_range = 1
......@@ -48,7 +91,6 @@ if __name__ == "__main__":
parser.add_argument('--random_seed_number', nargs='?', type=int, default=DEFAULT_RANDOM_SEED_NUMBER, help='Number of random seeds used.')
parser.add_argument('--seeds', nargs='+', type=int, default=None, help='Specific a list of seeds instead of generate them randomly')
parser.add_argument('--subsets_used', nargs='+', type=str, default=DEFAULT_SUBSETS_USED, help='train,dev: forest on train, OMP on dev. train+dev,train+dev: both forest and OMP on train+dev. train,train+dev: forest on train+dev and OMP on dev.')
parser.add_argument('--disable_progress', action='store_true', default=DEFAULT_DISABLE_PROGRESS, help='Disable the progress bars.')
parser.add_argument('--normalize_weights', action='store_true', default=DEFAULT_normalize_weights, help='Divide the predictions by the weights sum.')
args = parser.parse_args()
......@@ -68,47 +110,9 @@ if __name__ == "__main__":
for i in range(args.random_seed_number)]
experiment_id = resolve_experiment_id(args.models_dir)
experiment_id_str = str(experiment_id)
logger.info('Experiment id: {}'.format(experiment_id_str))
with tqdm(seeds, disable=args.disable_progress) as seed_bar:
for seed in seed_bar:
seed_bar.set_description('seed={}'.format(seed))
seed_str = str(seed)
models_dir = args.models_dir + os.sep + experiment_id_str + os.sep + 'seeds' + \
os.sep + seed_str
pathlib.Path(models_dir).mkdir(parents=True, exist_ok=True)
dataset_parameters = DatasetParameters(
name=args.dataset_name,
test_size=args.test_size,
dev_size=args.dev_size,
random_state=seed,
dataset_normalizer=args.dataset_normalizer
)
dataset_parameters.save(models_dir, experiment_id_str)
dataset = DatasetLoader.load(dataset_parameters)
trainer = Trainer(dataset)
with tqdm(args.extracted_forest_size, disable=args.disable_progress) as extracted_forest_size_bar:
for extracted_forest_size in extracted_forest_size_bar:
extracted_forest_size_bar.set_description('extracted_forest_size={}'.format(extracted_forest_size))
sub_models_dir = models_dir + os.sep + 'extracted_forest_size' + os.sep + str(extracted_forest_size)
pathlib.Path(sub_models_dir).mkdir(parents=True, exist_ok=True)
model_parameters = ModelParameters(
forest_size=args.forest_size,
extracted_forest_size=extracted_forest_size,
normalize_D=args.normalize_D,
subsets_used=args.subsets_used,
normalize_weights=args.normalize_weights,
seed=seed
)
model_parameters.save(sub_models_dir, experiment_id)
model = ModelFactory.build(dataset.task, model_parameters)
trainer.train(model, sub_models_dir)
logger.info('Experiment id: {}'.format(experiment_id))
with futures.ProcessPoolExecutor(len(seeds)) as executor:
list(f.result() for f in futures.as_completed(executor.submit(process_job, seed,
args, experiment_id) for seed in seeds))
......@@ -10,6 +10,5 @@ flake8
python-dotenv>=0.5.1
scikit-learn
python-dotenv
tqdm
matplotlib
pandas
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment