from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.dataset_loader import DatasetLoader
from bolsonaro.models.model_factory import ModelFactory
from bolsonaro.trainer import Trainer
import argparse
import pathlib
import random
if __name__ == "__main__":
    # CLI entry point: for each drawn random seed, load the requested dataset
    # split and train one model per requested extracted (OMP-pruned) forest size.

    default_dataset_name = 'boston'
    default_normalize = False
    default_forest_size = 100
    # nargs='+' makes args.extracted_forest_size a list when given on the CLI,
    # and the loops below iterate it — so the default must be a list too.
    # (The original bare int 10 made the inner for-loop raise TypeError.)
    default_extracted_forest_size = [10]
    default_results_dir = 'results'
    default_models_dir = 'models'
    default_dev_size = 0.2
    default_test_size = 0.2
    default_use_random_seed = True
    default_random_seed_number = 1
    begin_random_seed_range = 1
    end_random_seed_range = 2000

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--dataset_name', nargs='?', type=str, default=default_dataset_name, help='Specify the dataset. Regression: boston, diabetes, linnerud, california_housing. Classification: iris, digits, wine, breast_cancer, olivetti_faces, 20newsgroups, 20newsgroups_vectorized, lfw_people, lfw_pairs, covtype, rcv1, kddcup99.')
    parser.add_argument('--normalize', action='store_true', default=default_normalize, help='Normalize the data by doing the L2 division of the pred vectors.')
    parser.add_argument('--forest_size', nargs='?', type=int, default=default_forest_size, help='The number of trees of the random forest.')
    parser.add_argument('--extracted_forest_size', nargs='+', type=int, default=default_extracted_forest_size, help='The number of trees selected by OMP.')
    parser.add_argument('--results_dir', nargs='?', type=str, default=default_results_dir, help='The output directory of the results.')
    parser.add_argument('--models_dir', nargs='?', type=str, default=default_models_dir, help='The output directory of the trained models.')
    parser.add_argument('--dev_size', nargs='?', type=float, default=default_dev_size, help='Dev subset ratio')
    parser.add_argument('--test_size', nargs='?', type=float, default=default_test_size, help='Test subset ratio')
    # NOTE(review): action='store_true' with default=True means this flag can
    # never be switched off from the command line — confirm intended.
    parser.add_argument('--use_random_seed', action='store_true', default=default_use_random_seed, help='Random seed used for the data split')
    parser.add_argument('--random_seed_number', nargs='?', type=int, default=default_random_seed_number, help='Number of random seeds used')
    args = parser.parse_args()

    # Ensure the output directories exist before any training writes to them.
    pathlib.Path(args.results_dir).mkdir(parents=True, exist_ok=True)
    pathlib.Path(args.models_dir).mkdir(parents=True, exist_ok=True)

    # Draw the requested number of seeds. Without --use_random_seed, fall back
    # to a single unseeded run ([None]) instead of setting random_seeds to None,
    # which made the loop below raise TypeError in the original.
    if args.use_random_seed:
        random_seeds = [random.randint(begin_random_seed_range, end_random_seed_range)
                        for _ in range(args.random_seed_number)]
    else:
        random_seeds = [None]

    for random_seed in random_seeds:
        dataset = DatasetLoader.load_from_name(
            DatasetParameters(
                name=args.dataset_name,
                test_size=args.test_size,
                dev_size=args.dev_size,
                random_state=random_seed,
                normalize=args.normalize
            )
        )
        for extracted_forest_size in args.extracted_forest_size:
            model = ModelFactory(
                task=dataset.task,
                forest_size=args.forest_size,
                extracted_forest_size=extracted_forest_size,
                seed=random_seed
            )
            trainer = Trainer(
                dataset=dataset,
                model=model,
                results_dir=args.results_dir,
                models_dir=args.models_dir
            )
            trainer.process()