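"""Compute the result plots of one or several experiments.

For each experiment id, this script loads the per-seed raw results stored
under models/{experiment_id}/seeds/ and plots the train/dev/test losses
against the extracted forest size into results/{experiment_id}/.
"""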
import argparse
import os
import pathlib

from dotenv import find_dotenv, load_dotenv

from bolsonaro.data.dataset_parameters import DatasetParameters
from bolsonaro.data.dataset_loader import DatasetLoader
from bolsonaro.models.model_raw_results import ModelRawResults
from bolsonaro.models.model_factory import ModelFactory
from bolsonaro.visualization.plotter import Plotter


if __name__ == "__main__":
    # Load the environment variables defined in the .env.example file
    load_dotenv(find_dotenv('.env.example'))

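    # The default directories are derived from project_dir, which is expected
    # to be defined in the environment (e.g. by the env file loaded above)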
    DEFAULT_RESULTS_DIR = os.environ["project_dir"] + os.sep + 'results'
    DEFAULT_MODELS_DIR = os.environ["project_dir"] + os.sep + 'models'
    DEFAULT_EXPERIMENT_IDS = None

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--results_dir', nargs='?', type=str, default=DEFAULT_RESULTS_DIR, help='The output directory of the results.')
    parser.add_argument('--models_dir', nargs='?', type=str, default=DEFAULT_MODELS_DIR, help='The output directory of the trained models.')
    parser.add_argument('--experiment_ids', nargs='+', type=int, default=DEFAULT_EXPERIMENT_IDS, help='Compute the results of the specified experiment id(s).')
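
    # Example invocation (the ids are hypothetical; without --experiment_ids,
    # every experiment id found in the models directory is processed):
    #   python compute_results.py --experiment_ids 1 2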
    args = parser.parse_args()

    # Recursively create the results directory tree
    pathlib.Path(args.results_dir).mkdir(parents=True, exist_ok=True)

    """
    Use specified list of experiments ids if availabe.
    Otherwise, list all existing experiment ids from
    the specified models directory.
    """
    experiments_ids = [str(experiment_id) for experiment_id in args.experiment_ids] \
        if args.experiment_ids is not None \
        else os.listdir(args.models_dir)

    """
    Raise an error if there's no experiments ids found both
    in parameter or in models directory.
    """
    if experiments_ids is None or len(experiments_ids) == 0:
        raise ValueError("No experiment id was found or specified.")

    # Compute the plots for each experiment id
    for experiment_id in experiments_ids:
        experiment_id_path = args.models_dir + os.sep + experiment_id # models/{experiment_id}
        # Recursively create the tree results/{experiment_id}
        pathlib.Path(args.results_dir + os.sep + experiment_id).mkdir(parents=True, exist_ok=True)
        experiment_seed_root_path = experiment_id_path + os.sep + 'seeds' # models/{experiment_id}/seeds

        """
        Dictionaries to temporarly store the scalar results with the following structure:
        {seed_1: [score_1, ..., score_m], ... seed_n: [score_1, ..., score_k]}
        TODO: to complete to retreive more results
        """
        experiment_train_scores = dict()
        experiment_dev_scores = dict()
        experiment_test_scores = dict()

        # Used to check if all losses were computed using the same metric (it should be the case)
        experiment_score_metrics = list()

        # For each seed whose results are stored in models/{experiment_id}/seeds
        for seed in os.listdir(experiment_seed_root_path):
            experiment_seed_path = experiment_seed_root_path + os.sep + seed # models/{experiment_id}/seeds/{seed}
            dataset_parameters = DatasetParameters.load(experiment_seed_path, experiment_id) # Load the dataset parameters of this experiment, with this specific seed
            dataset = DatasetLoader.load(dataset_parameters) # Load the dataset using the previously loaded dataset parameters
            extracted_forest_size_root_path = experiment_seed_path + os.sep + 'extracted_forest_size' # models/{experiment_id}/seeds/{seed}/extracted_forest_size

            # Initialize this seed's score lists: {seed: []}
            experiment_train_scores[seed] = list()
            experiment_dev_scores[seed] = list()
            experiment_test_scores[seed] = list()

            # List the forest sizes in models/{experiment_id}/seeds/{seed}/extracted_forest_size
            # os.listdir gives no ordering guarantee, so sort numerically (assuming
            # the directory names are integer forest sizes) to keep the x-axis monotonic
            extracted_forest_sizes = sorted(os.listdir(extracted_forest_size_root_path), key=int)
            for extracted_forest_size in extracted_forest_sizes:
                # models/{experiment_id}/seeds/{seed}/extracted_forest_size/{extracted_forest_size}
                extracted_forest_size_path = extracted_forest_size_root_path + os.sep + extracted_forest_size
                # Load models/{experiment_id}/seeds/{seed}/extracted_forest_size/{extracted_forest_size}/model_raw_results.pickle file
                model_raw_results = ModelRawResults.load(extracted_forest_size_path)
                # Load [...]/model_parameters.json file and build the model using these parameters and the weights and forest from model_raw_results.pickle
                model = ModelFactory.load(dataset.task, extracted_forest_size_path, experiment_id, model_raw_results)
                # Temporarily save some raw results (TODO: complete to retrieve more results)
                experiment_train_scores[seed].append(model_raw_results.train_score)
                experiment_dev_scores[seed].append(model_raw_results.dev_score)
                experiment_test_scores[seed].append(model_raw_results.test_score)
                experiment_score_metrics.append(model_raw_results.score_metric)

        if len(set(experiment_score_metrics)) > 1:
            raise ValueError("The score metric isn't the same across all experiment results.")

        """
        Example of plot that just plots the losses computed
        on the train, dev and test subsets using a trained
        model, with the CI, and depending on the extracted
        forest size.
        """
        Plotter.plot_losses(
            file_path=args.results_dir + os.sep + experiment_id + os.sep + 'losses.png',
            all_experiment_scores=[experiment_train_scores, experiment_dev_scores, experiment_test_scores],
            x_value=extracted_forest_sizes,
            xlabel='Number of trees extracted',
            ylabel=experiment_score_metrics[0],
            all_labels=['train', 'dev', 'test'],
            title='Loss values of the trained model'
        )