Commit 1379c412 authored by Léo Bouscarrat

Change for multiclass

parent c80ddd61
Merge requests: !11 "Resolve 'Correction of multiclass classif'", !9 "Resolve 'Experiment pipeline'"
Showing 88 additions and 58 deletions
......@@ -24,6 +24,7 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta):
return self._base_forest_estimator.score(X, y)
def _base_estimator_predictions(self, X):
# We need to use predict_proba to get the probabilities of each class; predict only returns hard labels
return np.array([tree.predict(X) for tree in self._base_forest_estimator.estimators_]).T
@property
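For context on the comment above, here is a shape sketch using scikit-learn's standard estimator API (not code from this repository): predict returns one hard label per sample per tree, while predict_proba adds a class dimension, which is what the multiclass predict path below relies on.

import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

X, y = load_iris(return_X_y=True)
forest = RandomForestClassifier(n_estimators=10).fit(X, y)

# hard labels: shape (n_samples, n_trees) after the transpose
hard = np.array([tree.predict(X) for tree in forest.estimators_]).T

# per-class probabilities: shape (n_classes, n_samples, n_trees) after the transpose
proba = np.array([tree.predict_proba(X) for tree in forest.estimators_]).T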
......@@ -66,7 +67,7 @@ class OmpForest(BaseEstimator, metaclass=ABCMeta):
if normalize_weights:
# we can normalize weights (by their sum) so that they sum to 1
# and they can be interpreted as impact percentages for interpretability.
# this requires removing the (-) sign from the weights, i.e. moving it to the predictions (use unsigned_coef)
# this requires removing the (-) sign from the weights, i.e. moving it to the predictions (use unsigned_coef) --> I don't see why
# question: I don't understand the nonzero part?
# predictions = self._omp.predict(forest_predictions) * (1 / (np.sum(self._omp.coef_) / len(np.nonzero(self._omp.coef_))))
......
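A worked illustration of the normalization described in the comment above (an interpretation sketch, not code from the commit): folding the sign of each OMP coefficient into the predictions leaves non-negative weights that can be normalized to sum to 1 and read as per-tree impact percentages.

import numpy as np

coef = np.array([1.0, -0.6, 0.0, 0.4])   # OMP weights, possibly negative
signs = np.sign(coef)                     # the (-) moves onto the predictions
unsigned = signs * coef                   # non-negative weights: [1.0, 0.6, 0.0, 0.4]
normalized = unsigned / unsigned.sum()    # sums to 1: [0.5, 0.3, 0.0, 0.2]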
......@@ -60,7 +60,7 @@ class OmpForestMulticlassClassifier(OmpForest):
for class_label in possible_classes:
atoms_binary = binarize_class_data(atoms, class_label, inplace=False)
objective_binary = binarize_class_data(objective, class_label, inplace=False)
# todo: maybe consider that the forest size is global, so only a fraction of it is available to each OMP...
# TODO: maybe consider that the forest size is global, so only a fraction of it is available to each OMP...
omp_class = OrthogonalMatchingPursuit(
n_nonzero_coefs=self.models_parameters.extracted_forest_size,
fit_intercept=True, normalize=False)
......@@ -69,7 +69,9 @@ class OmpForestMulticlassClassifier(OmpForest):
return self._dct_class_omp
def predict(self, X):
forest_predictions = self._base_estimator_predictions(X)
'''forest_predictions = self._base_estimator_predictions(X)
print(forest_predictions.shape)
if self._models_parameters.normalize_D:
forest_predictions /= self._forest_norms
......@@ -79,9 +81,26 @@ class OmpForestMulticlassClassifier(OmpForest):
for class_label, omp_class in self._dct_class_omp.items():
label_names.append(class_label)
atoms_binary = binarize_class_data(forest_predictions, class_label, inplace=False)
print(atoms_binary.shape)
preds.append(self._make_omp_weighted_prediction(atoms_binary, omp_class, self._models_parameters.normalize_weights))
# todo: check that there is no bug here
# TODO: check that there is no bug here
preds = np.array(preds).T'''
forest_predictions = np.array([tree.predict_proba(X) for tree in self._base_forest_estimator.estimators_]).T
if self._models_parameters.normalize_D:
forest_predictions /= self._forest_norms
label_names = []
preds = []
num_class = 0
for class_label, omp_class in self._dct_class_omp.items():
label_names.append(class_label)
atoms_binary = (forest_predictions[num_class] - 0.5) * 2  # rescale tree probabilities from [0, 1] to [-1, +1]
preds.append(self._make_omp_weighted_prediction(atoms_binary, omp_class, self._models_parameters.normalize_weights))
num_class += 1
preds = np.array(preds).T
max_preds = np.argmax(preds, axis=1)
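In outline, the new predict path is a one-vs-all decision rule. The following standalone sketch assumes the per-class probabilities come out with shape (n_classes, n_samples, n_trees), as above, and uses plain omp.predict in place of _make_omp_weighted_prediction; the names are hypothetical.

import numpy as np

def one_vs_all_predict(per_class_tree_probas, omps, label_names):
    # per_class_tree_probas: (n_classes, n_samples, n_trees), values in [0, 1]
    scores = []
    for k, omp in enumerate(omps):
        atoms = (per_class_tree_probas[k] - 0.5) * 2  # map [0, 1] to [-1, +1]
        scores.append(omp.predict(atoms))             # real-valued class score
    scores = np.array(scores).T                       # (n_samples, n_classes)
    return np.array(label_names)[np.argmax(scores, axis=1)]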
......@@ -97,6 +116,27 @@ class OmpForestMulticlassClassifier(OmpForest):
return evaluation
@staticmethod
def _make_omp_weighted_prediction(base_predictions, omp_obj, normalize_weights=False):
if normalize_weights:
# we can normalize weights (by their sum) so that they sum to 1
# and they can be interpreted as impact percentages for interpretability.
# this requires removing the (-) sign from the weights, i.e. moving it to the predictions (use unsigned_coef) --> I don't see why
# question: I don't understand the nonzero part?
# predictions = self._omp.predict(forest_predictions) * (1 / (np.sum(self._omp.coef_) / len(np.nonzero(self._omp.coef_))))
coef_signs = np.sign(omp_obj.coef_)[np.newaxis, :]  # add an axis so broadcasting is row-wise (avoids ambiguity when base_predictions is square)
unsigned_coef = (coef_signs * omp_obj.coef_).squeeze()
intercept = omp_obj.intercept_
adjusted_forest_predictions = base_predictions * coef_signs
predictions = adjusted_forest_predictions.dot(unsigned_coef) + intercept
else:
predictions = omp_obj.predict(base_predictions)
return predictions
if __name__ == "__main__":
forest = RandomForestClassifier(n_estimators=10)
......
......@@ -60,7 +60,6 @@ def binarize_class_data(data, class_pos, inplace=True):
"""
if not inplace:
data = deepcopy(data)
position_class_labels = (data == class_pos)
data[~(position_class_labels)] = -1
data[(position_class_labels)] = +1
......
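For reference, the effect of binarize_class_data on a label vector (a usage sketch, assuming data is a NumPy array):

import numpy as np

labels = np.array([0, 2, 1, 2, 0])
# binarize_class_data(labels, class_pos=2, inplace=False) yields:
# entries equal to class_pos become +1, everything else becomes -1
binary = np.where(labels == 2, 1, -1)   # array([-1,  1, -1,  1, -1])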
......@@ -6,18 +6,15 @@
"normalize_D": false,
"dataset_normalizer": "standard",
"forest_size": null,
"extracted_forest_size_samples": 5,
"extracted_forest_size_stop": 0.05,
"extracted_forest_size_samples": 10,
"extracted_forest_size_stop": 0.4,
"models_dir": "models/boston/stage1",
"dev_size": 0.2,
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
1,
2,
3,
4,
5
2078,
90
],
"subsets_used": "train,dev",
"normalize_weights": false,
......@@ -30,10 +27,15 @@
"job_number": -1,
"extraction_strategy": "none",
"extracted_forest_size": [
8,
17,
25,
33,
42
36,
73,
109,
145,
182,
218,
255,
291,
327,
364
]
}
\ No newline at end of file
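The new extracted_forest_size list is consistent with taking 10 evenly spaced fractions of the full forest size up to extracted_forest_size_stop = 0.4 and rounding (a reconstruction; the base size of 909 trees is an assumption, since forest_size is null in this file):

import numpy as np

forest_size = 909                      # assumed full forest size (not stored here)
stop, samples = 0.4, 10
fractions = np.linspace(stop / samples, stop, samples)   # 0.04, 0.08, ..., 0.40
sizes = np.rint(forest_size * fractions).astype(int)
# -> [ 36  73 109 145 182 218 255 291 327 364], matching the list above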
......@@ -13,11 +13,9 @@
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
1,
2,
3,
4,
5
58,
43535,
234234
],
"subsets_used": "train,dev",
"normalize_weights": false,
......
......@@ -13,11 +13,9 @@
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
1,
2,
3,
4,
5
58,
43535,
234234
],
"subsets_used": "train,dev",
"normalize_weights": false,
......
......@@ -13,11 +13,9 @@
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
1,
2,
3,
4,
5
58,
43535,
234234
],
"subsets_used": "train,dev",
"normalize_weights": false,
......
......@@ -13,11 +13,9 @@
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
1,
2,
3,
4,
5
58,
43535,
234234
],
"subsets_used": "train,dev",
"normalize_weights": false,
......
......@@ -13,11 +13,9 @@
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
1,
2,
3,
4,
5
58,
43535,
234234
],
"subsets_used": "train,dev",
"normalize_weights": false,
......
......@@ -13,11 +13,9 @@
"test_size": 0.2,
"random_seed_number": 1,
"seeds": [
1,
2,
3,
4,
5
58,
43535,
234234
],
"subsets_used": "train,dev",
"normalize_weights": false,
......
results/iris/stage1/losses.png (replaced: 35.2 KiB → 64.7 KiB)
results/iris/stage2/losses.png (30.5 KiB)
results/iris/stage3/losses.png (30.6 KiB)
results/iris/stage4/losses.png (30.7 KiB)
python code/compute_results.py --stage=3 --experiment_ids 1 2 3 --dataset_name=california_housing --models_dir=models/california_housing/stage3
python code/compute_results.py --stage=3 --experiment_ids 1 2 3 --dataset_name=boston --models_dir=models/boston/stage3
python code/compute_results.py --stage=3 --experiment_ids 1 2 3 --dataset_name=iris --models_dir=models/iris/stage3
python code/compute_results.py --stage=1 --experiment_ids 1 2 3 4 5 6 --dataset_name=iris --models_dir=models/iris/stage1
python code/compute_results.py --stage=3 --experiment_ids 1 2 3 --dataset_name=diabetes --models_dir=models/diabetes/stage3
python code/compute_results.py --stage=3 --experiment_ids 1 2 3 --dataset_name=digits --models_dir=models/digits/stage3
python code/compute_results.py --stage=3 --experiment_ids 1 2 3 --dataset_name=linnerud --models_dir=models/linnerud/stage3
......
#!/bin/bash
core_number=10
core_number=5
walltime=1:00
seeds='1 2 3'
seeds='58 43535 234234'
for dataset in diamonds
for dataset in iris
do
oarsub -p "(gpu is null)" -l /core=$core_number,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=none --save_experiment_configuration 1 none_with_params --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=10 --experiment_id=1 --models_dir=models/$dataset/stage1"
oarsub -p "(gpu is null)" -l /core=$core_number,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=random --save_experiment_configuration 1 random_with_params --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=10 --experiment_id=2 --models_dir=models/$dataset/stage1"
oarsub -p "(gpu is null)" -l /core=$core_number,walltime=5:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds 5 --save_experiment_configuration 1 omp_with_params --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=10 --experiment_id=3 --models_dir=models/$dataset/stage1"
oarsub -p "(gpu is null)" -l /core=$core_number,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=none --skip_best_hyperparams --save_experiment_configuration 1 none_wo_params --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=10 --experiment_id=4 --models_dir=models/$dataset/stage1"
oarsub -p "(gpu is null)" -l /core=$core_number,walltime=1:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=random --skip_best_hyperparams --save_experiment_configuration 1 random_wo_params --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=10 --experiment_id=5 --models_dir=models/$dataset/stage1"
oarsub -p "(gpu is null)" -l /core=$core_number,walltime=5:00 "conda activate test_env && python code/train.py --dataset_name=$dataset --seeds $seeds --skip_best_hyperparams --save_experiment_configuration 1 omp_wo_params --extracted_forest_size_stop=0.40 --extracted_forest_size_samples=10 --experiment_id=6 --models_dir=models/$dataset/stage1"
python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=none --save_experiment_configuration 1 none_with_params --extracted_forest_size_stop=0.05 --extracted_forest_size_samples=5 --experiment_id=1 --models_dir=models/$dataset/stage1
python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=random --save_experiment_configuration 1 random_with_params --extracted_forest_size_stop=0.05 --extracted_forest_size_samples=5 --experiment_id=2 --models_dir=models/$dataset/stage1
python code/train.py --dataset_name=$dataset --seeds $seeds --save_experiment_configuration 1 omp_with_params --extracted_forest_size_stop=0.05 --extracted_forest_size_samples=5 --experiment_id=3 --models_dir=models/$dataset/stage1
python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=none --skip_best_hyperparams --save_experiment_configuration 1 none_wo_params --extracted_forest_size_stop=0.05 --extracted_forest_size_samples=5 --experiment_id=4 --models_dir=models/$dataset/stage1
python code/train.py --dataset_name=$dataset --seeds $seeds --extraction_strategy=random --skip_best_hyperparams --save_experiment_configuration 1 random_wo_params --extracted_forest_size_stop=0.05 --extracted_forest_size_samples=5 --experiment_id=5 --models_dir=models/$dataset/stage1
python code/train.py --dataset_name=$dataset --seeds $seeds --skip_best_hyperparams --save_experiment_configuration 1 omp_wo_params --extracted_forest_size_stop=0.05 --extracted_forest_size_samples=5 --experiment_id=6 --models_dir=models/$dataset/stage1
done