From acc0dd2358831f698dd26610710f18559aa74a70 Mon Sep 17 00:00:00 2001
From: Baptiste Bauvin <baptiste.bauvin@lis-lab.fr>
Date: Wed, 22 Jun 2022 07:37:52 -0400
Subject: [PATCH] Add specificity metric, export ROC curves, and fix classifier/dataset handling

---
 summit/multiview_platform/exec_classif.py     |  6 ++--
 .../metrics/specificity_score.py              | 23 +++++++++++++
 .../monoview/exec_classif_mono_view.py        |  8 +++++
 .../monoview_classifiers/imbalance_bagging.py | 12 ++++++-
 .../monoview_classifiers/samba.py             | 15 ++++++---
 .../monoview_classifiers/scm_bagging_mincq.py |  6 ++--
 .../result_analysis/feature_importances.py    |  3 +-
 summit/multiview_platform/utils/base.py       | 32 +++++++++++--------
 summit/multiview_platform/utils/dataset.py    | 14 +++++---
 9 files changed, 89 insertions(+), 30 deletions(-)
 create mode 100644 summit/multiview_platform/metrics/specificity_score.py

diff --git a/summit/multiview_platform/exec_classif.py b/summit/multiview_platform/exec_classif.py
index f742a7fd..11697f4b 100644
--- a/summit/multiview_platform/exec_classif.py
+++ b/summit/multiview_platform/exec_classif.py
@@ -640,9 +640,9 @@ def exec_classif(arguments):  # pragma: no cover
         k_folds = execution.gen_k_folds(stats_iter, args["nb_folds"],
                                         stats_iter_random_states)
 
-        dataset_files = dataset.init_multiple_datasets(args["pathf"],
-                                                       args["name"],
-                                                       nb_cores)
+        # dataset_files = dataset.init_multiple_datasets(args["pathf"],
+        #                                                args["name"],
+        #                                                nb_cores)
 
         views, views_indices, all_views = execution.init_views(dataset_var,
                                                                args[
diff --git a/summit/multiview_platform/metrics/specificity_score.py b/summit/multiview_platform/metrics/specificity_score.py
new file mode 100644
index 00000000..d9f6f585
--- /dev/null
+++ b/summit/multiview_platform/metrics/specificity_score.py
@@ -0,0 +1,23 @@
+from sklearn.metrics import make_scorer
+from sklearn.metrics import confusion_matrix as metric
+
+# Author-Info
+__author__ = "Baptiste Bauvin"
+__status__ = "Prototype"  # Production, Development, Prototype
+
+
+def score(y_true, y_pred, **kwargs):
+    score = metric(y_true, y_pred, **kwargs)
+    if score[0,0]+score[0,1] !=0:
+        return score[0,0]/(score[0,0]+score[0,1])
+    else:
+        return 0
+
+
+def get_scorer(**kwargs):
+    return make_scorer(score, greater_is_better=True, **kwargs)
+
+
+def get_config(**kwargs):
+    configString = "Specificity score (higher is better)".format(kwargs)
+    return configString
\ No newline at end of file
diff --git a/summit/multiview_platform/monoview/exec_classif_mono_view.py b/summit/multiview_platform/monoview/exec_classif_mono_view.py
index 33b5a32c..44ef75b2 100644
--- a/summit/multiview_platform/monoview/exec_classif_mono_view.py
+++ b/summit/multiview_platform/monoview/exec_classif_mono_view.py
@@ -120,6 +120,14 @@ def exec_monoview(directory, X, Y, database_name, labels_names,
     test_pred = classifier.predict(X_test)
     pred_duration = time.monotonic() - pred_beg
 
+    ### ROC CURVE ADDITION ###
+    from sklearn.metrics import roc_curve
+    fpr, tpr, _ = roc_curve(y_test, classifier.predict_proba(X_test)[:, 1])
+    np.savetxt(os.path.join(directory, classifier_class_name+"-fpr.npy"), fpr)
+    np.savetxt(os.path.join(directory, classifier_class_name + "-tpr.npy"), tpr)
+    ### END ROC ###
+
+
     # Filling the full prediction in the right order
     full_pred = np.zeros(Y.shape, dtype=int) - 100
     for train_index, index in enumerate(classification_indices[0]):
diff --git a/summit/multiview_platform/monoview_classifiers/imbalance_bagging.py b/summit/multiview_platform/monoview_classifiers/imbalance_bagging.py
index c4340420..9dfa2e26 100644
--- a/summit/multiview_platform/monoview_classifiers/imbalance_bagging.py
+++ b/summit/multiview_platform/monoview_classifiers/imbalance_bagging.py
@@ -1,5 +1,6 @@
 from imblearn.ensemble import BalancedBaggingClassifier
-from sklearn.tree import DecisionTreeClassifier
+import numpy as np
+
 
 from ..monoview.monoview_utils import BaseMonoviewClassifier
 from ..utils.base import base_boosting_estimators
@@ -27,5 +28,14 @@ class ImbalanceBagging(BaseMonoviewClassifier, BalancedBaggingClassifier):
         self.weird_strings = {"base_estimator": "class_name"}
         self.base_estimator_config = base_estimator_config
 
+    def fit(self, X, y):
+        BalancedBaggingClassifier.fit(self, X, y)
+        self.feature_importances_ = np.zeros(X.shape[1])
+        for estim in self.estimators_:
+            if hasattr(estim['classifier'], 'feature_importances_'):
+                self.feature_importances_ += estim['classifier'].feature_importances_
+        self.feature_importances_ /= np.sum(self.feature_importances_)
+        return self
+
 
 
diff --git a/summit/multiview_platform/monoview_classifiers/samba.py b/summit/multiview_platform/monoview_classifiers/samba.py
index 97094c6d..a69de937 100644
--- a/summit/multiview_platform/monoview_classifiers/samba.py
+++ b/summit/multiview_platform/monoview_classifiers/samba.py
@@ -33,6 +33,7 @@ class SamBAClf(NeighborHoodClassifier, BaseMonoviewClassifier):
                  pred_train=False,
                  forced_diversity=False,
                  normalize_dists=False,
+                 class_weight="balanced",
                  **kwargs):
         """
 
@@ -54,18 +55,22 @@ class SamBAClf(NeighborHoodClassifier, BaseMonoviewClassifier):
                  normalizer=normalizer,
                  forced_diversity=forced_diversity,
                  b=b, a=a, pred_train=pred_train,
-                                       normalize_dists=normalize_dists)
-        self.param_names = ["n_estimators", "relevance", "distance",
+                                       normalize_dists=normalize_dists,
+                                       class_weight=class_weight)
+        self.param_names = ["n_estimators",
+                            "relevance",
+                            "distance",
                             "train_weighting", "b", "pred_train", "normalizer",
-                            "normalize_dists", "a"]
+                            "normalize_dists", "a", "class_weight"]
         self.distribs = [CustomRandint(low=1, high=70),
                          [ExpRelevance()],
-                         [EuclidianDist(), PolarDist(), ExpEuclidianDist()],
+                         [EuclidianDist(), PolarDist(), ExpEuclidianDist(), Jaccard()],
                          [ExpTrainWeighting()],
                          CustomUniform(0.1, 6,),
                          [True, False],
                          [RobustScaler()],
-                         [True], CustomRandint(0, 10, 'e-')]
+                         [True], CustomRandint(0, 10, 'e-'),
+                         ["balanced", None]]
         self.classed_params = []
         self.weird_strings = {}
 
diff --git a/summit/multiview_platform/monoview_classifiers/scm_bagging_mincq.py b/summit/multiview_platform/monoview_classifiers/scm_bagging_mincq.py
index b4b38d60..32892dd1 100644
--- a/summit/multiview_platform/monoview_classifiers/scm_bagging_mincq.py
+++ b/summit/multiview_platform/monoview_classifiers/scm_bagging_mincq.py
@@ -1,4 +1,5 @@
-from scm_bagging.scm_bagging_classifier import ScmBaggingClassifier
+from randomscm.randomscm import RandomScmClassifier
+
 
 
 from ..monoview.monoview_utils import BaseMonoviewClassifier
@@ -16,7 +17,8 @@ from six import iteritems
 MAX_INT = np.iinfo(np.int32).max
 
 
-class ScmBaggingMinCq(ScmBaggingClassifier, BaseMonoviewClassifier):
+class ScmBaggingMinCq(RandomScmClassifier, BaseMonoviewClassifier):
+
     """A Bagging classifier. for SetCoveringMachineClassifier()
     The base estimators are built on subsets of both samples
     and features.
diff --git a/summit/multiview_platform/result_analysis/feature_importances.py b/summit/multiview_platform/result_analysis/feature_importances.py
index 36c0eb35..042e4c0d 100644
--- a/summit/multiview_platform/result_analysis/feature_importances.py
+++ b/summit/multiview_platform/result_analysis/feature_importances.py
@@ -140,7 +140,8 @@ def plot_feature_relevance(file_name, feature_importance,
             for score in score_df.columns:
                 if len(score.split("-"))>1:
                     algo, view = score.split("-")
-                    feature_importance[algo].loc[[ind for ind in feature_importance.index if ind.startswith(view)]]*=score_df[score]['test']
+                    list_ind = [ind for ind in feature_importance.index if ind.startswith(view)]
+                    feature_importance[algo].loc[list_ind]*=2*(score_df[score]['test']-0.5)
                 else:
                     feature_importance[score] *= score_df[score]['test']
     file_name+="_relevance"
diff --git a/summit/multiview_platform/utils/base.py b/summit/multiview_platform/utils/base.py
index 67df47a6..a32afbd0 100644
--- a/summit/multiview_platform/utils/base.py
+++ b/summit/multiview_platform/utils/base.py
@@ -253,21 +253,27 @@ class ResultAnalyser():
             metric_module = getattr(metrics, metric)
         else:
             metric_module = getattr(metrics, metric[:-1])
+
         class_train_scores = []
         class_test_scores = []
-        for label_value in np.unique(self.labels):
-            train_sample_indices = self.train_indices[
-                np.where(self.labels[self.train_indices] == label_value)[0]]
-            test_sample_indices = self.test_indices[
-                np.where(self.labels[self.test_indices] == label_value)[0]]
-            class_train_scores.append(
-                metric_module.score(y_true=self.labels[train_sample_indices],
-                                    y_pred=self.pred[train_sample_indices],
-                                    **metric_kwargs))
-            class_test_scores.append(
-                metric_module.score(y_true=self.labels[test_sample_indices],
-                                    y_pred=self.pred[test_sample_indices],
-                                    **metric_kwargs))
+        if metric not in ["roc_auc_score", "specificity_score"]:
+            for label_value in np.unique(self.labels):
+                train_sample_indices = self.train_indices[
+                    np.where(self.labels[self.train_indices] == label_value)[0]]
+                test_sample_indices = self.test_indices[
+                    np.where(self.labels[self.test_indices] == label_value)[0]]
+                class_train_scores.append(
+                    metric_module.score(y_true=self.labels[train_sample_indices],
+                                        y_pred=self.pred[train_sample_indices],
+                                        **metric_kwargs))
+                class_test_scores.append(
+                    metric_module.score(y_true=self.labels[test_sample_indices],
+                                        y_pred=self.pred[test_sample_indices],
+                                        **metric_kwargs))
+        else:
+            for _ in np.unique(self.labels):
+                class_train_scores.append(0)
+                class_test_scores.append(0)
         train_score = metric_module.score(
             y_true=self.labels[self.train_indices],
             y_pred=self.pred[self.train_indices],
diff --git a/summit/multiview_platform/utils/dataset.py b/summit/multiview_platform/utils/dataset.py
index 15175976..600a0661 100644
--- a/summit/multiview_platform/utils/dataset.py
+++ b/summit/multiview_platform/utils/dataset.py
@@ -458,11 +458,11 @@ class HDF5Dataset(Dataset):
         for view_index in range(self.nb_view):
             if "feature_ids-View{}".format(view_index) in self.dataset["Metadata"].keys():
                 self.feature_ids[view_index] = [feature_id.decode()
-                                   if not is_just_number(feature_id.decode())
-                                   else "ID_" + feature_id.decode()
-                                       for feature_id in self.dataset["Metadata"]["feature_ids-View{}".format(view_index)]]
+                                                if not is_just_number(feature_id.decode())
+                                                else "ID_" + feature_id.decode()
+                                                for feature_id in self.dataset["Metadata"]["feature_ids-View{}".format(view_index)]]
             else:
-               self.gen_feat_id(view_index)
+                self.gen_feat_id(view_index)
 
     def get_nb_samples(self):
         """
@@ -503,7 +503,7 @@ class HDF5Dataset(Dataset):
             seleted labels' names
         """
         selected_labels = self.get_labels(sample_indices)
-        if decode:
+        if type(self.dataset["Labels"].attrs["names"][0]) == bytes:
             return [label_name.decode("utf-8")
                     for label, label_name in
                     enumerate(self.dataset["Labels"].attrs["names"])
@@ -619,10 +619,14 @@ class HDF5Dataset(Dataset):
         view_names = self.init_view_names(view_names)
         new_dataset_file["Metadata"].attrs["nbView"] = len(view_names)
         for new_index, view_name in enumerate(view_names):
+            del new_dataset_file["Metadata"]["feature_ids-View{}".format(new_index)]
+            new_dataset_file["Metadata"]["feature_ids-View{}".format(new_index)] = new_dataset_file["Metadata"]["feature_ids-View{}".format(self.view_dict[view_name])]
+            del new_dataset_file["Metadata"]["feature_ids-View{}".format(self.view_dict[view_name])]
             self.copy_view(target_dataset=new_dataset_file,
                            source_view_name=view_name,
                            target_view_index=new_index,
                            sample_indices=sample_indices)
+
         new_dataset_file.close()
         self.update_hdf5_dataset(dataset_file_path)
 
-- 
GitLab