From acc0dd2358831f698dd26610710f18559aa74a70 Mon Sep 17 00:00:00 2001 From: Baptiste Bauvin <baptiste.bauvin@lis-lab.fr> Date: Wed, 22 Jun 2022 07:37:52 -0400 Subject: [PATCH] Cuisine --- summit/multiview_platform/exec_classif.py | 6 ++-- .../metrics/specificity_score.py | 23 +++++++++++++ .../monoview/exec_classif_mono_view.py | 8 +++++ .../monoview_classifiers/imbalance_bagging.py | 12 ++++++- .../monoview_classifiers/samba.py | 15 ++++++--- .../monoview_classifiers/scm_bagging_mincq.py | 6 ++-- .../result_analysis/feature_importances.py | 3 +- summit/multiview_platform/utils/base.py | 32 +++++++++++-------- summit/multiview_platform/utils/dataset.py | 14 +++++--- 9 files changed, 89 insertions(+), 30 deletions(-) create mode 100644 summit/multiview_platform/metrics/specificity_score.py diff --git a/summit/multiview_platform/exec_classif.py b/summit/multiview_platform/exec_classif.py index f742a7fd..11697f4b 100644 --- a/summit/multiview_platform/exec_classif.py +++ b/summit/multiview_platform/exec_classif.py @@ -640,9 +640,9 @@ def exec_classif(arguments): # pragma: no cover k_folds = execution.gen_k_folds(stats_iter, args["nb_folds"], stats_iter_random_states) - dataset_files = dataset.init_multiple_datasets(args["pathf"], - args["name"], - nb_cores) + # dataset_files = dataset.init_multiple_datasets(args["pathf"], + # args["name"], + # nb_cores) views, views_indices, all_views = execution.init_views(dataset_var, args[ diff --git a/summit/multiview_platform/metrics/specificity_score.py b/summit/multiview_platform/metrics/specificity_score.py new file mode 100644 index 00000000..d9f6f585 --- /dev/null +++ b/summit/multiview_platform/metrics/specificity_score.py @@ -0,0 +1,23 @@ +from sklearn.metrics import make_scorer +from sklearn.metrics import confusion_matrix as metric + +# Author-Info +__author__ = "Baptiste Bauvin" +__status__ = "Prototype" # Production, Development, Prototype + + +def score(y_true, y_pred, **kwargs): + score = metric(y_true, y_pred, **kwargs) + if score[0,0]+score[0,1] !=0: + return score[0,0]/(score[0,0]+score[0,1]) + else: + return 0 + + +def get_scorer(**kwargs): + return make_scorer(score, greater_is_better=True, **kwargs) + + +def get_config(**kwargs): + configString = "Specificity score (higher is better)".format(kwargs) + return configString \ No newline at end of file diff --git a/summit/multiview_platform/monoview/exec_classif_mono_view.py b/summit/multiview_platform/monoview/exec_classif_mono_view.py index 33b5a32c..44ef75b2 100644 --- a/summit/multiview_platform/monoview/exec_classif_mono_view.py +++ b/summit/multiview_platform/monoview/exec_classif_mono_view.py @@ -120,6 +120,14 @@ def exec_monoview(directory, X, Y, database_name, labels_names, test_pred = classifier.predict(X_test) pred_duration = time.monotonic() - pred_beg + #### ROC CURVE ADDITION ### + from sklearn.metrics import roc_curve + fpr, tpr, _ = roc_curve(y_test, classifier.predict_proba(X_test)[:, 1]) + np.savetxt(os.path.join(directory, classifier_class_name+"-fpr.npy"), fpr) + np.savetxt(os.path.join(directory, classifier_class_name + "-tpr.npy"), tpr) + ### END ROC ### + + # Filling the full prediction in the right order full_pred = np.zeros(Y.shape, dtype=int) - 100 for train_index, index in enumerate(classification_indices[0]): diff --git a/summit/multiview_platform/monoview_classifiers/imbalance_bagging.py b/summit/multiview_platform/monoview_classifiers/imbalance_bagging.py index c4340420..9dfa2e26 100644 --- a/summit/multiview_platform/monoview_classifiers/imbalance_bagging.py +++ b/summit/multiview_platform/monoview_classifiers/imbalance_bagging.py @@ -1,5 +1,6 @@ from imblearn.ensemble import BalancedBaggingClassifier -from sklearn.tree import DecisionTreeClassifier +import numpy as np + from ..monoview.monoview_utils import BaseMonoviewClassifier from ..utils.base import base_boosting_estimators @@ -27,5 +28,14 @@ class ImbalanceBagging(BaseMonoviewClassifier, BalancedBaggingClassifier): self.weird_strings = {"base_estimator": "class_name"} self.base_estimator_config = base_estimator_config + def fit(self, X, y): + BalancedBaggingClassifier.fit(self, X, y) + self.feature_importances_ = np.zeros(X.shape[1]) + for estim in self.estimators_: + if hasattr(estim['classifier'], 'feature_importances_'): + self.feature_importances_ += estim['classifier'].feature_importances_ + self.feature_importances_ /= np.sum(self.feature_importances_) + return self + diff --git a/summit/multiview_platform/monoview_classifiers/samba.py b/summit/multiview_platform/monoview_classifiers/samba.py index 97094c6d..a69de937 100644 --- a/summit/multiview_platform/monoview_classifiers/samba.py +++ b/summit/multiview_platform/monoview_classifiers/samba.py @@ -33,6 +33,7 @@ class SamBAClf(NeighborHoodClassifier, BaseMonoviewClassifier): pred_train=False, forced_diversity=False, normalize_dists=False, + class_weight="balanced", **kwargs): """ @@ -54,18 +55,22 @@ class SamBAClf(NeighborHoodClassifier, BaseMonoviewClassifier): normalizer=normalizer, forced_diversity=forced_diversity, b=b, a=a, pred_train=pred_train, - normalize_dists=normalize_dists) - self.param_names = ["n_estimators", "relevance", "distance", + normalize_dists=normalize_dists, + class_weight=class_weight) + self.param_names = ["n_estimators", + "relevance", + "distance", "train_weighting", "b", "pred_train", "normalizer", - "normalize_dists", "a"] + "normalize_dists", "a", "class_weight"] self.distribs = [CustomRandint(low=1, high=70), [ExpRelevance()], - [EuclidianDist(), PolarDist(), ExpEuclidianDist()], + [EuclidianDist(), PolarDist(), ExpEuclidianDist(), Jaccard()], [ExpTrainWeighting()], CustomUniform(0.1, 6,), [True, False], [RobustScaler()], - [True], CustomRandint(0, 10, 'e-')] + [True], CustomRandint(0, 10, 'e-'), + ["balanced", None]] self.classed_params = [] self.weird_strings = {} diff --git a/summit/multiview_platform/monoview_classifiers/scm_bagging_mincq.py b/summit/multiview_platform/monoview_classifiers/scm_bagging_mincq.py index b4b38d60..32892dd1 100644 --- a/summit/multiview_platform/monoview_classifiers/scm_bagging_mincq.py +++ b/summit/multiview_platform/monoview_classifiers/scm_bagging_mincq.py @@ -1,4 +1,5 @@ -from scm_bagging.scm_bagging_classifier import ScmBaggingClassifier +from randomscm.randomscm import RandomScmClassifier + from ..monoview.monoview_utils import BaseMonoviewClassifier @@ -16,7 +17,8 @@ from six import iteritems MAX_INT = np.iinfo(np.int32).max -class ScmBaggingMinCq(ScmBaggingClassifier, BaseMonoviewClassifier): +class ScmBaggingMinCq(RandomScmClassifier, BaseMonoviewClassifier): + """A Bagging classifier. for SetCoveringMachineClassifier() The base estimators are built on subsets of both samples and features. diff --git a/summit/multiview_platform/result_analysis/feature_importances.py b/summit/multiview_platform/result_analysis/feature_importances.py index 36c0eb35..042e4c0d 100644 --- a/summit/multiview_platform/result_analysis/feature_importances.py +++ b/summit/multiview_platform/result_analysis/feature_importances.py @@ -140,7 +140,8 @@ def plot_feature_relevance(file_name, feature_importance, for score in score_df.columns: if len(score.split("-"))>1: algo, view = score.split("-") - feature_importance[algo].loc[[ind for ind in feature_importance.index if ind.startswith(view)]]*=score_df[score]['test'] + list_ind = [ind for ind in feature_importance.index if ind.startswith(view)] + feature_importance[algo].loc[list_ind]*=2*(score_df[score]['test']-0.5) else: feature_importance[score] *= score_df[score]['test'] file_name+="_relevance" diff --git a/summit/multiview_platform/utils/base.py b/summit/multiview_platform/utils/base.py index 67df47a6..a32afbd0 100644 --- a/summit/multiview_platform/utils/base.py +++ b/summit/multiview_platform/utils/base.py @@ -253,21 +253,27 @@ class ResultAnalyser(): metric_module = getattr(metrics, metric) else: metric_module = getattr(metrics, metric[:-1]) + class_train_scores = [] class_test_scores = [] - for label_value in np.unique(self.labels): - train_sample_indices = self.train_indices[ - np.where(self.labels[self.train_indices] == label_value)[0]] - test_sample_indices = self.test_indices[ - np.where(self.labels[self.test_indices] == label_value)[0]] - class_train_scores.append( - metric_module.score(y_true=self.labels[train_sample_indices], - y_pred=self.pred[train_sample_indices], - **metric_kwargs)) - class_test_scores.append( - metric_module.score(y_true=self.labels[test_sample_indices], - y_pred=self.pred[test_sample_indices], - **metric_kwargs)) + if metric not in ["roc_auc_score", "specificity_score"]: + for label_value in np.unique(self.labels): + train_sample_indices = self.train_indices[ + np.where(self.labels[self.train_indices] == label_value)[0]] + test_sample_indices = self.test_indices[ + np.where(self.labels[self.test_indices] == label_value)[0]] + class_train_scores.append( + metric_module.score(y_true=self.labels[train_sample_indices], + y_pred=self.pred[train_sample_indices], + **metric_kwargs)) + class_test_scores.append( + metric_module.score(y_true=self.labels[test_sample_indices], + y_pred=self.pred[test_sample_indices], + **metric_kwargs)) + else: + for _ in np.unique(self.labels): + class_train_scores.append(0) + class_test_scores.append(0) train_score = metric_module.score( y_true=self.labels[self.train_indices], y_pred=self.pred[self.train_indices], diff --git a/summit/multiview_platform/utils/dataset.py b/summit/multiview_platform/utils/dataset.py index 15175976..600a0661 100644 --- a/summit/multiview_platform/utils/dataset.py +++ b/summit/multiview_platform/utils/dataset.py @@ -458,11 +458,11 @@ class HDF5Dataset(Dataset): for view_index in range(self.nb_view): if "feature_ids-View{}".format(view_index) in self.dataset["Metadata"].keys(): self.feature_ids[view_index] = [feature_id.decode() - if not is_just_number(feature_id.decode()) - else "ID_" + feature_id.decode() - for feature_id in self.dataset["Metadata"]["feature_ids-View{}".format(view_index)]] + if not is_just_number(feature_id.decode()) + else "ID_" + feature_id.decode() + for feature_id in self.dataset["Metadata"]["feature_ids-View{}".format(view_index)]] else: - self.gen_feat_id(view_index) + self.gen_feat_id(view_index) def get_nb_samples(self): """ @@ -503,7 +503,7 @@ class HDF5Dataset(Dataset): seleted labels' names """ selected_labels = self.get_labels(sample_indices) - if decode: + if type(self.dataset["Labels"].attrs["names"][0]) == bytes: return [label_name.decode("utf-8") for label, label_name in enumerate(self.dataset["Labels"].attrs["names"]) @@ -619,10 +619,14 @@ class HDF5Dataset(Dataset): view_names = self.init_view_names(view_names) new_dataset_file["Metadata"].attrs["nbView"] = len(view_names) for new_index, view_name in enumerate(view_names): + del new_dataset_file["Metadata"]["feature_ids-View{}".format(new_index)] + new_dataset_file["Metadata"]["feature_ids-View{}".format(new_index)] = new_dataset_file["Metadata"]["feature_ids-View{}".format(self.view_dict[view_name])] + del new_dataset_file["Metadata"]["feature_ids-View{}".format(self.view_dict[view_name])] self.copy_view(target_dataset=new_dataset_file, source_view_name=view_name, target_view_index=new_index, sample_indices=sample_indices) + new_dataset_file.close() self.update_hdf5_dataset(dataset_file_path) -- GitLab