From e5c042815ab517750d438a66798abe2e268e1e5e Mon Sep 17 00:00:00 2001 From: Baptiste Bauvin <baptiste.bauvin@lis-lab.fr> Date: Thu, 28 Oct 2021 07:57:48 -0400 Subject: [PATCH] added some files --- config_files/config_cuisine.yml | 88 ++++++++++++++----- config_files/config_private_algos.yml | 26 +++--- setup.py | 1 + summit/multiview_platform/exec_classif.py | 6 +- .../metrics/balanced_accuracy.py | 41 +++++++++ .../monoview_classifiers/adaboost.py | 2 +- .../monoview_classifiers/adaboost_pregen.py | 1 + .../monoview_classifiers/samba.py | 70 +++++++++++++++ .../early_fusion_samba.py | 40 +++++++++ .../multiview_classifiers/mumbo.py | 7 ++ 10 files changed, 246 insertions(+), 36 deletions(-) create mode 100644 summit/multiview_platform/metrics/balanced_accuracy.py create mode 100644 summit/multiview_platform/monoview_classifiers/samba.py create mode 100644 summit/multiview_platform/multiview_classifiers/early_fusion_samba.py diff --git a/config_files/config_cuisine.yml b/config_files/config_cuisine.yml index 5a916146..9206cf2f 100644 --- a/config_files/config_cuisine.yml +++ b/config_files/config_cuisine.yml @@ -1,10 +1,10 @@ # The base configuration of the benchmark log: True -name: ["ionosphere", "abalone", "australian", "balance", "bupa", "cylinder", "hepatitis", "pima", "yeast", "zoo"] -label: "comp_1" +name: ['tnbc_mazid'] +label: "" file_type: ".hdf5" views: -pathf: "/home/baptiste/Documents/Datasets/UCI/both/" +pathf: "/home/baptiste/Documents/Datasets/Mazid/" nice: 0 random_state: 42 nb_cores: 1 @@ -13,34 +13,53 @@ debug: True add_noise: False noise_std: 0.0 res_dir: "../results/" -track_tracebacks: False +track_tracebacks: True # All the classification-realted configuration options multiclass_method: "oneVersusOne" -split: 0.50 +split: 0.30 nb_folds: 5 nb_class: 2 classes: -type: ["monoview",] -algos_monoview: ["cb_boost", "self_opt_cb", "adaboost", "cq_boost", "min_cq", "adaboost_pregen", "self_opt_cb_pseudo", "self_opt_cb_root"] -algos_multiview: ["mv_cb_boost","early_fusion_dt", "early_fusion_cb", "early_fusion_rf","mumbo", "early_fusion_svm" ] +type: ["monoview","multiview"] +algos_monoview: ["samba", "scm_bagging", "random_forest", "adaboost", 'scm'] +algos_multiview: ["early_fusion_adaboost", "early_fusion_decision_tree", "early_fusion_random_forest", "early_fusion_samba"] stats_iter: 5 metrics: - accuracy_score: {} + balanced_accuracy: {} f1_score: average: 'micro' -metric_princ: "accuracy_score" -hps_type: "None" + accuracy_score: {} +metric_princ: "balanced_accuracy" +hps_type: "Random" hps_args: - n_iter: 30 - equivalent_draws: True + n_iter: 20 + equivalent_draws: False svm_rbf: C: 0.7 +scm_bagging: + {max_features: 0.908115713423863, max_rules: 9, max_samples: 0.9277949143533335, model_type: conjunction, + n_estimators: 109, p_options: 0.7823433255515356} + +samba: + n_estimators: 22 + +adaboost: + {base_estimator: DecisionTreeClassifier, base_estimator__ccp_alpha: 0.0, base_estimator__class_weight: null, + base_estimator__criterion: gini, base_estimator__max_depth: 5, base_estimator__max_features: null, + base_estimator__max_leaf_nodes: null, base_estimator__min_impurity_decrease: 0.0, + base_estimator__min_impurity_split: null, base_estimator__min_samples_leaf: 1, base_estimator__min_samples_split: 2, + base_estimator__min_weight_fraction_leaf: 0.0, base_estimator__random_state: null, + base_estimator__splitter: best, n_estimators: 354} + +svm_linear: + C: 0.3867 + cb_boost: n_stumps: 1 - n_max_iterations: 10 + n_max_iterations: 20 estimators_generator: "Stumps" cq_boost: @@ -50,15 +69,42 @@ cq_boost: min_cq: n_stumps_per_attribute: 1 -adaboost: - n_estimators: 10 +decision_tree: + {criterion: entropy, max_depth: 271, splitter: random} -adaboost_pregen: - n_estimators: 10 - n_stumps: 1 +early_fusion_adaboost: + {base_estimator: DecisionTreeClassifier, base_estimator__ccp_alpha: 0.0, base_estimator__class_weight: null, + base_estimator__criterion: gini, base_estimator__max_depth: 5, base_estimator__max_features: null, + base_estimator__max_leaf_nodes: null, base_estimator__min_impurity_decrease: 0.0, + base_estimator__min_impurity_split: null, base_estimator__min_samples_leaf: 1, base_estimator__min_samples_split: 2, + base_estimator__min_weight_fraction_leaf: 0.0, base_estimator__random_state: null, + base_estimator__splitter: best, base_estimator_config: null, n_estimators: 273} -decision_tree: - max_depth: 2 +early_fusion_decision_tree: + {criterion: entropy, max_depth: 293, splitter: random} + +early_fusion_random_forest: + {criterion: gini, max_depth: 8, n_estimators: 46} + +random_forest: + {criterion: gini, max_depth: 8, n_estimators: 32} + +weighted_linear_late_fusion: + classifier_configs: + - decision_tree: {criterion: entropy, max_depth: 112, splitter: random} + - adaboost: {base_estimator: DecisionTreeClassifier, base_estimator__ccp_alpha: 0.0, + base_estimator__class_weight: null, base_estimator__criterion: gini, base_estimator__max_depth: 2, + base_estimator__max_features: null, base_estimator__max_leaf_nodes: null, base_estimator__min_impurity_decrease: 0.0, + base_estimator__min_impurity_split: null, base_estimator__min_samples_leaf: 1, + base_estimator__min_samples_split: 2, base_estimator__min_weight_fraction_leaf: 0.0, + base_estimator__random_state: null, base_estimator__splitter: best, n_estimators: 400} + classifiers_names: [decision_tree, adaboost] + nb_cores: 1 + rs: 724 + weights: [0.9636627605010293, 0.3834415188257777] + +scm: + {max_rules: 10, model_type: conjunction, p: 0.8310271995093625} mumbo: base_estimator: diff --git a/config_files/config_private_algos.yml b/config_files/config_private_algos.yml index bc745606..a676749a 100644 --- a/config_files/config_private_algos.yml +++ b/config_files/config_private_algos.yml @@ -1,14 +1,14 @@ # The base configuration of the benchmark log: True -name: ["mnist_0_9_train"] +name: ["multiview_mnist"] label: "_" file_type: ".hdf5" -views: ["NIMST_data", ] -pathf: "/home/baptiste/Documents/Datasets/MNist/" +views: +pathf: "examples/data/" nice: 0 random_state: 43 nb_cores: 1 -full: False +full: True debug: True add_noise: False noise_std: 0.0 @@ -19,16 +19,20 @@ track_tracebacks: False multiclass_method: "oneVersusOne" split: 0.96 nb_folds: 5 -nb_class: 2 +nb_class: classes: -type: ["monoview",] -algos_monoview: ["hm_gb_cbound","cb_boost"] -algos_multiview: ["mumbo","mvml"] +type: ["monoview","multiview"] +algos_monoview: ["decision_tree","adaboost"] +algos_multiview: ["mumbo","mvml", 'lp_norm_mkl', 'mucombo', 'early_fusion_decision_tree', 'early_fusion_adaboost'] stats_iter: 1 metrics: - zero_one_loss: {} + accuracy_score: {} f1_score: {} -metric_princ: "zero_one_loss" +metric_princ: "accuracy_score" hps_type: "None" hps_args: - n_iter: 2 \ No newline at end of file + n_iter: 2 +mumbo: + base_estimator: + decision_tree: + max_depth: 3 \ No newline at end of file diff --git a/setup.py b/setup.py index c92d5a75..8f774ddf 100644 --- a/setup.py +++ b/setup.py @@ -94,6 +94,7 @@ def setup_package(): # ce qui est notre cas license="GNUGPL", + # Il y a encore une chiée de paramètres possibles, mais avec ça vous # couvrez 90% des besoins # ext_modules=cythonize( diff --git a/summit/multiview_platform/exec_classif.py b/summit/multiview_platform/exec_classif.py index 6c75194a..a84014d3 100644 --- a/summit/multiview_platform/exec_classif.py +++ b/summit/multiview_platform/exec_classif.py @@ -548,7 +548,7 @@ def exec_one_benchmark_mono_core(dataset_var=None, labels_dictionary=None, argument_dictionaries=None, benchmark=None, views=None, views_indices=None, flag=None, labels=None, - track_tracebacks=False): # pragma: no cover + track_tracebacks=False, nb_cores=1): # pragma: no cover results_monoview, labels_names = benchmark_init(directory, classification_indices, labels, @@ -564,7 +564,7 @@ def exec_one_benchmark_mono_core(dataset_var=None, labels_dictionary=None, results_monoview += [ exec_monoview(directory, X, Y, args["name"], labels_names, classification_indices, k_folds, - 1, args["file_type"], args["pathf"], random_state, + nb_cores, args["file_type"], args["pathf"], random_state, hyper_param_search=hyper_param_search, metrics=metrics, **arguments)] @@ -679,7 +679,7 @@ def exec_benchmark(nb_cores, stats_iter, for arguments in benchmark_arguments_dictionaries: benchmark_results = exec_one_benchmark_mono_core( dataset_var=dataset_var, - track_tracebacks=track_tracebacks, + track_tracebacks=track_tracebacks, nb_cores=nb_cores, **arguments) analyze_iterations([benchmark_results], benchmark_arguments_dictionaries, stats_iter, diff --git a/summit/multiview_platform/metrics/balanced_accuracy.py b/summit/multiview_platform/metrics/balanced_accuracy.py new file mode 100644 index 00000000..6d4ab5d0 --- /dev/null +++ b/summit/multiview_platform/metrics/balanced_accuracy.py @@ -0,0 +1,41 @@ +"""Functions : + score: to get the accuracy score + get_scorer: returns a sklearn scorer for grid search +""" + +from sklearn.metrics import balanced_accuracy_score as metric +from sklearn.metrics import make_scorer + +# Author-Info +__author__ = "Baptiste Bauvin" +__status__ = "Prototype" # Production, Development, Prototype + + +def score(y_true, y_pred, multiclass=False, **kwargs): + """Arguments: + y_true: real labels + y_pred: predicted labels + + Keyword Arguments: + "0": weights to compute accuracy + + Returns: + Weighted accuracy score for y_true, y_pred""" + score = metric(y_true, y_pred, **kwargs) + return score + + +def get_scorer(**kwargs): + """Keyword Arguments: + "0": weights to compute accuracy + + Returns: + A weighted sklearn scorer for accuracy""" + return make_scorer(metric, greater_is_better=True, + **kwargs) + + +def get_config(**kwargs): + config_string = "Balanced accuracy score using {}, (higher is better)".format( + kwargs) + return config_string diff --git a/summit/multiview_platform/monoview_classifiers/adaboost.py b/summit/multiview_platform/monoview_classifiers/adaboost.py index cd8ce3db..579b9ffd 100644 --- a/summit/multiview_platform/monoview_classifiers/adaboost.py +++ b/summit/multiview_platform/monoview_classifiers/adaboost.py @@ -35,7 +35,7 @@ class Adaboost(AdaBoostClassifier, BaseMonoviewClassifier): ) self.param_names = ["n_estimators", "base_estimator"] self.classed_params = ["base_estimator"] - self.distribs = [CustomRandint(low=1, high=500), + self.distribs = [CustomRandint(low=1, high=100), base_boosting_estimators] self.weird_strings = {"base_estimator": "class_name"} self.plotted_metric = metrics.zero_one_loss diff --git a/summit/multiview_platform/monoview_classifiers/adaboost_pregen.py b/summit/multiview_platform/monoview_classifiers/adaboost_pregen.py index f0fbd955..43589981 100644 --- a/summit/multiview_platform/monoview_classifiers/adaboost_pregen.py +++ b/summit/multiview_platform/monoview_classifiers/adaboost_pregen.py @@ -17,6 +17,7 @@ __status__ = "Prototype" # Production, Development, Prototype classifier_class_name = "AdaboostPregen" + class AdaboostPregen(AdaBoostClassifier, BaseMonoviewClassifier, PregenClassifier): """ diff --git a/summit/multiview_platform/monoview_classifiers/samba.py b/summit/multiview_platform/monoview_classifiers/samba.py new file mode 100644 index 00000000..f43defd7 --- /dev/null +++ b/summit/multiview_platform/monoview_classifiers/samba.py @@ -0,0 +1,70 @@ +from SamBA.samba import NeighborHoodClassifier, ExpTrainWeighting +import numpy as np +from sklearn.tree import DecisionTreeClassifier +from SamBA.relevances import * +from SamBA.distances import * +from sklearn.preprocessing import RobustScaler + +from ..monoview.monoview_utils import BaseMonoviewClassifier +from ..utils.hyper_parameter_search import CustomRandint, CustomUniform + + +# Author-Info +__author__ = "Baptiste Bauvin" +__status__ = "Prototype" # Production, Development, Prototype + + +# class Decis +classifier_class_name = "SamBAClf" + +class SamBAClf(NeighborHoodClassifier, BaseMonoviewClassifier): + + def __init__(self, base_estimator=DecisionTreeClassifier(max_depth=1, + splitter='best', + criterion='gini'), + n_estimators=2, + estimator_params=tuple(), + relevance=MarginRelevance(), + distance=EuclidianDist(), + train_weighting=ExpTrainWeighting(), + keep_selected_features=True, + normalizer=RobustScaler(), + b=2, + pred_train=False, + forced_diversity=True, + **kwargs): + """ + + Parameters + ---------- + random_state + model_type + max_rules + p + kwargs + """ + super(SamBAClf, self).__init__(base_estimator=base_estimator, + n_estimators=n_estimators, + estimator_params=estimator_params, + relevance=relevance, + distance=distance, + train_weighting=train_weighting, + keep_selected_features=keep_selected_features, + normalizer=normalizer, + forced_diversity=forced_diversity, + b=b,pred_train=pred_train) + self.param_names = ["n_estimators", "relevance", "distance", + "train_weighting", "b", "pred_train", "normalizer"] + self.distribs = [CustomRandint(low=1, high=30), + [ExpRelevance(), MarginRelevance()], + [EuclidianDist(), PolarDist(), ExpEuclidianDist()], + [ExpTrainWeighting()], + CustomUniform(0.5, 3), + [True, False], + [RobustScaler(), None]] + self.classed_params = [] + self.weird_strings = {} + + def get_interpretation(self, directory, base_file_name, y_test, multi_class=False): + interpret_string = self.get_feature_importance(directory, base_file_name) + return interpret_string diff --git a/summit/multiview_platform/multiview_classifiers/early_fusion_samba.py b/summit/multiview_platform/multiview_classifiers/early_fusion_samba.py new file mode 100644 index 00000000..7722f003 --- /dev/null +++ b/summit/multiview_platform/multiview_classifiers/early_fusion_samba.py @@ -0,0 +1,40 @@ +from .additions.early_fusion_from_monoview import BaseEarlyFusion +from ..utils.hyper_parameter_search import CustomRandint, CustomUniform +from ..monoview_classifiers.samba import SamBAClf +from SamBA.samba import * +from SamBA.distances import * +from SamBA.relevances import * +from sklearn.tree import DecisionTreeClassifier + +classifier_class_name = "EarlyFusionSamba" + + +class EarlyFusionSamba(BaseEarlyFusion): + + def __init__(self, random_state=None, base_estimator=DecisionTreeClassifier(max_depth=1, + splitter='best', + criterion='gini'), + n_estimators=2, + estimator_params=tuple(), + relevance=MarginRelevance(), + distance=EuclidianDist(), + train_weighting=ExpTrainWeighting(pred_train=True), + keep_selected_features=True, + normalizer=RobustScaler(), + pred_train=False, + b=2, + **kwargs): + BaseEarlyFusion.__init__(self, random_state=random_state, + monoview_classifier="samba", + base_estimator=base_estimator, estimator_params=estimator_params, + relevance=relevance, distance=distance, train_weighting=train_weighting, + keep_selected_features=keep_selected_features, normalizer=normalizer, + n_estimators=n_estimators, pred_train=pred_train, b=b, **kwargs) + self.param_names = ["n_estimators", "relevance", "distance", + "train_weighting", "b", "pred_train"] + self.distribs = [CustomRandint(low=1, high=30), + [ExpRelevance(), MarginRelevance()], + [EuclidianDist(), PolarDist(), ExpEuclidianDist()], + [ExpTrainWeighting(pred_train=True)], + CustomUniform(0.25, 3), + [True, False]] \ No newline at end of file diff --git a/summit/multiview_platform/multiview_classifiers/mumbo.py b/summit/multiview_platform/multiview_classifiers/mumbo.py index e631cbc2..203228ba 100644 --- a/summit/multiview_platform/multiview_classifiers/mumbo.py +++ b/summit/multiview_platform/multiview_classifiers/mumbo.py @@ -47,6 +47,13 @@ class Mumbo(BaseMultiviewClassifier, MumboClassifier): """ if base_estimator is None: self.base_estimator = DecisionTreeClassifier() + elif type(base_estimator) is list: + if type(base_estimator[0]) is dict: + self.base_estimator = [self.set_base_estim_from_dict(estim) for estim in base_estimator] + elif isinstance(base_estimator[0], BaseEstimator): + self.base_estimator = base_estimator + else: + raise ValueError("base_estimator should ba a list of dict or a sklearn classifier list") elif isinstance(base_estimator, dict): self.base_estimator = self.set_base_estim_from_dict(base_estimator) MumboClassifier.set_params(self, **params) -- GitLab