diff --git a/config_files/config_cuisine.yml b/config_files/config_cuisine.yml
index 5a9161460377d8e9b461acc0d4d81ea38a12f0c4..9206cf2f6dc7cc55f1828726ea102540e2bb309d 100644
--- a/config_files/config_cuisine.yml
+++ b/config_files/config_cuisine.yml
@@ -1,10 +1,10 @@
 # The base configuration of the benchmark
 log: True
-name: ["ionosphere", "abalone", "australian", "balance", "bupa", "cylinder", "hepatitis", "pima", "yeast", "zoo"]
-label: "comp_1"
+name: ['tnbc_mazid']
+label: ""
 file_type: ".hdf5"
 views:
-pathf: "/home/baptiste/Documents/Datasets/UCI/both/"
+pathf: "/home/baptiste/Documents/Datasets/Mazid/"
 nice: 0
 random_state: 42
 nb_cores: 1
@@ -13,34 +13,53 @@ debug: True
 add_noise: False
 noise_std: 0.0
 res_dir: "../results/"
-track_tracebacks: False
+track_tracebacks: True
 
 # All the classification-related configuration options
 multiclass_method: "oneVersusOne"
-split: 0.50
+split: 0.30
 nb_folds: 5
 nb_class: 2
 classes:
-type: ["monoview",]
-algos_monoview: ["cb_boost", "self_opt_cb", "adaboost", "cq_boost", "min_cq", "adaboost_pregen", "self_opt_cb_pseudo", "self_opt_cb_root"]
-algos_multiview: ["mv_cb_boost","early_fusion_dt", "early_fusion_cb", "early_fusion_rf","mumbo", "early_fusion_svm" ]
+type: ["monoview","multiview"]
+algos_monoview: ["samba", "scm_bagging", "random_forest", "adaboost", 'scm']
+algos_multiview: ["early_fusion_adaboost", "early_fusion_decision_tree", "early_fusion_random_forest", "early_fusion_samba"]
 stats_iter: 5
 metrics:
-  accuracy_score: {}
+  balanced_accuracy: {}
   f1_score:
     average: 'micro'
-metric_princ: "accuracy_score"
-hps_type: "None"
+  accuracy_score: {}
+metric_princ: "balanced_accuracy"
+hps_type: "Random"
 hps_args:
-  n_iter: 30
-  equivalent_draws: True
+  n_iter: 20
+  equivalent_draws: False
 
 svm_rbf:
   C: 0.7
 
+scm_bagging:
+  {max_features: 0.908115713423863, max_rules: 9, max_samples: 0.9277949143533335, model_type: conjunction,
+   n_estimators: 109, p_options: 0.7823433255515356}
+
+samba:
+  n_estimators: 22
+
+adaboost:
+  {base_estimator: DecisionTreeClassifier, base_estimator__ccp_alpha: 0.0, base_estimator__class_weight: null,
+   base_estimator__criterion: gini, base_estimator__max_depth: 5, base_estimator__max_features: null,
+   base_estimator__max_leaf_nodes: null, base_estimator__min_impurity_decrease: 0.0,
+   base_estimator__min_impurity_split: null, base_estimator__min_samples_leaf: 1, base_estimator__min_samples_split: 2,
+   base_estimator__min_weight_fraction_leaf: 0.0, base_estimator__random_state: null,
+   base_estimator__splitter: best, n_estimators: 354}
+
+svm_linear:
+  C: 0.3867
+
 cb_boost:
   n_stumps: 1
-  n_max_iterations: 10
+  n_max_iterations: 20
   estimators_generator: "Stumps"
 
 cq_boost:
@@ -50,15 +69,42 @@ cq_boost:
 min_cq:
   n_stumps_per_attribute: 1
 
-adaboost:
-  n_estimators: 10
+decision_tree:
+  {criterion: entropy, max_depth: 271, splitter: random}
 
-adaboost_pregen:
-  n_estimators: 10
-  n_stumps: 1
+early_fusion_adaboost:
+  {base_estimator: DecisionTreeClassifier, base_estimator__ccp_alpha: 0.0, base_estimator__class_weight: null,
+   base_estimator__criterion: gini, base_estimator__max_depth: 5, base_estimator__max_features: null,
+   base_estimator__max_leaf_nodes: null, base_estimator__min_impurity_decrease: 0.0,
+   base_estimator__min_impurity_split: null, base_estimator__min_samples_leaf: 1, base_estimator__min_samples_split: 2,
+   base_estimator__min_weight_fraction_leaf: 0.0, base_estimator__random_state: null,
+   base_estimator__splitter: best, base_estimator_config: null, n_estimators: 273}
 
-decision_tree:
-  max_depth: 2
+early_fusion_decision_tree:
+  {criterion: entropy, max_depth: 293, splitter: random}
+
+early_fusion_random_forest:
+  {criterion: gini, max_depth: 8, n_estimators: 46}
+
+random_forest:
+  {criterion: gini, max_depth: 8, n_estimators: 32}
+
+weighted_linear_late_fusion:
+  classifier_configs:
+    - decision_tree: {criterion: entropy, max_depth: 112, splitter: random}
+    - adaboost: {base_estimator: DecisionTreeClassifier, base_estimator__ccp_alpha: 0.0,
+        base_estimator__class_weight: null, base_estimator__criterion: gini, base_estimator__max_depth: 2,
+        base_estimator__max_features: null, base_estimator__max_leaf_nodes: null, base_estimator__min_impurity_decrease: 0.0,
+        base_estimator__min_impurity_split: null, base_estimator__min_samples_leaf: 1,
+        base_estimator__min_samples_split: 2, base_estimator__min_weight_fraction_leaf: 0.0,
+        base_estimator__random_state: null, base_estimator__splitter: best, n_estimators: 400}
+  classifiers_names: [decision_tree, adaboost]
+  nb_cores: 1
+  rs: 724
+  weights: [0.9636627605010293, 0.3834415188257777]
+
+scm:
+  {max_rules: 10, model_type: conjunction, p: 0.8310271995093625}
 
 mumbo:
   base_estimator:
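The per-classifier sections above (decision_tree, random_forest, scm, ...) are plain YAML mappings whose keys mirror the constructor arguments of the matching estimator. A minimal sketch of that convention, using only PyYAML and scikit-learn; build_classifier and CLASSIFIER_REGISTRY are hypothetical helpers for illustration, not the SuMMIT API:

import yaml
from sklearn.tree import DecisionTreeClassifier

# Hypothetical registry mapping config section names to estimator classes.
CLASSIFIER_REGISTRY = {"decision_tree": DecisionTreeClassifier}

def build_classifier(name, config):
    # Forward the YAML section for `name` as constructor kwargs.
    return CLASSIFIER_REGISTRY[name](**config.get(name, {}))

config = yaml.safe_load("""
decision_tree:
  {criterion: entropy, max_depth: 271, splitter: random}
""")
clf = build_classifier("decision_tree", config)
print(clf.get_params()["max_depth"])  # 271
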
diff --git a/config_files/config_private_algos.yml b/config_files/config_private_algos.yml
index bc745606d059bafd60162437e24c16934629dc95..a676749a7419ffc5a2e1b17ca7a30d9405218d5d 100644
--- a/config_files/config_private_algos.yml
+++ b/config_files/config_private_algos.yml
@@ -1,14 +1,14 @@
 # The base configuration of the benchmark
 log: True
-name: ["mnist_0_9_train"]
+name: ["multiview_mnist"]
 label: "_"
 file_type: ".hdf5"
-views: ["NIMST_data", ]
-pathf: "/home/baptiste/Documents/Datasets/MNist/"
+views:
+pathf: "examples/data/"
 nice: 0
 random_state: 43
 nb_cores: 1
-full: False
+full: True
 debug: True
 add_noise: False
 noise_std: 0.0
@@ -19,16 +19,20 @@ track_tracebacks: False
 multiclass_method: "oneVersusOne"
 split: 0.96
 nb_folds: 5
-nb_class: 2
+nb_class:
 classes:
-type: ["monoview",]
-algos_monoview: ["hm_gb_cbound","cb_boost"]
-algos_multiview: ["mumbo","mvml"]
+type: ["monoview","multiview"]
+algos_monoview: ["decision_tree","adaboost"]
+algos_multiview: ["mumbo","mvml", 'lp_norm_mkl', 'mucombo', 'early_fusion_decision_tree', 'early_fusion_adaboost']
 stats_iter: 1
 metrics:
-  zero_one_loss: {}
+  accuracy_score: {}
   f1_score: {}
-metric_princ: "zero_one_loss"
+metric_princ: "accuracy_score"
 hps_type: "None"
 hps_args:
-  n_iter: 2
\ No newline at end of file
+  n_iter: 2
+mumbo:
+  base_estimator:
+    decision_tree:
+      max_depth: 3
\ No newline at end of file
diff --git a/setup.py b/setup.py
index c92d5a751bc330b121109cc60e2453977cce2baf..8f774ddfe5cfdbf51e69dd924a69ea1453dfba24 100644
--- a/setup.py
+++ b/setup.py
@@ -94,6 +94,7 @@ def setup_package():
         # which is our case
         license="GNUGPL",
+
         # There are still plenty of other possible parameters, but with
         # these you cover 90% of the needs
         # ext_modules=cythonize(
diff --git a/summit/multiview_platform/exec_classif.py b/summit/multiview_platform/exec_classif.py
index 6c75194aaf2c10a58cfdd38ecd9e2fa8e071b27b..a84014d3d05f835452255ee7ec858a83e772f539 100644
--- a/summit/multiview_platform/exec_classif.py
+++ b/summit/multiview_platform/exec_classif.py
@@ -548,7 +548,7 @@ def exec_one_benchmark_mono_core(dataset_var=None, labels_dictionary=None,
                                  argument_dictionaries=None, benchmark=None,
                                  views=None, views_indices=None, flag=None,
                                  labels=None,
-                                 track_tracebacks=False):  # pragma: no cover
+                                 track_tracebacks=False, nb_cores=1):  # pragma: no cover
     results_monoview, labels_names = benchmark_init(directory,
                                                     classification_indices, labels,
@@ -564,7 +564,7 @@ def exec_one_benchmark_mono_core(dataset_var=None, labels_dictionary=None,
         results_monoview += [
             exec_monoview(directory, X, Y, args["name"], labels_names,
                           classification_indices, k_folds,
-                          1, args["file_type"], args["pathf"], random_state,
+                          nb_cores, args["file_type"], args["pathf"], random_state,
                           hyper_param_search=hyper_param_search,
                           metrics=metrics,
                           **arguments)]
@@ -679,7 +679,7 @@ def exec_benchmark(nb_cores, stats_iter,
     for arguments in benchmark_arguments_dictionaries:
         benchmark_results = exec_one_benchmark_mono_core(
             dataset_var=dataset_var,
-            track_tracebacks=track_tracebacks,
+            track_tracebacks=track_tracebacks, nb_cores=nb_cores,
             **arguments)
         analyze_iterations([benchmark_results],
                            benchmark_arguments_dictionaries, stats_iter,
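The exec_classif.py hunks stop hardcoding a single core in the mono-core benchmark path: exec_benchmark now forwards its nb_cores argument to exec_one_benchmark_mono_core, which passes it down to exec_monoview. A stripped-down sketch of the same plumbing, with illustrative stand-ins for the real SuMMIT functions:

# Illustrative stand-ins, not the real functions; the point is only that
# nb_cores now flows from the benchmark level down to each monoview run.
def exec_monoview(nb_cores, **kwargs):
    return "monoview run on {} core(s)".format(nb_cores)

def exec_one_benchmark_mono_core(track_tracebacks=False, nb_cores=1, **arguments):
    # Before the patch, this call site passed a literal 1 instead of nb_cores.
    return exec_monoview(nb_cores=nb_cores, **arguments)

print(exec_one_benchmark_mono_core(nb_cores=4))  # monoview run on 4 core(s)
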
diff --git a/summit/multiview_platform/metrics/balanced_accuracy.py b/summit/multiview_platform/metrics/balanced_accuracy.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d4ab5d0c93ab45b50bdb4f8f8672979321f072b
--- /dev/null
+++ b/summit/multiview_platform/metrics/balanced_accuracy.py
@@ -0,0 +1,41 @@
+"""Functions:
+    score: computes the balanced accuracy score
+    get_scorer: returns a sklearn scorer for grid search
+"""
+
+from sklearn.metrics import balanced_accuracy_score as metric
+from sklearn.metrics import make_scorer
+
+# Author-Info
+__author__ = "Baptiste Bauvin"
+__status__ = "Prototype"  # Production, Development, Prototype
+
+
+def score(y_true, y_pred, multiclass=False, **kwargs):
+    """Arguments:
+    y_true: real labels
+    y_pred: predicted labels
+
+    Keyword Arguments:
+        forwarded to sklearn's balanced_accuracy_score
+
+    Returns:
+        Balanced accuracy score for y_true, y_pred"""
+    score = metric(y_true, y_pred, **kwargs)
+    return score
+
+
+def get_scorer(**kwargs):
+    """Keyword Arguments:
+        forwarded to sklearn's balanced_accuracy_score
+
+    Returns:
+        A sklearn scorer for balanced accuracy"""
+    return make_scorer(metric, greater_is_better=True,
+                       **kwargs)
+
+
+def get_config(**kwargs):
+    config_string = "Balanced accuracy score using {} (higher is better)".format(
+        kwargs)
+    return config_string
diff --git a/summit/multiview_platform/monoview_classifiers/adaboost.py b/summit/multiview_platform/monoview_classifiers/adaboost.py
index cd8ce3db0b769e7ad99032487d94da010988138b..579b9ffd252db73fb9568c2c7937f6d217600f05 100644
--- a/summit/multiview_platform/monoview_classifiers/adaboost.py
+++ b/summit/multiview_platform/monoview_classifiers/adaboost.py
@@ -35,7 +35,7 @@ class Adaboost(AdaBoostClassifier, BaseMonoviewClassifier):
         )
         self.param_names = ["n_estimators", "base_estimator"]
         self.classed_params = ["base_estimator"]
-        self.distribs = [CustomRandint(low=1, high=500),
+        self.distribs = [CustomRandint(low=1, high=100),
                          base_boosting_estimators]
         self.weird_strings = {"base_estimator": "class_name"}
         self.plotted_metric = metrics.zero_one_loss
diff --git a/summit/multiview_platform/monoview_classifiers/adaboost_pregen.py b/summit/multiview_platform/monoview_classifiers/adaboost_pregen.py
index f0fbd9551070b99db9276a5899d3502b99204b36..43589981b438e1a12cc4b54f2cdb956c63f15a8e 100644
--- a/summit/multiview_platform/monoview_classifiers/adaboost_pregen.py
+++ b/summit/multiview_platform/monoview_classifiers/adaboost_pregen.py
@@ -17,6 +17,7 @@ __status__ = "Prototype"  # Production, Development, Prototype
 
 classifier_class_name = "AdaboostPregen"
 
+
 class AdaboostPregen(AdaBoostClassifier, BaseMonoviewClassifier,
                      PregenClassifier):
     """
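The new metric module wraps sklearn's balanced_accuracy_score, the average of per-class recall, behind the platform's score/get_scorer/get_config contract; it is the headline metric (metric_princ) in config_cuisine.yml above. A standalone check of the same sklearn primitives on a deliberately imbalanced toy set:

# Standalone use of the sklearn calls the new metric module wraps.
from sklearn.metrics import balanced_accuracy_score, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

X = [[0], [1], [2], [3], [4], [5]]
y = [0, 0, 0, 0, 1, 1]  # imbalanced: plain accuracy would favor class 0

scorer = make_scorer(balanced_accuracy_score, greater_is_better=True)
print(cross_val_score(DecisionTreeClassifier(), X, y, cv=2, scoring=scorer))
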
diff --git a/summit/multiview_platform/monoview_classifiers/samba.py b/summit/multiview_platform/monoview_classifiers/samba.py
new file mode 100644
index 0000000000000000000000000000000000000000..f43defd792ec642a105b71b3b393ea2a9cdee7cb
--- /dev/null
+++ b/summit/multiview_platform/monoview_classifiers/samba.py
@@ -0,0 +1,70 @@
+from SamBA.samba import NeighborHoodClassifier, ExpTrainWeighting
+import numpy as np
+from sklearn.tree import DecisionTreeClassifier
+from SamBA.relevances import *
+from SamBA.distances import *
+from sklearn.preprocessing import RobustScaler
+
+from ..monoview.monoview_utils import BaseMonoviewClassifier
+from ..utils.hyper_parameter_search import CustomRandint, CustomUniform
+
+
+# Author-Info
+__author__ = "Baptiste Bauvin"
+__status__ = "Prototype"  # Production, Development, Prototype
+
+
+classifier_class_name = "SamBAClf"
+
+
+class SamBAClf(NeighborHoodClassifier, BaseMonoviewClassifier):
+
+    def __init__(self, base_estimator=DecisionTreeClassifier(max_depth=1,
+                                                             splitter='best',
+                                                             criterion='gini'),
+                 n_estimators=2,
+                 estimator_params=tuple(),
+                 relevance=MarginRelevance(),
+                 distance=EuclidianDist(),
+                 train_weighting=ExpTrainWeighting(),
+                 keep_selected_features=True,
+                 normalizer=RobustScaler(),
+                 b=2,
+                 pred_train=False,
+                 forced_diversity=True,
+                 **kwargs):
+        """Parameters
+        ----------
+        base_estimator
+        n_estimators
+        relevance
+        distance
+        train_weighting
+        b
+        pred_train
+        """
+        super(SamBAClf, self).__init__(base_estimator=base_estimator,
+                                       n_estimators=n_estimators,
+                                       estimator_params=estimator_params,
+                                       relevance=relevance,
+                                       distance=distance,
+                                       train_weighting=train_weighting,
+                                       keep_selected_features=keep_selected_features,
+                                       normalizer=normalizer,
+                                       forced_diversity=forced_diversity,
+                                       b=b, pred_train=pred_train)
+        self.param_names = ["n_estimators", "relevance", "distance",
+                            "train_weighting", "b", "pred_train", "normalizer"]
+        self.distribs = [CustomRandint(low=1, high=30),
+                         [ExpRelevance(), MarginRelevance()],
+                         [EuclidianDist(), PolarDist(), ExpEuclidianDist()],
+                         [ExpTrainWeighting()],
+                         CustomUniform(0.5, 3),
+                         [True, False],
+                         [RobustScaler(), None]]
+        self.classed_params = []
+        self.weird_strings = {}
+
+    def get_interpretation(self, directory, base_file_name, y_test, multi_class=False):
+        interpret_string = self.get_feature_importance(directory, base_file_name)
+        return interpret_string
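SamBAClf follows the usual monoview wrapper pattern: inherit the external estimator (SamBA's NeighborHoodClassifier) together with BaseMonoviewClassifier, then declare param_names and the matching distribs so the "Random" hyper-parameter search knows what to sample for each parameter. A short usage sketch, assuming the external SamBA package is installed:

# Assumes the SamBA package is importable; n_estimators: 22 mirrors the
# samba section of config_cuisine.yml above.
from summit.multiview_platform.monoview_classifiers.samba import SamBAClf

clf = SamBAClf(n_estimators=22)
# The wrapper declares these two lists in parallel order; the platform's
# random search draws one value per parameter from the paired distributions.
print(list(zip(clf.param_names, clf.distribs)))
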
monoview_classifier="samba", + base_estimator=base_estimator, estimator_params=estimator_params, + relevance=relevance, distance=distance, train_weighting=train_weighting, + keep_selected_features=keep_selected_features, normalizer=normalizer, + n_estimators=n_estimators, pred_train=pred_train, b=b, **kwargs) + self.param_names = ["n_estimators", "relevance", "distance", + "train_weighting", "b", "pred_train"] + self.distribs = [CustomRandint(low=1, high=30), + [ExpRelevance(), MarginRelevance()], + [EuclidianDist(), PolarDist(), ExpEuclidianDist()], + [ExpTrainWeighting(pred_train=True)], + CustomUniform(0.25, 3), + [True, False]] \ No newline at end of file diff --git a/summit/multiview_platform/multiview_classifiers/mumbo.py b/summit/multiview_platform/multiview_classifiers/mumbo.py index e631cbc2ff0c053b82b055595e977d6a9844bc74..203228ba795338ae913682aea19aec0a1c164025 100644 --- a/summit/multiview_platform/multiview_classifiers/mumbo.py +++ b/summit/multiview_platform/multiview_classifiers/mumbo.py @@ -47,6 +47,13 @@ class Mumbo(BaseMultiviewClassifier, MumboClassifier): """ if base_estimator is None: self.base_estimator = DecisionTreeClassifier() + elif type(base_estimator) is list: + if type(base_estimator[0]) is dict: + self.base_estimator = [self.set_base_estim_from_dict(estim) for estim in base_estimator] + elif isinstance(base_estimator[0], BaseEstimator): + self.base_estimator = base_estimator + else: + raise ValueError("base_estimator should ba a list of dict or a sklearn classifier list") elif isinstance(base_estimator, dict): self.base_estimator = self.set_base_estim_from_dict(base_estimator) MumboClassifier.set_params(self, **params)