From e5fbb2f9c03d025d059858fb78ba73191c9d3c62 Mon Sep 17 00:00:00 2001 From: Baptiste Bauvin <baptiste.bauvin@lis-lab.fr> Date: Tue, 28 Feb 2023 07:23:15 -0500 Subject: [PATCH] Added some clfs" --- .../monoview_classifiers/decision_tree.py | 9 +- .../monoview_classifiers/repeatboost.py | 127 ++++++++++++++++++ .../monoview_classifiers/samba.py | 5 +- .../monoview_classifiers/scm.py | 30 ++++- .../monoview_classifiers/scmboost.py | 93 +++++++++++++ .../multiview_classifiers/early_fusion_knn.py | 23 ++++ 6 files changed, 278 insertions(+), 9 deletions(-) create mode 100644 summit/multiview_platform/monoview_classifiers/repeatboost.py create mode 100644 summit/multiview_platform/monoview_classifiers/scmboost.py create mode 100644 summit/multiview_platform/multiview_classifiers/early_fusion_knn.py diff --git a/summit/multiview_platform/monoview_classifiers/decision_tree.py b/summit/multiview_platform/monoview_classifiers/decision_tree.py index f8392df4..00c56dee 100644 --- a/summit/multiview_platform/monoview_classifiers/decision_tree.py +++ b/summit/multiview_platform/monoview_classifiers/decision_tree.py @@ -18,21 +18,22 @@ class DecisionTree(DecisionTreeClassifier, BaseMonoviewClassifier): """ def __init__(self, random_state=None, max_depth=None, - criterion='gini', splitter='best', **kwargs): + criterion='gini', splitter='best', class_weight='balanced', + **kwargs): DecisionTreeClassifier.__init__(self, max_depth=max_depth, criterion=criterion, - class_weight="balanced", splitter=splitter, + class_weight=class_weight, random_state=random_state ) - self.param_names = ["max_depth", "criterion", "splitter", + self.param_names = ["max_depth", "criterion", "splitter", "class_weight", 'random_state'] self.classed_params = [] self.distribs = [CustomRandint(low=1, high=300), ["gini", "entropy"], - ["best", "random"], [random_state]] + ["best", "random"],["balanced", None] ,[random_state]] self.weird_strings = {} def get_interpretation(self, directory, base_file_name, y_test, feature_ids, diff --git a/summit/multiview_platform/monoview_classifiers/repeatboost.py b/summit/multiview_platform/monoview_classifiers/repeatboost.py new file mode 100644 index 00000000..cec9230c --- /dev/null +++ b/summit/multiview_platform/monoview_classifiers/repeatboost.py @@ -0,0 +1,127 @@ +import os +import time + +import numpy as np +from sklearn.ensemble import AdaBoostClassifier + + +from .. import metrics +from ..monoview.monoview_utils import BaseMonoviewClassifier, get_accuracy_graph +from summit.multiview_platform.utils.hyper_parameter_search import CustomRandint +from ..monoview_classifiers.decision_tree import DecisionTree +from ..utils.base import base_boosting_estimators + +# Author-Info +__author__ = "Baptiste Bauvin" +__status__ = "Prototype" # Production, Development, Prototype + +classifier_class_name = "REPboost" + + +class RepDT(DecisionTree): + + def fit(self, X, y, sample_weight=None, check_input=True): + if sample_weight is not None: + new_X, new_y = self.fake_repetitions(X, y, sample_weight, + precision=5) + else: + new_X = X + new_y = y + DecisionTree.fit(self, new_X, new_y) + return self + + def fake_repetitions(self, X, y, sample_weight, precision=3): + sample_repetitions = (np.round(sample_weight, + precision) * 10 ** precision).astype( + np.int64) + for ind, sample_rep in enumerate(sample_repetitions): + if sample_rep == 0: + sample_repetitions[ind] = 1 + gcd = np.gcd.reduce(sample_repetitions) + sample_repetitions = (sample_repetitions / gcd).astype(np.int64) + new_X = np.zeros( + (X.shape[0]+ int(np.sum(sample_repetitions)-len(sample_repetitions)), X.shape[1])) + new_y = np.zeros(X.shape[0]+ int(np.sum(sample_repetitions)-len(sample_repetitions))) + ind = 0 + for sample_index, (sample_rep, sample, label) in enumerate( + zip(sample_repetitions, X, y)): + new_X[ind:ind + sample_rep, :] = sample + new_y[ind:ind + sample_rep] = label + ind += sample_rep + return new_X, new_y + +class REPboost(AdaBoostClassifier, BaseMonoviewClassifier): + """ + This class is an adaptation of scikit-learn's `AdaBoostClassifier <https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html#sklearn.ensemble.AdaBoostClassifier>`_ + + """ + + def __init__(self, random_state=None, n_estimators=50, + base_estimator=RepDT(max_depth=1), + base_estimator_config=None, **kwargs): + # base_estimator = BaseMonoviewClassifier.get_base_estimator(self, + # base_estimator, + # base_estimator_config) + AdaBoostClassifier.__init__(self, + random_state=random_state, + n_estimators=n_estimators, + base_estimator=base_estimator, + algorithm="SAMME" + ) + self.param_names = ["n_estimators",] + self.classed_params = [] + self.distribs = [CustomRandint(low=1, high=100),] + self.weird_strings = {} + self.plotted_metric = metrics.zero_one_loss + self.plotted_metric_name = "zero_one_loss" + self.base_estimator_config = base_estimator_config + self.step_predictions = None + + def fit(self, X, y, sample_weight=None): + begin = time.time() + AdaBoostClassifier.fit(self, X, y) + end = time.time() + self.train_time = end - begin + self.train_shape = X.shape + self.base_predictions = np.array( + [estim.predict(X) for estim in self.estimators_]) + self.metrics = np.array([self.plotted_metric.score(pred, y) for pred in + self.staged_predict(X)]) + return self + + def predict(self, X): + begin = time.time() + pred = AdaBoostClassifier.predict(self, X) + end = time.time() + self.pred_time = end - begin + self.step_predictions = np.array( + [step_pred for step_pred in self.staged_predict(X)]) + return pred + + def get_interpretation(self, directory, base_file_name, y_test, feature_ids, + multi_class=False): # pragma: no cover + interpretString = "" + # interpretString += self.get_feature_importance(directory, + # base_file_name, + # feature_ids) + # interpretString += "\n\n Estimator error | Estimator weight\n" + # interpretString += "\n".join( + # [str(error) + " | " + str(weight / sum(self.estimator_weights_)) for + # error, weight in + # zip(self.estimator_errors_, self.estimator_weights_)]) + # step_test_metrics = np.array( + # [self.plotted_metric.score(y_test, step_pred) for step_pred in + # self.step_predictions]) + # get_accuracy_graph(step_test_metrics, "Adaboost", + # os.path.join(directory, + # base_file_name + "test_metrics.png"), + # self.plotted_metric_name, set="test") + # np.savetxt(os.path.join(directory, base_file_name + "test_metrics.csv"), + # step_test_metrics, + # delimiter=',') + # np.savetxt( + # os.path.join(directory, base_file_name + "train_metrics.csv"), + # self.metrics, delimiter=',') + # np.savetxt(os.path.join(directory, base_file_name + "times.csv"), + # np.array([self.train_time, self.pred_time]), delimiter=',') + return interpretString diff --git a/summit/multiview_platform/monoview_classifiers/samba.py b/summit/multiview_platform/monoview_classifiers/samba.py index a69de937..a2ca6a30 100644 --- a/summit/multiview_platform/monoview_classifiers/samba.py +++ b/summit/multiview_platform/monoview_classifiers/samba.py @@ -61,7 +61,7 @@ class SamBAClf(NeighborHoodClassifier, BaseMonoviewClassifier): "relevance", "distance", "train_weighting", "b", "pred_train", "normalizer", - "normalize_dists", "a", "class_weight"] + "normalize_dists", "a", "class_weight", "base_estimator"] self.distribs = [CustomRandint(low=1, high=70), [ExpRelevance()], [EuclidianDist(), PolarDist(), ExpEuclidianDist(), Jaccard()], @@ -70,7 +70,8 @@ class SamBAClf(NeighborHoodClassifier, BaseMonoviewClassifier): [True, False], [RobustScaler()], [True], CustomRandint(0, 10, 'e-'), - ["balanced", None]] + ["balanced", None], + ] self.classed_params = [] self.weird_strings = {} diff --git a/summit/multiview_platform/monoview_classifiers/scm.py b/summit/multiview_platform/monoview_classifiers/scm.py index d094ceec..4ee442f1 100644 --- a/summit/multiview_platform/monoview_classifiers/scm.py +++ b/summit/multiview_platform/monoview_classifiers/scm.py @@ -63,9 +63,14 @@ class SCM(scm, BaseMonoviewClassifier): self.classed_params = [] self.weird_strings = {} - def fit(self, X, y, tiebreaker=None, iteration_callback=None, **fit_params): - self.n_features = X.shape[1] - scm.fit(self, X, y, tiebreaker=None, iteration_callback=None, **fit_params) + def fit(self, X, y, tiebreaker=None, iteration_callback=None, sample_weight=None, **fit_params): + if sample_weight is not None: + new_X, new_y = self.fake_repetitions(X, y, sample_weight, precision=5) + else: + new_X = X + new_y = y + self.n_features = new_X.shape[1] + scm.fit(self, new_X, new_y, tiebreaker=None, iteration_callback=None, **fit_params) self.feature_importances_ = np.zeros(self.n_features) # sum the rules importances : # rules_importances = estim.get_rules_importances() #activate it when pyscm will implement importance @@ -76,6 +81,25 @@ class SCM(scm, BaseMonoviewClassifier): self.feature_importances_ /= np.sum(self.feature_importances_) return self + def fake_repetitions(self, X, y, sample_weight, precision=3): + sample_repetitions = (np.round(sample_weight, precision)*10**precision).astype(np.int64) + for ind, sample_rep in enumerate(sample_repetitions): + if sample_rep==0: + sample_repetitions[ind] = 1 + gcd = np.gcd.reduce(sample_repetitions) + sample_repetitions = (sample_repetitions/gcd).astype(np.int64) + new_X = np.zeros((X.shape[0]+ int(np.sum(sample_repetitions)-len(sample_repetitions)), X.shape[1])) + new_y = np.zeros(X.shape[0]+ int(np.sum(sample_repetitions)-len(sample_repetitions))) + ind=0 + for sample_index, (sample_rep, sample, label) in enumerate(zip(sample_repetitions, X, y)): + new_X[ind:ind+sample_rep, :] = sample + new_y[ind:ind+sample_rep] =label + ind+=sample_rep + return new_X, new_y + + + + # def canProbas(self): # """ # Used to know if the classifier can return label probabilities diff --git a/summit/multiview_platform/monoview_classifiers/scmboost.py b/summit/multiview_platform/monoview_classifiers/scmboost.py new file mode 100644 index 00000000..73db0419 --- /dev/null +++ b/summit/multiview_platform/monoview_classifiers/scmboost.py @@ -0,0 +1,93 @@ +import os +import time + +import numpy as np +from sklearn.ensemble import AdaBoostClassifier + + +from .. import metrics +from ..monoview.monoview_utils import BaseMonoviewClassifier, get_accuracy_graph +from summit.multiview_platform.utils.hyper_parameter_search import CustomRandint, CustomUniform +from ..monoview_classifiers.scm import SCM +from ..utils.base import base_boosting_estimators + +# Author-Info +__author__ = "Baptiste Bauvin" +__status__ = "Prototype" # Production, Development, Prototype + +classifier_class_name = "SCMboost" + + +class SCMboost(AdaBoostClassifier, BaseMonoviewClassifier): + """ + This class is an adaptation of scikit-learn's `AdaBoostClassifier <https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html#sklearn.ensemble.AdaBoostClassifier>`_ + + """ + + def __init__(self, random_state=None, n_estimators=50, + base_estimator=SCM(p=0.49, max_rules=1, model_type="disjunction"), + base_estimator_config=None, **kwargs): + if "base_estimator__p" in kwargs: + base_estimator.p = kwargs["base_estimator__p"] + AdaBoostClassifier.__init__(self, + random_state=random_state, + n_estimators=n_estimators, + base_estimator=base_estimator, + algorithm="SAMME",) + self.param_names = ["n_estimators", "base_estimator__p"] + self.classed_params = [] + self.distribs = [CustomRandint(low=1, high=100), CustomUniform(loc=0, state=1)] + self.weird_strings = {} + self.plotted_metric = metrics.zero_one_loss + self.plotted_metric_name = "zero_one_loss" + self.base_estimator_config = base_estimator_config + self.step_predictions = None + + def fit(self, X, y, sample_weight=None): + begin = time.time() + AdaBoostClassifier.fit(self, X, y) + end = time.time() + self.train_time = end - begin + self.train_shape = X.shape + self.base_predictions = np.array( + [estim.predict(X) for estim in self.estimators_]) + self.metrics = np.array([self.plotted_metric.score(pred, y) for pred in + self.staged_predict(X)]) + return self + + def predict(self, X): + begin = time.time() + pred = AdaBoostClassifier.predict(self, X) + end = time.time() + self.pred_time = end - begin + self.step_predictions = np.array( + [step_pred for step_pred in self.staged_predict(X)]) + return pred + + def get_interpretation(self, directory, base_file_name, y_test, feature_ids, + multi_class=False): # pragma: no cover + interpretString = "" + # interpretString += self.get_feature_importance(directory, + # base_file_name, + # feature_ids) + # interpretString += "\n\n Estimator error | Estimator weight\n" + # interpretString += "\n".join( + # [str(error) + " | " + str(weight / sum(self.estimator_weights_)) for + # error, weight in + # zip(self.estimator_errors_, self.estimator_weights_)]) + # step_test_metrics = np.array( + # [self.plotted_metric.score(y_test, step_pred) for step_pred in + # self.step_predictions]) + # get_accuracy_graph(step_test_metrics, "Adaboost", + # os.path.join(directory, + # base_file_name + "test_metrics.png"), + # self.plotted_metric_name, set="test") + # np.savetxt(os.path.join(directory, base_file_name + "test_metrics.csv"), + # step_test_metrics, + # delimiter=',') + # np.savetxt( + # os.path.join(directory, base_file_name + "train_metrics.csv"), + # self.metrics, delimiter=',') + # np.savetxt(os.path.join(directory, base_file_name + "times.csv"), + # np.array([self.train_time, self.pred_time]), delimiter=',') + return interpretString diff --git a/summit/multiview_platform/multiview_classifiers/early_fusion_knn.py b/summit/multiview_platform/multiview_classifiers/early_fusion_knn.py new file mode 100644 index 00000000..63b37853 --- /dev/null +++ b/summit/multiview_platform/multiview_classifiers/early_fusion_knn.py @@ -0,0 +1,23 @@ +from .additions.early_fusion_from_monoview import BaseEarlyFusion +from ..utils.hyper_parameter_search import CustomUniform, CustomRandint + +classifier_class_name = "EarlyFusionSVMRBF" + + +class EarlyFusionSVMRBF(BaseEarlyFusion): + + def __init__(self, random_state=None, n_neighbors=5, + weights='uniform', algorithm='auto', p=2, **kwargs): + BaseEarlyFusion.__init__(self, random_state=random_state, + monoview_classifier="knn",n_neighbors=n_neighbors, + weights=weights, + algorithm=algorithm, + p=p, **kwargs) + self.param_names = ["n_neighbors", "weights", "algorithm", "p", + "random_state", ] + self.classed_params = [] + self.distribs = [CustomRandint(low=1, high=10), ["uniform", "distance"], + ["auto", "ball_tree", "kd_tree", "brute"], [1, 2], + [random_state]] + self.weird_strings = {} + self.random_state = random_state \ No newline at end of file -- GitLab