diff --git a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/BoostUtils.py b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/BoostUtils.py
index ca2cf70442dd125eba2b1ac50c944dcf07d34fb8..2002b3e81d36a421a5973aa981a515ca81a32112 100644
--- a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/BoostUtils.py
+++ b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/BoostUtils.py
@@ -191,6 +191,9 @@ class StumpsClassifiersGenerator(ClassifiersGenerator):
         """
         minimums = np.min(X, axis=0)
         maximums = np.max(X, axis=0)
+        if y.ndim > 1:
+            # Flatten column-vector labels so the stumps get the 1D target they expect.
+            y = np.reshape(y, (y.shape[0], ))
         ranges = (maximums - minimums) / (self.n_stumps_per_attribute + 1)
         if self.check_diff:
             nb_differents = [np.unique(col) for col in np.transpose(X)]
@@ -226,7 +229,6 @@ class StumpsClassifiersGenerator(ClassifiersGenerator):
         self.estimators_ += [DecisionStumpClassifier(i, minimums[i] + ranges[i] * stump_number, -1).fit(X, y)
                              for i in range(X.shape[1]) for stump_number in range(1, self.n_stumps_per_attribute + 1)
                              if ranges[i] != 0]
-        self.estimators_ = np.asarray(self.estimators_)
         return self
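A note on the threshold layout used above: for every feature, the generator places `n_stumps_per_attribute` cut points evenly spaced strictly inside the feature's [min, max] interval (none sits on a boundary value), and constant features are skipped by the `ranges[i] != 0` test. A minimal standalone sketch on made-up data, without importing the platform:

```python
import numpy as np

# Toy data, two features; mirrors the threshold arithmetic of
# StumpsClassifiersGenerator.fit shown in the hunk above.
X = np.array([[0.0, 10.0],
              [1.0, 20.0],
              [2.0, 30.0]])
n_stumps_per_attribute = 3

minimums = np.min(X, axis=0)                                   # [0., 10.]
maximums = np.max(X, axis=0)                                   # [2., 30.]
ranges = (maximums - minimums) / (n_stumps_per_attribute + 1)  # [0.5, 5.]

thresholds = [[minimums[i] + ranges[i] * k
               for k in range(1, n_stumps_per_attribute + 1)]
              for i in range(X.shape[1]) if ranges[i] != 0]
print(thresholds)  # [[0.5, 1.0, 1.5], [15.0, 20.0, 25.0]]
```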
diff --git a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/QarBoostUtils.py b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/QarBoostUtils.py
index f9d7917dc9712a3b0e10200fc972c8bd4477bfdd..0db72182653e7da4d6c9c753b81d4dd677c460fe 100644
--- a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/QarBoostUtils.py
+++ b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/QarBoostUtils.py
@@ -11,6 +11,7 @@ import matplotlib.pyplot as plt
 
 from .BoostUtils import StumpsClassifiersGenerator, sign, BaseBoost, \
     getInterpretBase, get_accuracy_graph
+from ..MonoviewUtils import change_label_to_zero, change_label_to_minus
 from ... import Metrics  # Used for QarBoost and CGreed
@@ -328,9 +329,10 @@ class ColumnGenerationClassifierQar(BaseEstimator, ClassifierMixin, BaseBoost):
             logging.info('Converting to dense matrix.')
             X = np.array(X.todense())
         # Initialization
-        y[y == 0] = -1
-        y = y.reshape((y.shape[0], 1))
-        return X, y
+        # change_label_to_minus copies y, so the caller's labels are no longer mutated in place.
+        y_neg = change_label_to_minus(y)
+        y_neg = y_neg.reshape((y.shape[0], 1))
+        return X, y_neg
 
     def init_hypotheses(self, X, y):
         """Initialization for the hypotheses used to build the boosted vote"""
diff --git a/multiview_platform/MonoMultiViewClassifiers/Monoview/MonoviewUtils.py b/multiview_platform/MonoMultiViewClassifiers/Monoview/MonoviewUtils.py
index 3367e44303812381424339357a4b9f298a36d057..44f12edbac25402773da287392e8a75cbeb1fe4a 100644
--- a/multiview_platform/MonoMultiViewClassifiers/Monoview/MonoviewUtils.py
+++ b/multiview_platform/MonoMultiViewClassifiers/Monoview/MonoviewUtils.py
@@ -46,6 +46,18 @@ def randomizedSearch(X_train, y_train, randomState, outputFileName, classifierMo
     testFoldsPreds = genTestFoldsPreds(X_train, y_train, KFolds, best_estimator)
     return bestParams, testFoldsPreds
 
+def change_label_to_minus(y):
+    """Return a copy of y with labels mapped from {0, 1} to {-1, 1}."""
+    minus_y = np.copy(y)
+    minus_y[np.where(y == 0)] = -1
+    return minus_y
+
+def change_label_to_zero(y):
+    """Return a copy of y with labels mapped from {-1, 1} back to {0, 1}."""
+    zeroed_y = np.copy(y)
+    zeroed_y[np.where(y == -1)] = 0
+    return zeroed_y
+
 
 def compute_possible_combinations(params_dict):
     n_possibs = np.ones(len(params_dict))*np.inf
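These helpers replace the in-place `y[y == 0] = -1`: each returns a fresh copy, so the caller's label array is never mutated. Assuming the package is importable under the paths shown in the diff headers, a round-trip check:

```python
import numpy as np

from multiview_platform.MonoMultiViewClassifiers.Monoview.MonoviewUtils import (
    change_label_to_minus, change_label_to_zero)

y = np.array([0, 1, 1, 0])
y_neg = change_label_to_minus(y)                       # array([-1,  1,  1, -1])
assert np.array_equal(change_label_to_zero(y_neg), y)  # maps back exactly
assert np.array_equal(y, np.array([0, 1, 1, 0]))       # the input was not mutated
```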
+ + """ + def __init__(self, n_iterations=200, iterations_to_collect_as_hyperparameters=True, classifiers_generator=None, callback_function=None, n_stumps=10, self_complemented=True): + self.n_iterations = n_iterations + self.n_stumps=n_stumps + self.iterations_to_collect_as_hyperparameters = iterations_to_collect_as_hyperparameters + self.estimators_generator = classifiers_generator + self.callback_function = callback_function + self.self_complemented = self_complemented + + def fit(self, X, y): + """Fits the algorithm on training data. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + The input data. + y : ndarray of shape (n_samples, ) + The input labels. + + Returns + ------- + self + + """ + y_neg = change_label_to_minus(y) + + if self.estimators_generator is None: + self.estimators_generator = StumpsClassifiersGenerator(n_stumps_per_attribute=self.n_stumps, self_complemented=self.self_complemented) + + + # Step 1: We fit the classifiers generator and get its classification matrix. + self.estimators_generator.fit(X, y_neg) + # hint: This is equivalent to construct a new X + classification_matrix = self._binary_classification_matrix(X) + + n_samples, n_voters = classification_matrix.shape + # logging.debug("n_voters = {}".format(n_voters)) + + # Step 2: We initialize the weights on the samples and the weak classifiers. + sample_weights = np.ones(n_samples) / n_samples + alpha_weights = np.zeros(n_voters) + self.losses = [] + + # Step 3: We loop for each iteration. + self.collected_weight_vectors_ = [] + for t in range(self.n_iterations): + + # Step 4: We find the classifier that maximizes the success, weighted by the sample weights. + classifier_successes = np.dot(classification_matrix.T, sample_weights * y_neg) + + best_voter_index = np.argmax(classifier_successes) + success = classifier_successes[best_voter_index] + + if success >= 1.0: + logging.info("AdaBoost stopped : perfect classifier found!") + self.weights_ = np.zeros(n_voters) + self.weights_[best_voter_index] = 1.0 + return self + + # Step 5: We calculate the alpha_t parameter and update the alpha weights. + alpha = 0.5 * np.log((1.0 + success) / (1.0 - success)) + alpha_weights[best_voter_index] += alpha + + # logging.debug("{} : {}".format(t, str(alpha))) + + # Step 6: We update the sample weights. + sample_weights *= np.exp(-1 * alpha * y_neg * classification_matrix[:, best_voter_index]) + + normalization_constant = sample_weights.sum() + sample_weights = sample_weights / normalization_constant + + # We collect iteration information for later evaluation. + if self.iterations_to_collect_as_hyperparameters: + weights = alpha_weights / np.sum(alpha_weights) + self.collected_weight_vectors_.append(weights.copy()) + + loss = zero_one_loss.score(y_neg, np.sign(np.sum( + np.multiply(classification_matrix, + alpha_weights / np.sum(alpha_weights)), axis=1))) + self.losses.append(loss) + + if self.callback_function is not None: + self.callback_function(t, alpha_weights, normalization_constant, self.estimators_generator, self.weights_) + + self.weights_ = alpha_weights / np.sum(alpha_weights) + self.losses = np.array(self.losses) + self.learner_info_ = {'n_nonzero_weights': np.sum(self.weights_ > 1e-12)} + + return self + + def predict(self, X): + """Predict inputs using the fit classifier. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + The data to classify. + + Returns + ------- + predictions : ndarray of shape (n_samples, ) + The estimated labels. 
+ + """ + check_is_fitted(self, 'weights_') + classification_matrix = self._binary_classification_matrix(X) + + if self.iterations_to_collect_as_hyperparameters: + self.test_preds = [] + for weight_vector in self.collected_weight_vectors_: + preds = np.sum(np.multiply(classification_matrix, + weight_vector), axis=1) + self.test_preds.append(change_label_to_zero(np.sign(preds))) + self.test_preds = np.array(self.test_preds) + margins = np.squeeze(np.asarray(np.dot(classification_matrix, self.weights_))) + return change_label_to_zero(np.array([int(x) for x in np.sign(margins)])) + + +class AdaboostGraalpy(AdaBoostGP, BaseMonoviewClassifier): + + def __init__(self, random_state=None, n_iterations=200, n_stumps=10, **kwargs): + super(AdaboostGraalpy, self).__init__( + n_iterations=n_iterations, + n_stumps=n_stumps + ) + self.param_names = [] + self.distribs = [] + self.classed_params = [] + self.weird_strings = {} + self.n_stumps = n_stumps + if "nbCores" not in kwargs: + self.nbCores = 1 + else: + self.nbCores = kwargs["nbCores"] + + def canProbas(self): + """Used to know if the classifier can return label probabilities""" + return True + + def getInterpret(self, directory, y_test): + np.savetxt(directory + "train_metrics.csv", self.losses, delimiter=',') + np.savetxt(directory + "y_test_step.csv", self.test_preds, + delimiter=',') + step_metrics = [] + for step_index in range(self.test_preds.shape[0] - 1): + step_metrics.append(zero_one_loss.score(y_test, + self.test_preds[step_index, + :])) + step_metrics = np.array(step_metrics) + np.savetxt(directory + "step_test_metrics.csv", step_metrics, + delimiter=',') + return "" + + +def formatCmdArgs(args): + """Used to format kwargs for the parsed args""" + kwargsDict = {"n_iterations": args.AdG_n_iter, + "n_stumps": args.AdG_stumps,} + return kwargsDict + + +def paramsToSet(nIter, randomState): + """Used for weighted linear early fusion to generate random search sets""" + paramsSet = [] + for _ in range(nIter): + paramsSet.append({"n_iterations": randomState.randint(1, 500),}) + return paramsSet diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregen.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregen.py new file mode 100644 index 0000000000000000000000000000000000000000..08a8f5029defa32e8062cabce8df37e17d400dae --- /dev/null +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregen.py @@ -0,0 +1,105 @@ +from sklearn.ensemble import AdaBoostClassifier +from sklearn.tree import DecisionTreeClassifier +import numpy as np +import time +from sklearn.metrics import accuracy_score + +from ..Monoview.MonoviewUtils import CustomRandint, BaseMonoviewClassifier, change_label_to_minus, change_label_to_zero +from ..Monoview.Additions.BoostUtils import get_accuracy_graph +from .. 
diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregen.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregen.py
new file mode 100644
index 0000000000000000000000000000000000000000..08a8f5029defa32e8062cabce8df37e17d400dae
--- /dev/null
+++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregen.py
@@ -0,0 +1,105 @@
+from sklearn.ensemble import AdaBoostClassifier
+from sklearn.tree import DecisionTreeClassifier
+import numpy as np
+import time
+
+from ..Monoview.MonoviewUtils import CustomRandint, BaseMonoviewClassifier, change_label_to_minus, change_label_to_zero
+from ..Monoview.Additions.BoostUtils import get_accuracy_graph, StumpsClassifiersGenerator, BaseBoost
+from .. import Metrics
+
+# Author-Info
+__author__ = "Baptiste Bauvin"
+__status__ = "Prototype"  # Production, Development, Prototype
+
+
+class AdaboostPregen(AdaBoostClassifier, BaseBoost, BaseMonoviewClassifier):
+
+    def __init__(self, random_state=None, n_estimators=50,
+                 base_estimator=None, n_stumps=10, self_complemented=True, **kwargs):
+        super(AdaboostPregen, self).__init__(
+            random_state=random_state,
+            n_estimators=n_estimators,
+            base_estimator=base_estimator,
+            algorithm="SAMME"
+        )
+        self.param_names = ["n_estimators", "base_estimator"]
+        self.classed_params = ["base_estimator"]
+        self.distribs = [CustomRandint(low=1, high=500), [DecisionTreeClassifier(max_depth=1)]]
+        self.weird_strings = {"base_estimator": "class_name"}
+        self.plotted_metric = Metrics.zero_one_loss
+        self.plotted_metric_name = "zero_one_loss"
+        self.step_predictions = None
+        self.estimators_generator = None
+        self.n_stumps = n_stumps
+        self.self_complemented = self_complemented
+
+    def fit(self, X, y, sample_weight=None):
+        begin = time.time()
+        pregen_X, pregen_y = self.pregen_voters(X, y)
+        super(AdaboostPregen, self).fit(pregen_X, pregen_y, sample_weight=sample_weight)
+        end = time.time()
+        self.train_time = end - begin
+        self.train_shape = pregen_X.shape
+        self.base_predictions = np.array([change_label_to_zero(estim.predict(pregen_X)) for estim in self.estimators_])
+        self.metrics = np.array([self.plotted_metric.score(change_label_to_zero(pred), y) for pred in self.staged_predict(pregen_X)])
+        # Boosting training-error bound: prod_t sqrt(1 - 4 * (1/2 - eps_t)^2) = prod_t 2*sqrt(eps_t*(1 - eps_t))
+        self.bounds = np.array([np.prod(np.sqrt(1 - 4 * np.square(0.5 - self.estimator_errors_[:i + 1])))
+                                for i in range(self.estimator_errors_.shape[0])])
+        return self
+
+    def canProbas(self):
+        """Used to know if the classifier can return label probabilities"""
+        return True
+
+    def predict(self, X):
+        begin = time.time()
+        pregen_X, _ = self.pregen_voters(X)
+        pred = super(AdaboostPregen, self).predict(pregen_X)
+        end = time.time()
+        self.pred_time = end - begin
+        # Only record step-by-step predictions on unseen data (the train matrix has a different shape).
+        if pregen_X.shape != self.train_shape:
+            self.step_predictions = np.array([change_label_to_zero(step_pred) for step_pred in self.staged_predict(pregen_X)])
+        return change_label_to_zero(pred)
+
+    def getInterpret(self, directory, y_test):
+        interpretString = ""
+        interpretString += self.getFeatureImportance(directory)
+        interpretString += "\n\n Estimator error | Estimator weight\n"
+        interpretString += "\n".join([str(error) + " | " + str(weight / sum(self.estimator_weights_))
+                                      for error, weight in zip(self.estimator_errors_, self.estimator_weights_)])
+        step_test_metrics = np.array([self.plotted_metric.score(y_test, step_pred) for step_pred in self.step_predictions])
+        get_accuracy_graph(step_test_metrics, "AdaboostPregen", directory + "test_metrics.png",
+                           self.plotted_metric_name, set="test")
+        get_accuracy_graph(self.metrics, "AdaboostPregen", directory + "metrics.png",
+                           self.plotted_metric_name, bounds=list(self.bounds), bound_name="boosting bound")
+        np.savetxt(directory + "test_metrics.csv", step_test_metrics, delimiter=',')
+        np.savetxt(directory + "train_metrics.csv", self.metrics, delimiter=',')
+        np.savetxt(directory + "times.csv", np.array([self.train_time, self.pred_time]), delimiter=',')
+        return interpretString
+
+    def pregen_voters(self, X, y=None):
+        if y is not None:
+            neg_y = change_label_to_minus(y)
+            if self.estimators_generator is None:
+                self.estimators_generator = StumpsClassifiersGenerator(
+                    n_stumps_per_attribute=self.n_stumps,
+                    self_complemented=self.self_complemented)
+            self.estimators_generator.fit(X, neg_y)
+        else:
+            neg_y = None
+        classification_matrix = self._binary_classification_matrix(X)
+        return classification_matrix, neg_y
+
+
+def formatCmdArgs(args):
+    """Used to format kwargs for the parsed args"""
+    kwargsDict = {'n_estimators': args.AdP_n_est,
+                  'base_estimator': DecisionTreeClassifier(max_depth=1),
+                  'n_stumps': args.AdP_stumps}
+    return kwargsDict
+
+
+def paramsToSet(nIter, random_state):
+    """Used to generate a random parameter set for each iteration of a random search"""
+    paramsSet = []
+    for _ in range(nIter):
+        paramsSet.append({"n_estimators": random_state.randint(1, 500),
+                          "base_estimator": None})
+    return paramsSet
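The pregeneration idea in short: `pregen_voters` replaces `X` by the matrix of stump outputs, and the stock sklearn `AdaBoostClassifier` (SAMME) is trained on that matrix instead of the raw features. A sketch on synthetic data; it assumes `_binary_classification_matrix` (inherited from `BaseBoost`) returns the voters' votes in {-1, 1}, as its name suggests:

```python
import numpy as np

from multiview_platform.MonoMultiViewClassifiers.MonoviewClassifiers.AdaboostPregen import AdaboostPregen

rng = np.random.RandomState(0)
X = rng.rand(60, 3)
y = (X[:, 1] > 0.5).astype(int)

clf = AdaboostPregen(n_estimators=10, n_stumps=5)
clf.fit(X, y)

# The ensemble never saw X itself, only the pregenerated voter outputs:
pregen_X, _ = clf.pregen_voters(X, y)
print(X.shape, "->", pregen_X.shape)  # (60, 3) -> (60, n_voters)
print(np.unique(pregen_X))            # expected to be [-1., 1.]
```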
diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/MinCQ.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/MinCQ.py
index b7dcd3cfcc56b6d9c64febbba1dbc7dfb56e3498..7edf2bf666916978bd778e94b17826cd9b99bbe6 100644
--- a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/MinCQ.py
+++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/MinCQ.py
@@ -527,7 +527,8 @@ class MinCQ(MinCqLearner, BaseMonoviewClassifier):
 
-    def __init__(self, random_state=None, mu=0.01, epsilon=1e-06, **kwargs):
+    def __init__(self, random_state=None, mu=0.01, epsilon=1e-06,
+                 n_stumps_per_attribute=10, **kwargs):
         super(MinCQ, self).__init__(mu=mu, voters_type='stumps',
-                                    n_stumps_per_attribute = 10
+                                    n_stumps_per_attribute=n_stumps_per_attribute
                                     )
         self.param_names = ["mu"]
         self.distribs = [CustomUniform(loc=0.5, state=1.0, multiplier="e-"),
@@ -556,7 +558,8 @@ class MinCQ(MinCqLearner, BaseMonoviewClassifier):
 def formatCmdArgs(args):
     """Used to format kwargs for the parsed args"""
-    kwargsDict = {"mu":args.MCQ_mu,}
+    kwargsDict = {"mu": args.MCQ_mu,
+                  "n_stumps_per_attribute": args.MCQ_stumps}
     return kwargsDict
diff --git a/multiview_platform/MonoMultiViewClassifiers/utils/execution.py b/multiview_platform/MonoMultiViewClassifiers/utils/execution.py
index 55355de96984af69f10bf33831af5b018883cb37..43b2e1d8f4dbfe69187658913d810b2b2bd2ad19 100644
--- a/multiview_platform/MonoMultiViewClassifiers/utils/execution.py
+++ b/multiview_platform/MonoMultiViewClassifiers/utils/execution.py
@@ -122,6 +122,28 @@ def parseTheArgs(arguments):
     groupAdaboost.add_argument('--Ada_b_est', metavar='STRING', action='store',
                                help='Estimators',
                                default='DecisionTreeClassifier')
+    groupAdaboostPregen = parser.add_argument_group('AdaboostPregen arguments')
+    groupAdaboostPregen.add_argument('--AdP_n_est', metavar='INT', type=int,
+                                     action='store', help='Number of estimators',
+                                     default=2)
+    groupAdaboostPregen.add_argument('--AdP_b_est', metavar='STRING', action='store',
+                                     help='Estimators',
+                                     default='DecisionTreeClassifier')
+    groupAdaboostPregen.add_argument('--AdP_stumps', metavar='INT', type=int,
+                                     action='store',
+                                     help='Number of stumps in the pregenerated dataset',
+                                     default=1)
+
+    groupAdaboostGraalpy = parser.add_argument_group('AdaboostGraalpy arguments')
+    groupAdaboostGraalpy.add_argument('--AdG_n_iter', metavar='INT', type=int,
+                                      action='store',
+                                      help='Number of boosting iterations',
+                                      default=2)
+    groupAdaboostGraalpy.add_argument('--AdG_stumps', metavar='INT', type=int,
+                                      action='store',
+                                      help='Number of stumps in the pregenerated dataset',
+                                      default=1)
+
     groupDT = parser.add_argument_group('Decision Trees arguments')
     groupDT.add_argument('--DT_depth', metavar='INT', type=int, action='store',
                          help='Determine max depth for Decision Trees', default=3)
@@ -222,6 +244,10 @@ def parseTheArgs(arguments):
                                action='store',
                                help='Set the mu_parameter for MinCQ',
                                default=1e-3)
+    groupMinCQ.add_argument('--MCQ_stumps', metavar='INT', type=int,
+                            action='store',
+                            help='Set the n_stumps_per_attribute parameter for MinCQ',
+                            default=1)
 
     groupQarBoostv3 = parser.add_argument_group('QarBoostv3 arguments')
     groupQarBoostv3.add_argument('--QarB3_mu', metavar='FLOAT', type=float,
                                  action='store',
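The new flags can be exercised directly through `parseTheArgs`. This sketch assumes the function returns the parsed namespace and that no other flag is mandatory, which may not hold for a full platform run; note the deliberately tiny defaults (2 estimators, 1 stump) that real experiments should override:

```python
from multiview_platform.MonoMultiViewClassifiers.utils.execution import parseTheArgs

# Hypothetical invocation; only the flags added by this change are set.
args = parseTheArgs(['--AdP_n_est', '50', '--AdP_stumps', '10',
                     '--AdG_n_iter', '100', '--AdG_stumps', '10',
                     '--MCQ_stumps', '10'])
print(args.AdP_n_est, args.AdG_n_iter, args.MCQ_stumps)  # 50 100 10
```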