diff --git a/multiview_platform/MonoMultiViewClassifiers/ExecClassif.py b/multiview_platform/MonoMultiViewClassifiers/ExecClassif.py index c36e80a623c0a73e987b456691af40724acd5fff..6395dc74d88ffe0ddf49335057a26fc91d244f6b 100644 --- a/multiview_platform/MonoMultiViewClassifiers/ExecClassif.py +++ b/multiview_platform/MonoMultiViewClassifiers/ExecClassif.py @@ -173,7 +173,11 @@ def initMonoviewKWARGS(args, classifiersNames): logging.debug("Start:\t Initializing Monoview classifiers arguments") monoviewKWARGS = {} for classifiersName in classifiersNames: - classifierModule = getattr(MonoviewClassifiers, classifiersName) + try: + classifierModule = getattr(MonoviewClassifiers, classifiersName) + except AttributeError: + raise AttributeError(classifiersName+" is not implemented in MonoviewClassifiers, " + "please specify the name of the file in MonoviewClassifiers") monoviewKWARGS[classifiersName + "KWARGSInit"] = classifierModule.getKWARGS(args) logging.debug("Done:\t Initializing Monoview classifiers arguments") return monoviewKWARGS @@ -449,7 +453,7 @@ def execClassif(arguments): monoviewAlgos = args.CL_algos_monoview multiviewAlgos = args.CL_algos_multiview - directory = execution.initLogFile(args.name, args.views, args.CL_type, args.log) + directory = execution.initLogFile(args.name, args.views, args.CL_type, args.log, args.debug) randomState = execution.initRandomState(args.randomState, directory) statsIterRandomStates = execution.initStatsIterRandomStates(statsIter,randomState) diff --git a/multiview_platform/MonoMultiViewClassifiers/Monoview/BoostUtils.py b/multiview_platform/MonoMultiViewClassifiers/Monoview/BoostUtils.py new file mode 100644 index 0000000000000000000000000000000000000000..af8396cdf4ec7a541e8e9d43940d669cf9b23275 --- /dev/null +++ b/multiview_platform/MonoMultiViewClassifiers/Monoview/BoostUtils.py @@ -0,0 +1,694 @@ +import numpy as np +from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin +from sklearn.preprocessing import LabelEncoder +from sklearn.utils.validation import check_is_fitted +import sys +import matplotlib.pyplot as plt +import datetime + + +class DecisionStumpClassifier(BaseEstimator, ClassifierMixin): + """Generic Attribute Threshold Binary Classifier + + Attributes + ---------- + attribute_index : int + The attribute to consider for the classification. + threshold : float + The threshold value for classification rule. + direction : int, optional + A multiplicative constant (1 or -1) to choose the "direction" of the stump. Defaults to 1. If -1, the stump + will predict the "negative" class (generally -1 or 0), and if 1, the stump will predict the second class (generally 1). + + """ + def __init__(self, attribute_index, threshold, direction=1): + super(DecisionStumpClassifier, self).__init__() + self.attribute_index = attribute_index + self.threshold = threshold + self.direction = direction + + def fit(self, X, y): + # Only verify that we are in the binary classification setting, with support for transductive learning. + if isinstance(y, np.ma.MaskedArray): + self.classes_ = np.unique(y[np.logical_not(y.mask)]) + else: + self.classes_ = np.unique(y) + + # This label encoder is there for the predict function to be able to return any two classes that were used + # when fitting, for example {-1, 1} or {0, 1}. 
+ self.le_ = LabelEncoder() + self.le_.fit(self.classes_) + self.classes_ = self.le_.classes_ + + assert len(self.classes_) == 2, "DecisionStumpsVoter only supports binary classification" + return self + + def predict(self, X): + """Returns the output of the classifier, on a sample X. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Training vectors, where n_samples is the number of samples and + n_features is the number of features. + + Returns + ------- + predictions : array-like, shape = [n_samples] + Predicted class labels. + + """ + check_is_fitted(self, 'classes_') + return self.le_.inverse_transform(np.argmax(self.predict_proba(X), axis=1)) + + def predict_proba(self, X): + """Compute probabilities of possible outcomes for samples in X. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Training vectors, where n_samples is the number of samples and + n_features is the number of features. + + Returns + ------- + avg : array-like, shape = [n_samples, n_classes] + Weighted average probability for each class per sample. + + """ + check_is_fitted(self, 'classes_') + X = np.asarray(X) + probas = np.zeros((X.shape[0], 2)) + positive_class = np.argwhere(X[:, self.attribute_index] > self.threshold) + negative_class = np.setdiff1d(range(X.shape[0]), positive_class) + probas[positive_class, 1] = 1.0 + probas[negative_class, 0] = 1.0 + + if self.direction == -1: + probas = 1 - probas + + return probas + + def reverse_decision(self): + self.direction *= -1 + + +class ClassifiersGenerator(BaseEstimator, TransformerMixin): + """Base class to create a set of voters using training samples, and then transform a set of examples in + the voters' output space. + + Attributes + ---------- + self_complemented : bool, optional + Whether or not a binary complement voter must be generated for each voter. Defaults to False. + voters : ndarray of voter functions + Once fit, contains the voter functions. + + """ + def __init__(self, self_complemented=False): + super(ClassifiersGenerator, self).__init__() + self.self_complemented = self_complemented + + def fit(self, X, y=None): + """Generates the voters using training samples. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Input data on which to base the voters. + y : ndarray of shape (n_labeled_samples,), optional + Input labels, usually determines the decision polarity of each voter. + + Returns + ------- + self + + """ + raise NotImplementedError + + def transform(self, X): + """Transforms the input points in a matrix of classification, using previously learned voters. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Input data to classify. + + Returns + ------- + ndarray of shape (n_samples, n_voters) + The voters' decision on each example. + + """ + check_is_fitted(self, 'estimators_') + return np.array([voter.predict(X) for voter in self.estimators_]).T + +class StumpsClassifiersGenerator(ClassifiersGenerator): + """Decision Stump Voters transformer. + + Parameters + ---------- + n_stumps_per_attribute : int, optional + Determines how many decision stumps will be created for each attribute. Defaults to 10. + No stumps will be created for attributes with only one possible value. + self_complemented : bool, optional + Whether or not a binary complement voter must be generated for each voter. Defaults to False. 
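+
+    Examples
+    --------
+    A rough sketch on made-up data (illustrative values, not from the original
+    code): thresholds are spread evenly between each attribute's minimum and
+    maximum, and constant attributes yield no stumps.
+
+    >>> import numpy as np
+    >>> X = np.array([[0., 1.], [5., 1.], [10., 1.]])
+    >>> y = np.array([-1, -1, 1])
+    >>> gen = StumpsClassifiersGenerator(n_stumps_per_attribute=2).fit(X, y)
+    >>> len(gen.estimators_)
+    2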
+ + """ + def __init__(self, n_stumps_per_attribute=10, self_complemented=False): + super(StumpsClassifiersGenerator, self).__init__(self_complemented) + self.n_stumps_per_attribute = n_stumps_per_attribute + + def fit(self, X, y): + """Fits Decision Stump voters on a training set. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Input data on which to base the voters. + y : ndarray of shape (n_labeled_samples,), optional + Only used to ensure that we are in the binary classification setting. + + Returns + ------- + self + + """ + minimums = np.min(X, axis=0) + maximums = np.max(X, axis=0) + ranges = (maximums - minimums) / (self.n_stumps_per_attribute + 1) + + self.estimators_ = [DecisionStumpClassifier(i, minimums[i] + ranges[i] * stump_number, 1).fit(X, y) + for i in range(X.shape[1]) for stump_number in range(1, self.n_stumps_per_attribute + 1) + if ranges[i] != 0] + + if self.self_complemented: + self.estimators_ += [DecisionStumpClassifier(i, minimums[i] + ranges[i] * stump_number, -1).fit(X, y) + for i in range(X.shape[1]) for stump_number in range(1, self.n_stumps_per_attribute + 1) + if ranges[i] != 0] + + self.estimators_ = np.asarray(self.estimators_) + return self + +def _as_matrix(element): + """ Utility function to convert "anything" to a Numpy matrix. + """ + # If a scalar, return a 1x1 matrix. + if len(np.shape(element)) == 0: + return np.matrix([[element]], dtype=float) + + # If a nd-array vector, return a column matrix. + elif len(np.shape(element)) == 1: + matrix = np.matrix(element, dtype=float) + if np.shape(matrix)[1] != 1: + matrix = matrix.T + return matrix + + return np.matrix(element, dtype=float) + + +def _as_column_matrix(array_like): + """ Utility function to convert any array to a column Numpy matrix. + """ + matrix = _as_matrix(array_like) + if 1 not in np.shape(matrix): + raise ValueError("_as_column_vector: input must be a vector") + + if np.shape(matrix)[0] == 1: + matrix = matrix.T + + return matrix + + +def _as_line_matrix(array_like): + """ Utility function to convert any array to a line Numpy matrix. + """ + matrix = _as_matrix(array_like) + if 1 not in np.shape(matrix): + raise ValueError("_as_column_vector: input must be a vector") + + if np.shape(matrix)[1] == 1: + matrix = matrix.T + + return matrix + + + + + +def sign(array): + """Computes the elementwise sign of all elements of an array. The sign function returns -1 if x <=0 and 1 if x > 0. + Note that numpy's sign function can return 0, which is not desirable in most cases in Machine Learning algorithms. + + Parameters + ---------- + array : array-like + Input values. + + Returns + ------- + ndarray + An array with the signs of input elements. + + """ + signs = np.sign(array) + + signs[array == 0] = -1 + return signs + + +class ConvexProgram(object): + """ + Encapsulates a quadratic program of the following form: + + minimize (1/2)*x'*P*x + q'*x + subject to G*x <= h + A*x = b. 
+ + + or a linear program of the following form: + + minimize c'*x + subject to G*x <= h + A*x = b + """ + def __init__(self): + self._quadratic_func = None + self._linear_func = None + self._inequality_constraints_matrix = None + self._inequality_constraints_values = None + self._equality_constraints_matrix = None + self._equality_constraints_values = None + self._lower_bound_values = None + self._upper_bound_values = None + self._n_variables = None + + @property + def n_variables(self): + return self._n_variables + + @property + def quadratic_func(self): + return self._quadratic_func + + @quadratic_func.setter + def quadratic_func(self, quad_matrix): + quad_matrix = _as_matrix(quad_matrix) + n_lines, n_columns = np.shape(quad_matrix) + assert(n_lines == n_columns) + + if self._linear_func is not None: + assert(np.shape(quad_matrix)[0] == self._n_variables) + else: + self._n_variables = n_lines + + self._quadratic_func = quad_matrix + + @property + def linear_func(self): + return self._linear_func + + @linear_func.setter + def linear_func(self, lin_vector): + if lin_vector is not None: + lin_vector = _as_column_matrix(lin_vector) + + if self._quadratic_func is not None: + assert(np.shape(lin_vector)[0] == self._n_variables) + + else: + self._n_variables = np.shape(lin_vector)[0] + + self._linear_func = lin_vector + + def add_inequality_constraints(self, inequality_matrix, inequality_values): + if inequality_matrix is None: + return + + self._assert_objective_function_is_set() + + if 1 in np.shape(inequality_matrix) or len(np.shape(inequality_matrix)) == 1: + inequality_matrix = _as_line_matrix(inequality_matrix) + else: + inequality_matrix = _as_matrix(inequality_matrix) + + inequality_values = _as_column_matrix(inequality_values) + assert np.shape(inequality_matrix)[1] == self._n_variables + assert np.shape(inequality_values)[1] == 1 + + if self._inequality_constraints_matrix is None: + self._inequality_constraints_matrix = inequality_matrix + else: + self._inequality_constraints_matrix = np.append(self._inequality_constraints_matrix, + inequality_matrix, axis=0) + + if self._inequality_constraints_values is None: + self._inequality_constraints_values = inequality_values + else: + self._inequality_constraints_values = np.append(self._inequality_constraints_values, + inequality_values, axis=0) + + def add_equality_constraints(self, equality_matrix, equality_values): + if equality_matrix is None: + return + + self._assert_objective_function_is_set() + + if 1 in np.shape(equality_matrix) or len(np.shape(equality_matrix)) == 1: + equality_matrix = _as_line_matrix(equality_matrix) + else: + equality_matrix = _as_matrix(equality_matrix) + + equality_values = _as_matrix(equality_values) + assert np.shape(equality_matrix)[1] == self._n_variables + assert np.shape(equality_values)[1] == 1 + + if self._equality_constraints_matrix is None: + self._equality_constraints_matrix = equality_matrix + else: + self._equality_constraints_matrix = np.append(self._equality_constraints_matrix, + equality_matrix, axis=0) + + if self._equality_constraints_values is None: + self._equality_constraints_values = equality_values + else: + self._equality_constraints_values = np.append(self._equality_constraints_values, + equality_values, axis=0) + + def add_lower_bound(self, lower_bound): + if lower_bound is not None: + self._assert_objective_function_is_set() + self._lower_bound_values = np.array([lower_bound] * self._n_variables) + + def add_upper_bound(self, upper_bound): + if upper_bound is not None: + 
self._assert_objective_function_is_set() + self._upper_bound_values = np.array([upper_bound] * self._n_variables) + + def _convert_bounds_to_inequality_constraints(self): + self._assert_objective_function_is_set() + + if self._lower_bound_values is not None: + c_matrix = [] + for i in range(self._n_variables): + c_line = [0] * self._n_variables + c_line[i] = -1.0 + c_matrix.append(c_line) + + c_vector = _as_column_matrix(self._lower_bound_values) + self._lower_bound_values = None + self.add_inequality_constraints(np.matrix(c_matrix).T, c_vector) + + if self._upper_bound_values is not None: + c_matrix = [] + for i in range(self._n_variables): + c_line = [0] * self._n_variables + c_line[i] = 1.0 + c_matrix.append(c_line) + + c_vector = _as_column_matrix(self._upper_bound_values) + self._upper_bound_values = None + self.add_inequality_constraints(np.matrix(c_matrix).T, c_vector) + + def _convert_to_cvxopt_matrices(self): + from cvxopt import matrix as cvxopt_matrix + + if self._quadratic_func is not None: + self._quadratic_func = cvxopt_matrix(self._quadratic_func) + + if self._linear_func is not None: + self._linear_func = cvxopt_matrix(self._linear_func) + else: + # CVXOPT needs this vector to be set even if it is not used, so we put zeros in it! + self._linear_func = cvxopt_matrix(np.zeros((self._n_variables, 1))) + + if self._inequality_constraints_matrix is not None: + self._inequality_constraints_matrix = cvxopt_matrix(self._inequality_constraints_matrix) + + if self._inequality_constraints_values is not None: + self._inequality_constraints_values = cvxopt_matrix(self._inequality_constraints_values) + + if self._equality_constraints_matrix is not None: + self._equality_constraints_matrix = cvxopt_matrix(self._equality_constraints_matrix) + + if self._equality_constraints_values is not None: + self._equality_constraints_values = cvxopt_matrix(self._equality_constraints_values) + + def _assert_objective_function_is_set(self): + assert self._n_variables is not None + + def solve(self, solver="cvxopt", feastol=1e-7, abstol=1e-7, reltol=1e-6, return_all_information=False): + + # Some solvers are very verbose, and we don't want them to pollute STDOUT or STDERR. 
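+        # A rough usage sketch (hypothetical values; assumes cvxopt is installed):
+        #
+        #     program = ConvexProgram()
+        #     program.quadratic_func = np.eye(2)         # P
+        #     program.linear_func = np.array([1., 1.])   # q
+        #     program.add_lower_bound(0.0)               # x >= 0, turned into G*x <= h
+        #     x = program.solve(solver="cvxopt")         # primal solution, shape (2,)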
+ original_stdout = sys.stdout + original_stderr = sys.stderr + + ret = None + + # TODO: Repair + # if solver == "cvxopt": + # stdout_logger = logging.getLogger('CVXOPT') + # sl = StreamToLogger(stdout_logger, logging.DEBUG) + # sys.stdout = sl + + # stderr_logger = logging.getLogger('CVXOPT') + # sl = StreamToLogger(stderr_logger, logging.WARNING) + # sys.stderr = sl + + try: + if solver == "cvxopt": + from cvxopt.solvers import qp, lp, options + options['feastol'] = feastol + options['abstol'] = abstol + options['reltol'] = reltol + options['show_progress'] = False + + self._convert_bounds_to_inequality_constraints() + self._convert_to_cvxopt_matrices() + + if self._quadratic_func is not None: + ret = qp(self.quadratic_func, self.linear_func, self._inequality_constraints_matrix, + self._inequality_constraints_values, self._equality_constraints_matrix, + self._equality_constraints_values) + + else: + ret = lp(self.linear_func, + G=self._inequality_constraints_matrix, + h=self._inequality_constraints_values, + A=self._equality_constraints_matrix, + b=self._equality_constraints_values) + + #logging.info("Primal objective value = {}".format(ret['primal objective'])) + #logging.info("Dual objective value = {}".format(ret['dual objective'])) + + if not return_all_information: + ret = np.asarray(np.array(ret['x']).T[0]) + + elif solver == "cplex": + import cplex + p = cplex.Cplex() + p.objective.set_sense(p.objective.sense.minimize) + + # This is ugly. CPLEX wants a list of lists of lists. First dimension represents the lines of the QP + # matrix. Second dimension contains a pair of two elements: the indices of the variables in play (all of + # them...), and the values (columns of the QP matrix). + names = [str(x) for x in range(self._n_variables)] + p.variables.add(names=names) + + if self.quadratic_func is not None: + p_matrix = [] + for line in self._quadratic_func: + p_matrix.append([names, line.tolist()[0]]) + + p.objective.set_quadratic(p_matrix) + + if self.linear_func is not None: + p.objective.set_linear(zip(names, + np.asarray(self.linear_func.T).reshape(self.n_variables,).tolist())) + + if self._inequality_constraints_matrix is not None: + inequality_linear = [] + for line in self._inequality_constraints_matrix: + inequality_linear.append([names, line.tolist()[0]]) + p.linear_constraints.add(lin_expr=inequality_linear, + rhs=np.asarray(self._inequality_constraints_values.T).tolist()[0], + senses="L"*len(self._inequality_constraints_values)) + + if self._equality_constraints_matrix is not None: + equality_linear = [] + for line in self._equality_constraints_matrix: + equality_linear.append([names, line.tolist()[0]]) + p.linear_constraints.add(lin_expr=equality_linear, + rhs=np.asarray(self._equality_constraints_values.T).tolist()[0], + senses="E"*len(self._equality_constraints_values)) + + if self._lower_bound_values is not None: + p.variables.set_lower_bounds(zip(names, self._lower_bound_values)) + + if self._upper_bound_values is not None: + p.variables.set_upper_bounds(zip(names, self._upper_bound_values)) + + p.solve() + + if not return_all_information: + ret = np.array(p.solution.get_values()) + else: + ret = {'primal': np.array(p.solution.get_values()), + 'dual': np.array(p.solution.get_dual_values())} + + elif solver == "pycpx": + # This shows how easy it is to use pycpx. However, it is much slower (as it is more versatile!). 
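+            # (pycpx, like cplex above, is an optional dependency; it is only
+            # imported when this solver is explicitly requested.)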
+ + import pycpx + model = pycpx.CPlexModel(verbosity=2) + q = model.new(self.n_variables) + + if self._inequality_constraints_matrix is not None: + model.constrain(self._inequality_constraints_matrix * q <= self._inequality_constraints_values) + if self._equality_constraints_matrix is not None: + model.constrain(self._equality_constraints_matrix * q == self._equality_constraints_values) + if self._lower_bound_values is not None: + model.constrain(q >= self._lower_bound_values) + if self._upper_bound_values is not None: + model.constrain(q <= self._upper_bound_values) + + value = model.minimize(0.5 * q.T * self._quadratic_func * q + self.linear_func.T * q) + + if not return_all_information: + ret = np.array(model[q]) + else: + ret = model + + except: + raise + + finally: + sys.stdout = original_stdout + sys.stderr = original_stderr + + return ret + + def _as_matrix(element): + """ Utility function to convert "anything" to a Numpy matrix. + """ + # If a scalar, return a 1x1 matrix. + if len(np.shape(element)) == 0: + return np.matrix([[element]], dtype=float) + + # If a nd-array vector, return a column matrix. + elif len(np.shape(element)) == 1: + matrix = np.matrix(element, dtype=float) + if np.shape(matrix)[1] != 1: + matrix = matrix.T + return matrix + + return np.matrix(element, dtype=float) + + def _as_column_matrix(array_like): + """ Utility function to convert any array to a column Numpy matrix. + """ + matrix = _as_matrix(array_like) + if 1 not in np.shape(matrix): + raise ValueError("_as_column_vector: input must be a vector") + + if np.shape(matrix)[0] == 1: + matrix = matrix.T + + return matrix + + def _as_line_matrix(array_like): + """ Utility function to convert any array to a line Numpy matrix. + """ + matrix = _as_matrix(array_like) + if 1 not in np.shape(matrix): + raise ValueError("_as_column_vector: input must be a vector") + + if np.shape(matrix)[1] == 1: + matrix = matrix.T + + return matrix + + def sign(array): + """Computes the elementwise sign of all elements of an array. The sign function returns -1 if x <=0 and 1 if x > 0. + Note that numpy's sign function can return 0, which is not desirable in most cases in Machine Learning algorithms. + + Parameters + ---------- + array : array-like + Input values. + + Returns + ------- + ndarray + An array with the signs of input elements. 
+ + """ + signs = np.sign(array) + + signs[array == 0] = -1 + return signs + +def get_accuracy_graph(train_accuracies, classifier_name, file_name): + f, ax = plt.subplots(nrows=1, ncols=1) + ax.set_title("Accuracies during train for "+classifier_name) + x = np.arange(len(train_accuracies)) + scat = ax.scatter(x, np.array(train_accuracies), ) + ax.legend((scat,), ("Accuracies",)) + plt.tight_layout() + f.savefig(file_name) + plt.close() + +class BaseBoost(object): + + def __init__(self): + self.n_stumps = 10 + + def _collect_probas(self, X): + return np.asarray([clf.predict_proba(X) for clf in self.estimators_generator.estimators_]) + + def _binary_classification_matrix(self, X): + probas = self._collect_probas(X) + predicted_labels = np.argmax(probas, axis=2) + predicted_labels[predicted_labels == 0] = -1 + values = np.max(probas, axis=2) + return (predicted_labels * values).T + + def _initialize_alphas(self, n_examples): + raise NotImplementedError("Alpha weights initialization function is not implemented.") + + def check_opposed_voters(self, ): + nb_opposed = 0 + for column in self.classification_matrix[:, self.chosen_columns_].transpose(): + if -column.reshape((self.n_total_examples, 1)) in self.classification_matrix[:, self.chosen_columns_]: + nb_opposed+=1 + return int(nb_opposed/2) + + +def getInterpretBase(classifier, directory, classifier_name, weights, + break_cause=" the dual constrail was not violated"): + interpretString = "\t "+classifier_name+" permformed classification with weights : \n" + weights_sort = np.argsort(weights) + interpretString += np.array2string(weights[weights_sort], precision=4, separator=',', suppress_small=True) + interpretString += "\n \t It used {} iterations to converge, and selected {} couple(s) of opposed voters".format( + len(weights_sort), classifier.nb_opposed_voters) + if len(weights_sort) == classifier.n_max_iterations or len(weights) == classifier.n_total_hypotheses_: + if len(weights) == classifier.n_max_iterations: + interpretString += ", and used all available iterations, " + else: + interpretString += "." + if len(weights) == classifier.n_total_hypotheses_: + interpretString += ", and all the voters have been used." + else: + interpretString += "." + else: + interpretString += ", and the loop was broken because"+break_cause + interpretString += "\n\t Selected voters : \n" + interpretString += np.array2string(np.array(classifier.chosen_columns_)[weights_sort]) + interpretString += "\n\t Trained in "+str(datetime.timedelta(seconds=classifier.train_time))+" and predicted in "+str(datetime.timedelta(seconds=classifier.predict_time))+"." 
+ interpretString += "\n\t Selected columns : \n" + interpretString += np.array2string(classifier.classification_matrix[:, classifier.chosen_columns_], precision=4, + separator=',', suppress_small=True) + np.savetxt(directory + "voters.csv", classifier.classification_matrix[:, classifier.chosen_columns_], delimiter=',') + np.savetxt(directory + "weights.csv", classifier.weights_, delimiter=',') + get_accuracy_graph(classifier.train_accuracies, classifier_name, directory + 'accuracies.png') + return interpretString diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CQBoost.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CQBoost.py index 0d1ecb15ef01c801f883c7af4feae1bac8b8ab9b..9ef3a1cda15fe8374229596ab4a3dabebd87b67b 100644 --- a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CQBoost.py +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CQBoost.py @@ -1,33 +1,22 @@ import scipy import logging -from future.utils import iteritems -from copy import deepcopy import numpy.ma as ma -from collections import defaultdict, OrderedDict -import pandas as pd -import sys -from functools import partial -import numpy as np -from scipy.spatial import distance -from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin +from collections import defaultdict from sklearn.utils.validation import check_is_fitted -from sklearn.preprocessing import LabelEncoder -from sklearn.tree import DecisionTreeClassifier -from sklearn.metrics.pairwise import rbf_kernel, linear_kernel -import numpy as np from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.pipeline import Pipeline -from sklearn.model_selection import RandomizedSearchCV -from sklearn.tree import DecisionTreeClassifier -from scipy.stats import randint, uniform +from sklearn.metrics import accuracy_score import numpy as np +import time +import datetime from ..Monoview.MonoviewUtils import CustomUniform, CustomRandint +from ..Monoview.BoostUtils import StumpsClassifiersGenerator, ConvexProgram, sign, getInterpretBase, BaseBoost - -class ColumnGenerationClassifier(BaseEstimator, ClassifierMixin): +class ColumnGenerationClassifier(BaseEstimator, ClassifierMixin, BaseBoost): def __init__(self, epsilon=1e-06, n_max_iterations=None, estimators_generator=None, dual_constraint_rhs=0, save_iteration_as_hyperparameter_each=None): + super(ColumnGenerationClassifier, self).__init__() self.epsilon = epsilon self.n_max_iterations = n_max_iterations self.estimators_generator = estimators_generator @@ -36,13 +25,12 @@ class ColumnGenerationClassifier(BaseEstimator, ClassifierMixin): def fit(self, X, y): if scipy.sparse.issparse(X): - # logging.info('Converting to dense matrix.') X = np.array(X.todense()) y[y == 0] = -1 if self.estimators_generator is None: - self.estimators_generator = StumpsClassifiersGenerator(n_stumps_per_attribute=10, self_complemented=True) + self.estimators_generator = StumpsClassifiersGenerator(n_stumps_per_attribute=self.n_stumps, self_complemented=True) self.estimators_generator.fit(X, y) self.classification_matrix = self._binary_classification_matrix(X) @@ -50,14 +38,16 @@ class ColumnGenerationClassifier(BaseEstimator, ClassifierMixin): self.infos_per_iteration_ = defaultdict(list) m, n = self.classification_matrix.shape - # self.chosen_columns_ = [np.random.choice(np.arange(n)), np.random.choice(np.arange(n))] self.chosen_columns_ = [] self.n_total_hypotheses_ = n + self.n_total_examples = m y_kernel_matrix = 
np.multiply(y.reshape((len(y), 1)), self.classification_matrix) # Initialization alpha = self._initialize_alphas(m) + self.train_accuracies = [] + self.previous_votes = [] # w = [0.5,0.5] w= None self.collected_weight_vectors_ = {} @@ -68,7 +58,6 @@ class ColumnGenerationClassifier(BaseEstimator, ClassifierMixin): h_values = ma.array(np.squeeze(np.array((alpha).T.dot(y_kernel_matrix).T)), fill_value=-np.inf) h_values[self.chosen_columns_] = ma.masked worst_h_index = ma.argmax(h_values) - # logging.info("Adding voter {} to the columns, value = {}".format(worst_h_index, h_values[worst_h_index])) # Check for optimal solution. We ensure at least one complete iteration is done as the initialization # values might provide a degenerate initial solution. @@ -81,23 +70,20 @@ class ColumnGenerationClassifier(BaseEstimator, ClassifierMixin): # Solve restricted master for new costs. w, alpha = self._restricted_master_problem(y_kernel_matrix[:, self.chosen_columns_], previous_w=w, previous_alpha=alpha) + margins = np.squeeze(np.asarray(np.dot(self.classification_matrix[:, self.chosen_columns_], w))) + signs_array = np.array([int(x) for x in sign(margins)]) + self.train_accuracies.append(accuracy_score(y, signs_array)) - # We collect iteration information for later evaluation. - if self.save_iteration_as_hyperparameter_each is not None: - if (k + 1) % self.save_iteration_as_hyperparameter_each == 0: - self.collected_weight_vectors_[k] = deepcopy(w) - self.collected_dual_constraint_violations_[k] = h_values[worst_h_index] - self.dual_constraint_rhs - + self.nb_opposed_voters = self.check_opposed_voters() self.weights_ = w self.estimators_generator.estimators_ = self.estimators_generator.estimators_[self.chosen_columns_] - self.learner_info_ = {} - self.learner_info_.update(n_nonzero_weights=np.sum(np.asarray(self.weights_) > 1e-12)) - self.learner_info_.update(n_generated_columns=len(self.chosen_columns_)) + y[y == -1] = 0 return self def predict(self, X): + start = time.time() check_is_fitted(self, 'weights_') if scipy.sparse.issparse(X): @@ -109,6 +95,8 @@ class ColumnGenerationClassifier(BaseEstimator, ClassifierMixin): margins = np.squeeze(np.asarray(np.dot(classification_matrix, self.weights_))) signs_array = np.array([int(x) for x in sign(margins)]) signs_array[signs_array == -1] = 0 + end = time.time() + self.predict_time = end-start return signs_array def _binary_classification_matrix(self, X): @@ -127,42 +115,6 @@ class ColumnGenerationClassifier(BaseEstimator, ClassifierMixin): def _initialize_alphas(self, n_examples): raise NotImplementedError("Alpha weights initialization function is not implemented.") - def evaluate_metrics(self, X, y, metrics_list=None, functions_list=None): - if metrics_list is None: - metrics_list = [zero_one_loss, zero_one_loss_per_example] - - if functions_list is None: - functions_list = [] - - # Predict, evaluate metrics. - classification_matrix = self._binary_classification_matrix(X) - predictions = sign(classification_matrix.dot(self.weights_)) - - if self.save_iteration_as_hyperparameter_each is None: - metrics_results = {} - for metric in metrics_list: - metrics_results[metric.__name__] = metric(y, predictions) - - metrics_dataframe = ResultsDataFrame([metrics_results]) - return metrics_dataframe - - # If we collected iteration informations to add a hyperparameter, we add an index with the hyperparameter name - # and return a ResultsDataFrame containing one row per hyperparameter value. 
- metrics_dataframe = ResultsDataFrame() - for t, weights in iteritems(self.collected_weight_vectors_): - predictions = sign(classification_matrix[:, :t + 1].dot(weights)) - metrics_results = {metric.__name__: metric(y, predictions) for metric in metrics_list} - for function in functions_list: - metrics_results[function.__name__] = function(classification_matrix[:, :t + 1], y, weights) - - # We add other collected information. - metrics_results['chosen_columns'] = self.chosen_columns_[t] - metrics_results['dual_constraint_violation'] = self.collected_dual_constraint_violations_[t] - - metrics_dataframe = metrics_dataframe.append(ResultsDataFrame([metrics_results], index=[t])) - - metrics_dataframe.index.name = 'hp__n_iterations' - return metrics_dataframe class CqBoostClassifier(ColumnGenerationClassifier): def __init__(self, mu=0.001, epsilon=1e-08, n_max_iterations=None, estimators_generator=None, save_iteration_as_hyperparameter_each=None): @@ -171,6 +123,7 @@ class CqBoostClassifier(ColumnGenerationClassifier): # TODO: Vérifier la valeur de nu (dual_constraint_rhs) à l'initialisation, mais de toute manière ignorée car # on ne peut pas quitter la boucle principale avec seulement un votant. self.mu = mu + self.train_time = 0 def _restricted_master_problem(self, y_kernel_matrix, previous_w=None, previous_alpha=None): n_examples, n_hypotheses = y_kernel_matrix.shape @@ -295,21 +248,20 @@ class CQBoost(CqBoostClassifier): return interpretString - - - - def canProbas(): return False def fit(DATASET, CLASS_LABELS, randomState, NB_CORES=1, **kwargs): """Used to fit the monoview classifier with the args stored in kwargs""" + start =time.time() classifier = CqBoostClassifier(mu=kwargs['mu'], epsilon=kwargs['epsilon'], n_max_iterations=kwargs["n_max_iterations"],) # random_state=randomState) classifier.fit(DATASET, CLASS_LABELS) + end = time.time() + classifier.train_time = end-start return classifier @@ -365,727 +317,4 @@ def getConfig(config): def getInterpret(classifier, directory): - dotted = False - interpretString = "\t CQBoost permformed classification with weights : \n" - interpretString += np.array2string(classifier.weights_, precision=4, separator=',', suppress_small=True) - interpretString += "\n \t It used {} iterations to converge".format(len(classifier.weights_)) - if len(classifier.weights_) == classifier.n_max_iterations: - interpretString += ", and used all available iterations, " - else: - dotted = True - interpretString += "." - if len(classifier.weights_) == classifier.n_total_hypotheses_: - interpretString += ", and all the voters have been used." - elif not dotted: - interpretString += "." - interpretString += "\n\t Selected voters : \n" - interpretString += str(classifier.chosen_columns_) - interpretString += "\n\t and they voted : \n" - interpretString += np.array2string(classifier.classification_matrix[:, classifier.chosen_columns_], precision=4, separator=',', suppress_small=True) - np.savetxt(directory+"voters.csv", classifier.classification_matrix[:, classifier.chosen_columns_], delimiter=',') - np.savetxt(directory + "weights.csv", classifier.weights_, delimiter=',') - return interpretString - - - - - -def _as_matrix(element): - """ Utility function to convert "anything" to a Numpy matrix. - """ - # If a scalar, return a 1x1 matrix. - if len(np.shape(element)) == 0: - return np.matrix([[element]], dtype=float) - - # If a nd-array vector, return a column matrix. 
- elif len(np.shape(element)) == 1: - matrix = np.matrix(element, dtype=float) - if np.shape(matrix)[1] != 1: - matrix = matrix.T - return matrix - - return np.matrix(element, dtype=float) - - -def _as_column_matrix(array_like): - """ Utility function to convert any array to a column Numpy matrix. - """ - matrix = _as_matrix(array_like) - if 1 not in np.shape(matrix): - raise ValueError("_as_column_vector: input must be a vector") - - if np.shape(matrix)[0] == 1: - matrix = matrix.T - - return matrix - - -def _as_line_matrix(array_like): - """ Utility function to convert any array to a line Numpy matrix. - """ - matrix = _as_matrix(array_like) - if 1 not in np.shape(matrix): - raise ValueError("_as_column_vector: input must be a vector") - - if np.shape(matrix)[1] == 1: - matrix = matrix.T - - return matrix - - -class ConvexProgram(object): - """ - Encapsulates a quadratic program of the following form: - - minimize (1/2)*x'*P*x + q'*x - subject to G*x <= h - A*x = b. - - - or a linear program of the following form: - - minimize c'*x - subject to G*x <= h - A*x = b - """ - def __init__(self): - self._quadratic_func = None - self._linear_func = None - self._inequality_constraints_matrix = None - self._inequality_constraints_values = None - self._equality_constraints_matrix = None - self._equality_constraints_values = None - self._lower_bound_values = None - self._upper_bound_values = None - self._n_variables = None - - @property - def n_variables(self): - return self._n_variables - - @property - def quadratic_func(self): - return self._quadratic_func - - @quadratic_func.setter - def quadratic_func(self, quad_matrix): - quad_matrix = _as_matrix(quad_matrix) - n_lines, n_columns = np.shape(quad_matrix) - assert(n_lines == n_columns) - - if self._linear_func is not None: - assert(np.shape(quad_matrix)[0] == self._n_variables) - else: - self._n_variables = n_lines - - self._quadratic_func = quad_matrix - - @property - def linear_func(self): - return self._linear_func - - @linear_func.setter - def linear_func(self, lin_vector): - if lin_vector is not None: - lin_vector = _as_column_matrix(lin_vector) - - if self._quadratic_func is not None: - assert(np.shape(lin_vector)[0] == self._n_variables) - - else: - self._n_variables = np.shape(lin_vector)[0] - - self._linear_func = lin_vector - - def add_inequality_constraints(self, inequality_matrix, inequality_values): - if inequality_matrix is None: - logging.info("Empty inequality constraint: ignoring!") - return - - self._assert_objective_function_is_set() - - if 1 in np.shape(inequality_matrix) or len(np.shape(inequality_matrix)) == 1: - inequality_matrix = _as_line_matrix(inequality_matrix) - else: - inequality_matrix = _as_matrix(inequality_matrix) - - inequality_values = _as_column_matrix(inequality_values) - assert np.shape(inequality_matrix)[1] == self._n_variables - assert np.shape(inequality_values)[1] == 1 - - if self._inequality_constraints_matrix is None: - self._inequality_constraints_matrix = inequality_matrix - else: - self._inequality_constraints_matrix = np.append(self._inequality_constraints_matrix, - inequality_matrix, axis=0) - - if self._inequality_constraints_values is None: - self._inequality_constraints_values = inequality_values - else: - self._inequality_constraints_values = np.append(self._inequality_constraints_values, - inequality_values, axis=0) - - def add_equality_constraints(self, equality_matrix, equality_values): - if equality_matrix is None: - logging.info("Empty equality constraint: ignoring!") - return - - 
self._assert_objective_function_is_set() - - if 1 in np.shape(equality_matrix) or len(np.shape(equality_matrix)) == 1: - equality_matrix = _as_line_matrix(equality_matrix) - else: - equality_matrix = _as_matrix(equality_matrix) - - equality_values = _as_matrix(equality_values) - assert np.shape(equality_matrix)[1] == self._n_variables - assert np.shape(equality_values)[1] == 1 - - if self._equality_constraints_matrix is None: - self._equality_constraints_matrix = equality_matrix - else: - self._equality_constraints_matrix = np.append(self._equality_constraints_matrix, - equality_matrix, axis=0) - - if self._equality_constraints_values is None: - self._equality_constraints_values = equality_values - else: - self._equality_constraints_values = np.append(self._equality_constraints_values, - equality_values, axis=0) - - def add_lower_bound(self, lower_bound): - if lower_bound is not None: - self._assert_objective_function_is_set() - self._lower_bound_values = np.array([lower_bound] * self._n_variables) - - def add_upper_bound(self, upper_bound): - if upper_bound is not None: - self._assert_objective_function_is_set() - self._upper_bound_values = np.array([upper_bound] * self._n_variables) - - def _convert_bounds_to_inequality_constraints(self): - self._assert_objective_function_is_set() - - if self._lower_bound_values is not None: - c_matrix = [] - for i in range(self._n_variables): - c_line = [0] * self._n_variables - c_line[i] = -1.0 - c_matrix.append(c_line) - - c_vector = _as_column_matrix(self._lower_bound_values) - self._lower_bound_values = None - self.add_inequality_constraints(np.matrix(c_matrix).T, c_vector) - - if self._upper_bound_values is not None: - c_matrix = [] - for i in range(self._n_variables): - c_line = [0] * self._n_variables - c_line[i] = 1.0 - c_matrix.append(c_line) - - c_vector = _as_column_matrix(self._upper_bound_values) - self._upper_bound_values = None - self.add_inequality_constraints(np.matrix(c_matrix).T, c_vector) - - def _convert_to_cvxopt_matrices(self): - from cvxopt import matrix as cvxopt_matrix - - if self._quadratic_func is not None: - self._quadratic_func = cvxopt_matrix(self._quadratic_func) - - if self._linear_func is not None: - self._linear_func = cvxopt_matrix(self._linear_func) - else: - # CVXOPT needs this vector to be set even if it is not used, so we put zeros in it! - self._linear_func = cvxopt_matrix(np.zeros((self._n_variables, 1))) - - if self._inequality_constraints_matrix is not None: - self._inequality_constraints_matrix = cvxopt_matrix(self._inequality_constraints_matrix) - - if self._inequality_constraints_values is not None: - self._inequality_constraints_values = cvxopt_matrix(self._inequality_constraints_values) - - if self._equality_constraints_matrix is not None: - self._equality_constraints_matrix = cvxopt_matrix(self._equality_constraints_matrix) - - if self._equality_constraints_values is not None: - self._equality_constraints_values = cvxopt_matrix(self._equality_constraints_values) - - def _assert_objective_function_is_set(self): - assert self._n_variables is not None - - def solve(self, solver="cvxopt", feastol=1e-7, abstol=1e-7, reltol=1e-6, return_all_information=False): - - # Some solvers are very verbose, and we don't want them to pollute STDOUT or STDERR. 
- original_stdout = sys.stdout - original_stderr = sys.stderr - - ret = None - - # TODO: Repair - # if solver == "cvxopt": - # stdout_logger = logging.getLogger('CVXOPT') - # sl = StreamToLogger(stdout_logger, logging.DEBUG) - # sys.stdout = sl - - # stderr_logger = logging.getLogger('CVXOPT') - # sl = StreamToLogger(stderr_logger, logging.WARNING) - # sys.stderr = sl - - try: - if solver == "cvxopt": - from cvxopt.solvers import qp, lp, options - options['feastol'] = feastol - options['abstol'] = abstol - options['reltol'] = reltol - options['show_progress'] = False - - self._convert_bounds_to_inequality_constraints() - self._convert_to_cvxopt_matrices() - - if self._quadratic_func is not None: - ret = qp(self.quadratic_func, self.linear_func, self._inequality_constraints_matrix, - self._inequality_constraints_values, self._equality_constraints_matrix, - self._equality_constraints_values) - - else: - ret = lp(self.linear_func, - G=self._inequality_constraints_matrix, - h=self._inequality_constraints_values, - A=self._equality_constraints_matrix, - b=self._equality_constraints_values) - - # logging.info("Primal objective value = {}".format(ret['primal objective'])) - # logging.info("Dual objective value = {}".format(ret['dual objective'])) - - if not return_all_information: - ret = np.asarray(np.array(ret['x']).T[0]) - - elif solver == "cplex": - import cplex - p = cplex.Cplex() - p.objective.set_sense(p.objective.sense.minimize) - - # This is ugly. CPLEX wants a list of lists of lists. First dimension represents the lines of the QP - # matrix. Second dimension contains a pair of two elements: the indices of the variables in play (all of - # them...), and the values (columns of the QP matrix). - names = [str(x) for x in range(self._n_variables)] - p.variables.add(names=names) - - if self.quadratic_func is not None: - p_matrix = [] - for line in self._quadratic_func: - p_matrix.append([names, line.tolist()[0]]) - - p.objective.set_quadratic(p_matrix) - - if self.linear_func is not None: - p.objective.set_linear(zip(names, - np.asarray(self.linear_func.T).reshape(self.n_variables,).tolist())) - - if self._inequality_constraints_matrix is not None: - inequality_linear = [] - for line in self._inequality_constraints_matrix: - inequality_linear.append([names, line.tolist()[0]]) - p.linear_constraints.add(lin_expr=inequality_linear, - rhs=np.asarray(self._inequality_constraints_values.T).tolist()[0], - senses="L"*len(self._inequality_constraints_values)) - - if self._equality_constraints_matrix is not None: - equality_linear = [] - for line in self._equality_constraints_matrix: - equality_linear.append([names, line.tolist()[0]]) - p.linear_constraints.add(lin_expr=equality_linear, - rhs=np.asarray(self._equality_constraints_values.T).tolist()[0], - senses="E"*len(self._equality_constraints_values)) - - if self._lower_bound_values is not None: - p.variables.set_lower_bounds(zip(names, self._lower_bound_values)) - - if self._upper_bound_values is not None: - p.variables.set_upper_bounds(zip(names, self._upper_bound_values)) - - p.solve() - - logging.info("Solution status = {} : {}".format(p.solution.get_status(), - p.solution.status[p.solution.get_status()])) - logging.info("Solution value = {}".format(p.solution.get_objective_value())) - - if not return_all_information: - ret = np.array(p.solution.get_values()) - else: - ret = {'primal': np.array(p.solution.get_values()), - 'dual': np.array(p.solution.get_dual_values())} - - elif solver == "pycpx": - # This shows how easy it is to use pycpx. 
However, it is much slower (as it is more versatile!). - - import pycpx - model = pycpx.CPlexModel(verbosity=2) - q = model.new(self.n_variables) - - if self._inequality_constraints_matrix is not None: - model.constrain(self._inequality_constraints_matrix * q <= self._inequality_constraints_values) - if self._equality_constraints_matrix is not None: - model.constrain(self._equality_constraints_matrix * q == self._equality_constraints_values) - if self._lower_bound_values is not None: - model.constrain(q >= self._lower_bound_values) - if self._upper_bound_values is not None: - model.constrain(q <= self._upper_bound_values) - - value = model.minimize(0.5 * q.T * self._quadratic_func * q + self.linear_func.T * q) - - logging.info("Solution value = {}".format(value)) - - if not return_all_information: - ret = np.array(model[q]) - else: - ret = model - - except: - raise - - finally: - sys.stdout = original_stdout - sys.stderr = original_stderr - - return ret - - - - - - -class DecisionStumpClassifier(BaseEstimator, ClassifierMixin): - """Generic Attribute Threshold Binary Classifier - - Attributes - ---------- - attribute_index : int - The attribute to consider for the classification. - threshold : float - The threshold value for classification rule. - direction : int, optional - A multiplicative constant (1 or -1) to choose the "direction" of the stump. Defaults to 1. If -1, the stump - will predict the "negative" class (generally -1 or 0), and if 1, the stump will predict the second class (generally 1). - - """ - def __init__(self, attribute_index, threshold, direction=1): - super(DecisionStumpClassifier, self).__init__() - self.attribute_index = attribute_index - self.threshold = threshold - self.direction = direction - - def fit(self, X, y): - # Only verify that we are in the binary classification setting, with support for transductive learning. - if isinstance(y, np.ma.MaskedArray): - self.classes_ = np.unique(y[np.logical_not(y.mask)]) - else: - self.classes_ = np.unique(y) - - # This label encoder is there for the predict function to be able to return any two classes that were used - # when fitting, for example {-1, 1} or {0, 1}. - self.le_ = LabelEncoder() - self.le_.fit(self.classes_) - self.classes_ = self.le_.classes_ - - assert len(self.classes_) == 2, "DecisionStumpsVoter only supports binary classification" - return self - - def predict(self, X): - """Returns the output of the classifier, on a sample X. - - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - Training vectors, where n_samples is the number of samples and - n_features is the number of features. - - Returns - ------- - predictions : array-like, shape = [n_samples] - Predicted class labels. - - """ - check_is_fitted(self, 'classes_') - return self.le_.inverse_transform(np.argmax(self.predict_proba(X), axis=1)) - - def predict_proba(self, X): - """Compute probabilities of possible outcomes for samples in X. - - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - Training vectors, where n_samples is the number of samples and - n_features is the number of features. - - Returns - ------- - avg : array-like, shape = [n_samples, n_classes] - Weighted average probability for each class per sample. 
- - """ - check_is_fitted(self, 'classes_') - X = np.asarray(X) - probas = np.zeros((X.shape[0], 2)) - positive_class = np.argwhere(X[:, self.attribute_index] > self.threshold) - negative_class = np.setdiff1d(range(X.shape[0]), positive_class) - probas[positive_class, 1] = 1.0 - probas[negative_class, 0] = 1.0 - - if self.direction == -1: - probas = 1 - probas - - return probas - - def reverse_decision(self): - self.direction *= -1 - - -class ClassifiersGenerator(BaseEstimator, TransformerMixin): - """Base class to create a set of voters using training samples, and then transform a set of examples in - the voters' output space. - - Attributes - ---------- - self_complemented : bool, optional - Whether or not a binary complement voter must be generated for each voter. Defaults to False. - voters : ndarray of voter functions - Once fit, contains the voter functions. - - """ - def __init__(self, self_complemented=False): - super(ClassifiersGenerator, self).__init__() - self.self_complemented = self_complemented - - def fit(self, X, y=None): - """Generates the voters using training samples. - - Parameters - ---------- - X : ndarray of shape (n_samples, n_features) - Input data on which to base the voters. - y : ndarray of shape (n_labeled_samples,), optional - Input labels, usually determines the decision polarity of each voter. - - Returns - ------- - self - - """ - raise NotImplementedError - - def transform(self, X): - """Transforms the input points in a matrix of classification, using previously learned voters. - - Parameters - ---------- - X : ndarray of shape (n_samples, n_features) - Input data to classify. - - Returns - ------- - ndarray of shape (n_samples, n_voters) - The voters' decision on each example. - - """ - check_is_fitted(self, 'estimators_') - return np.array([voter.predict(X) for voter in self.estimators_]).T - -class StumpsClassifiersGenerator(ClassifiersGenerator): - """Decision Stump Voters transformer. - - Parameters - ---------- - n_stumps_per_attribute : int, optional - Determines how many decision stumps will be created for each attribute. Defaults to 10. - No stumps will be created for attributes with only one possible value. - self_complemented : bool, optional - Whether or not a binary complement voter must be generated for each voter. Defaults to False. - - """ - def __init__(self, n_stumps_per_attribute=10, self_complemented=False): - super(StumpsClassifiersGenerator, self).__init__(self_complemented) - self.n_stumps_per_attribute = n_stumps_per_attribute - - def fit(self, X, y): - """Fits Decision Stump voters on a training set. - - Parameters - ---------- - X : ndarray of shape (n_samples, n_features) - Input data on which to base the voters. - y : ndarray of shape (n_labeled_samples,), optional - Only used to ensure that we are in the binary classification setting. 
- - Returns - ------- - self - - """ - minimums = np.min(X, axis=0) - maximums = np.max(X, axis=0) - ranges = (maximums - minimums) / (self.n_stumps_per_attribute + 1) - - self.estimators_ = [DecisionStumpClassifier(i, minimums[i] + ranges[i] * stump_number, 1).fit(X, y) - for i in range(X.shape[1]) for stump_number in range(1, self.n_stumps_per_attribute + 1) - if ranges[i] != 0] - - if self.self_complemented: - self.estimators_ += [DecisionStumpClassifier(i, minimums[i] + ranges[i] * stump_number, -1).fit(X, y) - for i in range(X.shape[1]) for stump_number in range(1, self.n_stumps_per_attribute + 1) - if ranges[i] != 0] - - self.estimators_ = np.asarray(self.estimators_) - return self - -def sign(array): - """Computes the elementwise sign of all elements of an array. The sign function returns -1 if x <=0 and 1 if x > 0. - Note that numpy's sign function can return 0, which is not desirable in most cases in Machine Learning algorithms. - - Parameters - ---------- - array : array-like - Input values. - - Returns - ------- - ndarray - An array with the signs of input elements. - - """ - signs = np.sign(array) - - signs[array == 0] = -1 - return signs - - -def zero_one_loss(y_target, y_estimate, confidences=1): - if len(y_target) == 0: - return 0.0 - return np.mean(y_target != y_estimate) - - -def zero_one_loss_per_example(y_target, y_estimate, confidences=1): - if len(y_target) == 0: - return 0.0 - return (y_target != y_estimate).astype(np.int) - - -class ResultsDataFrame(pd.DataFrame): - """A ResultsDataFrame is a DataFrame with the following information: - - - A 'dataset' column that contains the dataset name - - Hyperparamer columns, named 'hp__HPNAME', where HPNAME is the name of the hyperparameter - - Columns containing informations about that depend on the dataset and hyperparameters, for example the risk. - - """ - @property - def datasets_list(self): - """Returns the sorted list of datasets. - - """ - return sorted(set(self['dataset'])) - - @property - def hyperparameters_list(self): - """Returns a sorted list of hyperparameter names, without the 'hp__' prefix. - - """ - return sorted(column.split('hp__')[1] for column in self.columns if column.startswith('hp__')) - - @property - def hyperparameters_list_with_prefix(self): - return sorted(column for column in self.columns if column.startswith('hp__')) - - @property - def metrics_list(self): - return sorted(column for column in self.columns if not column.startswith('hp__') and column != 'dataset') - - @property - def hyperparameters_with_values(self): - """Returns a dictionary that contains the hyperparameter names (without the 'hp__' prefix), and - associated values that are present in the DataFrame. - - """ - hyperparameters = [column for column in self.columns if column.startswith('hp__')] - - hyperparameters_dict = {} - tmp_dict = self[hyperparameters].to_dict() - - for key, value in iteritems(tmp_dict): - hyperparameters_dict[key.split('hp__')[1]] = list(value.values())[0] if len(value) == 1 else sorted(set(value.values())) - - return hyperparameters_dict - - @property - def hyperparameters_with_values_per_dataset(self): - """Returns a dictionary of dictionaries that contains for each dataset, the hyperparameter names (without the - 'hp__' prefix), and associated values that are present in the DataFrame. 
- - """ - hyperparameters = [column for column in self.columns if column.startswith('hp__')] - - hyperparameters_dict = {} - for dataset in self.datasets_list: - tmp_dict = self[self.dataset == dataset][hyperparameters].to_dict() - hyperparameters_dict[dataset] = {} - - for key, value in iteritems(tmp_dict): - hyperparameters_dict[dataset][key.split('hp__')[1]] = list(value.values())[0] if len(value) == 1 else sorted(value.values()) - - return hyperparameters_dict - - def results_optimizing_metric(self, metric_to_optimize='cv_mean__valid__zero_one_loss', minimize=True, tie_breaking_functions_ordered_dict=None): - function = min if minimize else max - - # We extract all the rows that have the best value for the metric to optimize. - optimal_results = self[self.groupby('dataset', sort=False)[metric_to_optimize].transform(function) == self[metric_to_optimize]] - - # We tie the breaks by applying the tie breaking functions (in the order of the dictionary). If hyperparameters are missing, we simply - # use the median for each hyperparameter, in a fixed (reproduceable) order. - if tie_breaking_functions_ordered_dict is None: - tie_breaking_functions_ordered_dict = OrderedDict() - else: - # Avoid side effects and ensures that the dictionary is an OrderedDict before we add missing hyperparameters. - tie_breaking_functions_ordered_dict = OrderedDict(tie_breaking_functions_ordered_dict.copy()) - - for hyperparameter in sorted(self.hyperparameters_list): - if hyperparameter not in tie_breaking_functions_ordered_dict.keys(): - tie_breaking_functions_ordered_dict[hyperparameter] = np.median - - for hyperparameter, tie_breaking_function in iteritems(tie_breaking_functions_ordered_dict): - optimal_results = optimal_results[optimal_results.groupby('dataset')['hp__' + hyperparameter].transform(partial(get_optimal_value_in_list, tie_breaking_function)) == optimal_results['hp__' + hyperparameter]] - - return ResultsDataFrame(optimal_results) - - def get_dataframe_with_metrics_as_one_column(self, metrics_to_keep=None): - new_dataframe = ResultsDataFrame() - - if metrics_to_keep is None: - metrics_to_keep = self.metrics_list - - for metric in metrics_to_keep: - columns = self.hyperparameters_list_with_prefix + [metric] - if 'dataset' in self: - columns.append('dataset') - - tmp = self.loc[:, columns] - tmp.columns = [c if c != metric else 'value' for c in tmp.columns] - tmp.loc[:, 'metric'] = metric - new_dataframe = new_dataframe.append(tmp, ignore_index=True) - - return new_dataframe - - -def get_optimal_value_in_list(optimum_function, values_list): - """Given a list of values and an optimal value, returns the value from the list that is the closest to the optimum, - given by optimum_function applied to the same list. 
- - >>> get_optimal_value_in_list(np.median, [2, 4, 5, 6]) - 4 - - """ - values_list = sorted(list(values_list)) - return values_list[np.argmin(np.array([scipy.spatial.distance.euclidean(value, optimum_function(values_list)) for value in values_list]))] + return getInterpretBase(classifier, directory, "CQBoostv21", classifier.weights_) diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CQBoostv2.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CQBoostv2.py index 839cd7c60ad96730c6a2857513d3f871a7316a68..211bff9cf7b34d571e22fbb7829b2c24fdc1ef92 100644 --- a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CQBoostv2.py +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CQBoostv2.py @@ -1,32 +1,23 @@ import scipy import logging -from future.utils import iteritems from copy import deepcopy import numpy.ma as ma -from collections import defaultdict, OrderedDict -import pandas as pd -import sys -from functools import partial -from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin +from collections import defaultdict from sklearn.utils.validation import check_is_fitted -from sklearn.preprocessing import LabelEncoder from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.pipeline import Pipeline +from sklearn.metrics import accuracy_score import numpy as np - -# import numpy as np -# from scipy.spatial import distance -# from sklearn.tree import DecisionTreeClassifier -# from sklearn.metrics.pairwise import rbf_kernel, linear_kernel -# from sklearn.model_selection import RandomizedSearchCV -# from sklearn.tree import DecisionTreeClassifier -# from scipy.stats import randint - +import time +import datetime from ..Monoview.MonoviewUtils import CustomRandint, CustomUniform +from ..Monoview.BoostUtils import StumpsClassifiersGenerator, ConvexProgram, sign, getInterpretBase, BaseBoost + -class ColumnGenerationClassifierv2(BaseEstimator, ClassifierMixin): +class ColumnGenerationClassifierv2(BaseEstimator, ClassifierMixin, BaseBoost): def __init__(self, epsilon=1e-06, n_max_iterations=None, estimators_generator=None, dual_constraint_rhs=0, save_iteration_as_hyperparameter_each=None): + super(ColumnGenerationClassifierv2, self).__init__() self.epsilon = epsilon self.n_max_iterations = n_max_iterations self.estimators_generator = estimators_generator @@ -39,7 +30,7 @@ class ColumnGenerationClassifierv2(BaseEstimator, ClassifierMixin): X = np.array(X.todense()) if self.estimators_generator is None: - self.estimators_generator = StumpsClassifiersGenerator(n_stumps_per_attribute=10, self_complemented=True) + self.estimators_generator = StumpsClassifiersGenerator(n_stumps_per_attribute=self.n_stumps, self_complemented=True) y[y == 0] = -1 @@ -64,19 +55,18 @@ class ColumnGenerationClassifierv2(BaseEstimator, ClassifierMixin): self.chosen_columns_ = [] self.edge_scores = [] self.alphas = [] + self.train_accuracies = [] + self.previous_votes = [] self.n_total_hypotheses_ = n + self.n_total_examples = m for k in range(min(n, self.n_max_iterations if self.n_max_iterations is not None else np.inf)): # Find worst weak hypothesis given alpha. h_values = ma.array(np.squeeze(np.array((alpha).T.dot(y_kernel_matrix).T)), fill_value=-np.inf) h_values[self.chosen_columns_] = ma.masked worst_h_index = ma.argmax(h_values) - #logging.info("Adding voter {} to the columns, value = {}".format(worst_h_index, h_values[worst_h_index])) - # Check for optimal solution. 
We ensure at least one complete iteration is done as the initialization - # values might provide a degenerate initial solution. - # print(h_values[worst_h_index] - self.dual_constraint_rhs) if h_values[worst_h_index] <= self.dual_constraint_rhs + self.epsilon and len(self.chosen_columns_) > 0: break @@ -100,21 +90,25 @@ class ColumnGenerationClassifierv2(BaseEstimator, ClassifierMixin): # We collect iteration information for later evaluation. self.weights_.append(w[-1]) - if self.save_iteration_as_hyperparameter_each is not None: - if (k + 1) % self.save_iteration_as_hyperparameter_each == 0: - self.collected_weight_vectors_[k] = deepcopy(w) - self.collected_dual_constraint_violations_[k] = h_values[worst_h_index] - self.dual_constraint_rhs - + self.weights = np.array(self.weights_) + self.final_vote_weights = np.array([np.prod(1 - self.weights[t + 1:]) * self.weights_[t] if t < + self.weights.shape[ + 0] - 1 else + self.weights[t] for t in range(self.weights.shape[0])]) + margins = np.squeeze(np.asarray(np.matmul(self.classification_matrix[:,self.chosen_columns_], + self.final_vote_weights))) + signs_array = np.array([int(x) for x in sign(margins)]) + self.train_accuracies.append(accuracy_score(y, signs_array)) + + self.nb_opposed_voters = self.check_opposed_voters() self.estimators_generator.estimators_ = self.estimators_generator.estimators_[self.chosen_columns_] - self.learner_info_ = {} - self.learner_info_.update(n_nonzero_weights=np.sum(np.asarray(self.weights_) > 1e-12)) - self.learner_info_.update(n_generated_columns=len(self.chosen_columns_)) y[y == -1] = 0 return self def predict(self, X): + start = time.time() check_is_fitted(self, 'weights_') if scipy.sparse.issparse(X): @@ -123,65 +117,18 @@ class ColumnGenerationClassifierv2(BaseEstimator, ClassifierMixin): classification_matrix = self._binary_classification_matrix(X) self.weights_ = np.array(self.weights_) - # final_vote_weights = np.array(self.weights_) / np.sum(np.array(self.weights_)) self.final_vote_weights = np.array([np.prod(1-self.weights_[t+1:])*self.weights_[t] if t < self.weights_.shape[0]-1 else self.weights_[t] for t in range(self.weights_.shape[0]) ]) margins = np.squeeze(np.asarray(np.matmul(classification_matrix, self.final_vote_weights))) signs_array = np.array([int(x) for x in sign(margins)]) signs_array[signs_array == -1] = 0 + end = time.time() + self.predict_time = end-start return signs_array - def _binary_classification_matrix(self, X): - probas = self._collect_probas(X) - predicted_labels = np.argmax(probas, axis=2) - predicted_labels[predicted_labels == 0] = -1 - values = np.max(probas, axis=2) - return (predicted_labels * values).T - - def _collect_probas(self, X): - return np.asarray([clf.predict_proba(X) for clf in self.estimators_generator.estimators_]) - def _restricted_master_problem(self, y_kernel_matrix): raise NotImplementedError("Restricted master problem not implemented.") - def _initialize_alphas(self, n_examples): - raise NotImplementedError("Alpha weights initialization function is not implemented.") - - def evaluate_metrics(self, X, y, metrics_list=None, functions_list=None): - if metrics_list is None: - metrics_list = [zero_one_loss, zero_one_loss_per_example] - if functions_list is None: - functions_list = [] - - # Predict, evaluate metrics. 
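(Editor's aside, not part of the patch.) The final_vote_weights expression used in fit and predict above turns the per-iteration weights into a stick-breaking scheme, w'_t = w_t * prod_{s>t}(1 - w_s). A minimal self-contained sketch, assuming only numpy; whenever the first iteration weight is 1 (as in the v21 variant below), the aggregated weights telescope to a convex combination:

import numpy as np

def aggregate_iteration_weights(weights):
    # Stick-breaking aggregation: w'_t = w_t * prod_{s>t}(1 - w_s);
    # the empty product at the last index is 1, which matches the `else` branch above.
    weights = np.asarray(weights, dtype=float)
    return np.array([weights[t] * np.prod(1 - weights[t + 1:])
                     for t in range(len(weights))])

final = aggregate_iteration_weights([1.0, 0.3, 0.25])
print(final)        # [0.525 0.225 0.25 ]
print(final.sum())  # 1.0, because the first weight is 1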
- classification_matrix = self._binary_classification_matrix(X) - predictions = sign(classification_matrix.dot(self.weights_)) - - if self.save_iteration_as_hyperparameter_each is None: - metrics_results = {} - for metric in metrics_list: - metrics_results[metric.__name__] = metric(y, predictions) - - metrics_dataframe = ResultsDataFrame([metrics_results]) - return metrics_dataframe - - # If we collected iteration informations to add a hyperparameter, we add an index with the hyperparameter name - # and return a ResultsDataFrame containing one row per hyperparameter value. - metrics_dataframe = ResultsDataFrame() - for t, weights in iteritems(self.collected_weight_vectors_): - predictions = sign(classification_matrix[:, :t + 1].dot(weights)) - metrics_results = {metric.__name__: metric(y, predictions) for metric in metrics_list} - for function in functions_list: - metrics_results[function.__name__] = function(classification_matrix[:, :t + 1], y, weights) - - # We add other collected information. - metrics_results['chosen_columns'] = self.chosen_columns_[t] - metrics_results['dual_constraint_violation'] = self.collected_dual_constraint_violations_[t] - - metrics_dataframe = metrics_dataframe.append(ResultsDataFrame([metrics_results], index=[t])) - - metrics_dataframe.index.name = 'hp__n_iterations' - return metrics_dataframe class CqBoostClassifierv2(ColumnGenerationClassifierv2): def __init__(self, mu=0.001, epsilon=1e-08, n_max_iterations=None, estimators_generator=None, save_iteration_as_hyperparameter_each=None): @@ -190,6 +137,7 @@ class CqBoostClassifierv2(ColumnGenerationClassifierv2): # TODO: Vérifier la valeur de nu (dual_constraint_rhs) à l'initialisation, mais de toute manière ignorée car # on ne peut pas quitter la boucle principale avec seulement un votant. self.mu = mu + self.train_time = 0 def _restricted_master_problem(self, y_kernel_matrix, previous_w=None, previous_alpha=None): n_examples, n_hypotheses = y_kernel_matrix.shape @@ -320,11 +268,14 @@ def canProbas(): def fit(DATASET, CLASS_LABELS, randomState, NB_CORES=1, **kwargs): """Used to fit the monoview classifier with the args stored in kwargs""" + start = time.time() classifier = CqBoostClassifierv2(mu=kwargs['mu'], epsilon=kwargs['epsilon'], n_max_iterations=kwargs["n_max_iterations"],) # random_state=randomState) classifier.fit(DATASET, CLASS_LABELS) + end = time.time() + classifier.train_time =end-start return classifier @@ -380,726 +331,5 @@ def getConfig(config): def getInterpret(classifier, directory): - interpretString = "\t CQBoost v2 permformed classification with weights : \n" - interpretString += np.array2string(classifier.final_vote_weights, precision=4, separator=',', suppress_small=True) - interpretString += "\n \t It used {} iterations to converge, ".format(len(classifier.final_vote_weights)) - if len(classifier.final_vote_weights) == classifier.n_max_iterations: - interpretString += ", and used all available iterations, " - else: - interpretString += "." - if len(classifier.final_vote_weights) == classifier.n_total_hypotheses_: - interpretString += ", and all the voters have been used." - else: - interpretString += "." 
- interpretString += "\n\t Selected voters : \n" - interpretString += str(classifier.chosen_columns_) - interpretString += "\n\t and they voted : \n" - interpretString += np.array2string(classifier.classification_matrix[:,classifier.chosen_columns_], precision=4, - separator=',', suppress_small=True) - np.savetxt(directory + "voters.csv", classifier.classification_matrix[:,classifier.chosen_columns_], delimiter=',') - np.savetxt(directory + "weights.csv", classifier.final_vote_weights, delimiter=',') - return interpretString - - - - - -def _as_matrix(element): - """ Utility function to convert "anything" to a Numpy matrix. - """ - # If a scalar, return a 1x1 matrix. - if len(np.shape(element)) == 0: - return np.matrix([[element]], dtype=float) - - # If a nd-array vector, return a column matrix. - elif len(np.shape(element)) == 1: - matrix = np.matrix(element, dtype=float) - if np.shape(matrix)[1] != 1: - matrix = matrix.T - return matrix - - return np.matrix(element, dtype=float) - - -def _as_column_matrix(array_like): - """ Utility function to convert any array to a column Numpy matrix. - """ - matrix = _as_matrix(array_like) - if 1 not in np.shape(matrix): - raise ValueError("_as_column_vector: input must be a vector") - - if np.shape(matrix)[0] == 1: - matrix = matrix.T - - return matrix - - -def _as_line_matrix(array_like): - """ Utility function to convert any array to a line Numpy matrix. - """ - matrix = _as_matrix(array_like) - if 1 not in np.shape(matrix): - raise ValueError("_as_column_vector: input must be a vector") - - if np.shape(matrix)[1] == 1: - matrix = matrix.T - - return matrix - - -class ConvexProgram(object): - """ - Encapsulates a quadratic program of the following form: - - minimize (1/2)*x'*P*x + q'*x - subject to G*x <= h - A*x = b. 
- - - or a linear program of the following form: - - minimize c'*x - subject to G*x <= h - A*x = b - """ - def __init__(self): - self._quadratic_func = None - self._linear_func = None - self._inequality_constraints_matrix = None - self._inequality_constraints_values = None - self._equality_constraints_matrix = None - self._equality_constraints_values = None - self._lower_bound_values = None - self._upper_bound_values = None - self._n_variables = None - - @property - def n_variables(self): - return self._n_variables - - @property - def quadratic_func(self): - return self._quadratic_func - - @quadratic_func.setter - def quadratic_func(self, quad_matrix): - quad_matrix = _as_matrix(quad_matrix) - n_lines, n_columns = np.shape(quad_matrix) - assert(n_lines == n_columns) - - if self._linear_func is not None: - assert(np.shape(quad_matrix)[0] == self._n_variables) - else: - self._n_variables = n_lines - - self._quadratic_func = quad_matrix - - @property - def linear_func(self): - return self._linear_func - - @linear_func.setter - def linear_func(self, lin_vector): - if lin_vector is not None: - lin_vector = _as_column_matrix(lin_vector) - - if self._quadratic_func is not None: - assert(np.shape(lin_vector)[0] == self._n_variables) - - else: - self._n_variables = np.shape(lin_vector)[0] - - self._linear_func = lin_vector - - def add_inequality_constraints(self, inequality_matrix, inequality_values): - if inequality_matrix is None: - logging.info("Empty inequality constraint: ignoring!") - return - - self._assert_objective_function_is_set() - - if 1 in np.shape(inequality_matrix) or len(np.shape(inequality_matrix)) == 1: - inequality_matrix = _as_line_matrix(inequality_matrix) - else: - inequality_matrix = _as_matrix(inequality_matrix) - - inequality_values = _as_column_matrix(inequality_values) - assert np.shape(inequality_matrix)[1] == self._n_variables - assert np.shape(inequality_values)[1] == 1 - - if self._inequality_constraints_matrix is None: - self._inequality_constraints_matrix = inequality_matrix - else: - self._inequality_constraints_matrix = np.append(self._inequality_constraints_matrix, - inequality_matrix, axis=0) - - if self._inequality_constraints_values is None: - self._inequality_constraints_values = inequality_values - else: - self._inequality_constraints_values = np.append(self._inequality_constraints_values, - inequality_values, axis=0) - - def add_equality_constraints(self, equality_matrix, equality_values): - if equality_matrix is None: - logging.info("Empty equality constraint: ignoring!") - return - - self._assert_objective_function_is_set() - - if 1 in np.shape(equality_matrix) or len(np.shape(equality_matrix)) == 1: - equality_matrix = _as_line_matrix(equality_matrix) - else: - equality_matrix = _as_matrix(equality_matrix) - - equality_values = _as_matrix(equality_values) - assert np.shape(equality_matrix)[1] == self._n_variables - assert np.shape(equality_values)[1] == 1 - - if self._equality_constraints_matrix is None: - self._equality_constraints_matrix = equality_matrix - else: - self._equality_constraints_matrix = np.append(self._equality_constraints_matrix, - equality_matrix, axis=0) - - if self._equality_constraints_values is None: - self._equality_constraints_values = equality_values - else: - self._equality_constraints_values = np.append(self._equality_constraints_values, - equality_values, axis=0) - - def add_lower_bound(self, lower_bound): - if lower_bound is not None: - self._assert_objective_function_is_set() - self._lower_bound_values = 
np.array([lower_bound] * self._n_variables) - - def add_upper_bound(self, upper_bound): - if upper_bound is not None: - self._assert_objective_function_is_set() - self._upper_bound_values = np.array([upper_bound] * self._n_variables) - - def _convert_bounds_to_inequality_constraints(self): - self._assert_objective_function_is_set() - - if self._lower_bound_values is not None: - c_matrix = [] - for i in range(self._n_variables): - c_line = [0] * self._n_variables - c_line[i] = -1.0 - c_matrix.append(c_line) - - c_vector = _as_column_matrix(self._lower_bound_values) - self._lower_bound_values = None - self.add_inequality_constraints(np.matrix(c_matrix).T, c_vector) - - if self._upper_bound_values is not None: - c_matrix = [] - for i in range(self._n_variables): - c_line = [0] * self._n_variables - c_line[i] = 1.0 - c_matrix.append(c_line) - - c_vector = _as_column_matrix(self._upper_bound_values) - self._upper_bound_values = None - self.add_inequality_constraints(np.matrix(c_matrix).T, c_vector) - - def _convert_to_cvxopt_matrices(self): - from cvxopt import matrix as cvxopt_matrix - - if self._quadratic_func is not None: - self._quadratic_func = cvxopt_matrix(self._quadratic_func) - - if self._linear_func is not None: - self._linear_func = cvxopt_matrix(self._linear_func) - else: - # CVXOPT needs this vector to be set even if it is not used, so we put zeros in it! - self._linear_func = cvxopt_matrix(np.zeros((self._n_variables, 1))) - - if self._inequality_constraints_matrix is not None: - self._inequality_constraints_matrix = cvxopt_matrix(self._inequality_constraints_matrix) - - if self._inequality_constraints_values is not None: - self._inequality_constraints_values = cvxopt_matrix(self._inequality_constraints_values) - - if self._equality_constraints_matrix is not None: - self._equality_constraints_matrix = cvxopt_matrix(self._equality_constraints_matrix) - - if self._equality_constraints_values is not None: - self._equality_constraints_values = cvxopt_matrix(self._equality_constraints_values) - - def _assert_objective_function_is_set(self): - assert self._n_variables is not None - - def solve(self, solver="cvxopt", feastol=1e-7, abstol=1e-7, reltol=1e-6, return_all_information=False): - - # Some solvers are very verbose, and we don't want them to pollute STDOUT or STDERR. 
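(Editor's aside, not part of the patch.) For reference, this is the QP form that ConvexProgram above (now shared via BoostUtils) forwards to CVXOPT: minimize (1/2) x'Px + q'x subject to Gx <= h. A toy run, assuming cvxopt is installed:

import numpy as np
from cvxopt import matrix
from cvxopt.solvers import qp, options

options['show_progress'] = False
P = matrix(2.0 * np.eye(2))              # objective: x^2 + y^2
q = matrix(np.zeros((2, 1)))
G = matrix(np.array([[-1.0, -1.0]]))     # -x - y <= -1, i.e. x + y >= 1
h = matrix(np.array([-1.0]))

solution = qp(P, q, G, h)
print(np.array(solution['x']).ravel())   # ~[0.5 0.5]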
- original_stdout = sys.stdout - original_stderr = sys.stderr - - ret = None - - # TODO: Repair - # if solver == "cvxopt": - # stdout_logger = logging.getLogger('CVXOPT') - # sl = StreamToLogger(stdout_logger, logging.DEBUG) - # sys.stdout = sl - - # stderr_logger = logging.getLogger('CVXOPT') - # sl = StreamToLogger(stderr_logger, logging.WARNING) - # sys.stderr = sl - - try: - if solver == "cvxopt": - from cvxopt.solvers import qp, lp, options - options['feastol'] = feastol - options['abstol'] = abstol - options['reltol'] = reltol - options['show_progress'] = False - - self._convert_bounds_to_inequality_constraints() - self._convert_to_cvxopt_matrices() - - if self._quadratic_func is not None: - ret = qp(self.quadratic_func, self.linear_func, self._inequality_constraints_matrix, - self._inequality_constraints_values, self._equality_constraints_matrix, - self._equality_constraints_values) - - else: - ret = lp(self.linear_func, - G=self._inequality_constraints_matrix, - h=self._inequality_constraints_values, - A=self._equality_constraints_matrix, - b=self._equality_constraints_values) - - #logging.info("Primal objective value = {}".format(ret['primal objective'])) - #logging.info("Dual objective value = {}".format(ret['dual objective'])) - - if not return_all_information: - ret = np.asarray(np.array(ret['x']).T[0]) - - elif solver == "cplex": - import cplex - p = cplex.Cplex() - p.objective.set_sense(p.objective.sense.minimize) - - # This is ugly. CPLEX wants a list of lists of lists. First dimension represents the lines of the QP - # matrix. Second dimension contains a pair of two elements: the indices of the variables in play (all of - # them...), and the values (columns of the QP matrix). - names = [str(x) for x in range(self._n_variables)] - p.variables.add(names=names) - - if self.quadratic_func is not None: - p_matrix = [] - for line in self._quadratic_func: - p_matrix.append([names, line.tolist()[0]]) - - p.objective.set_quadratic(p_matrix) - - if self.linear_func is not None: - p.objective.set_linear(zip(names, - np.asarray(self.linear_func.T).reshape(self.n_variables,).tolist())) - - if self._inequality_constraints_matrix is not None: - inequality_linear = [] - for line in self._inequality_constraints_matrix: - inequality_linear.append([names, line.tolist()[0]]) - p.linear_constraints.add(lin_expr=inequality_linear, - rhs=np.asarray(self._inequality_constraints_values.T).tolist()[0], - senses="L"*len(self._inequality_constraints_values)) - - if self._equality_constraints_matrix is not None: - equality_linear = [] - for line in self._equality_constraints_matrix: - equality_linear.append([names, line.tolist()[0]]) - p.linear_constraints.add(lin_expr=equality_linear, - rhs=np.asarray(self._equality_constraints_values.T).tolist()[0], - senses="E"*len(self._equality_constraints_values)) - - if self._lower_bound_values is not None: - p.variables.set_lower_bounds(zip(names, self._lower_bound_values)) - - if self._upper_bound_values is not None: - p.variables.set_upper_bounds(zip(names, self._upper_bound_values)) - - p.solve() - - logging.info("Solution status = {} : {}".format(p.solution.get_status(), - p.solution.status[p.solution.get_status()])) - logging.info("Solution value = {}".format(p.solution.get_objective_value())) - - if not return_all_information: - ret = np.array(p.solution.get_values()) - else: - ret = {'primal': np.array(p.solution.get_values()), - 'dual': np.array(p.solution.get_dual_values())} - - elif solver == "pycpx": - # This shows how easy it is to use pycpx. 
However, it is much slower (as it is more versatile!). - - import pycpx - model = pycpx.CPlexModel(verbosity=2) - q = model.new(self.n_variables) - - if self._inequality_constraints_matrix is not None: - model.constrain(self._inequality_constraints_matrix * q <= self._inequality_constraints_values) - if self._equality_constraints_matrix is not None: - model.constrain(self._equality_constraints_matrix * q == self._equality_constraints_values) - if self._lower_bound_values is not None: - model.constrain(q >= self._lower_bound_values) - if self._upper_bound_values is not None: - model.constrain(q <= self._upper_bound_values) - - value = model.minimize(0.5 * q.T * self._quadratic_func * q + self.linear_func.T * q) - - logging.info("Solution value = {}".format(value)) - - if not return_all_information: - ret = np.array(model[q]) - else: - ret = model - - except: - raise - - finally: - sys.stdout = original_stdout - sys.stderr = original_stderr - - return ret - - - - - - -class DecisionStumpClassifier(BaseEstimator, ClassifierMixin): - """Generic Attribute Threshold Binary Classifier - - Attributes - ---------- - attribute_index : int - The attribute to consider for the classification. - threshold : float - The threshold value for classification rule. - direction : int, optional - A multiplicative constant (1 or -1) to choose the "direction" of the stump. Defaults to 1. If -1, the stump - will predict the "negative" class (generally -1 or 0), and if 1, the stump will predict the second class (generally 1). - - """ - def __init__(self, attribute_index, threshold, direction=1): - super(DecisionStumpClassifier, self).__init__() - self.attribute_index = attribute_index - self.threshold = threshold - self.direction = direction - - def fit(self, X, y): - # Only verify that we are in the binary classification setting, with support for transductive learning. - if isinstance(y, np.ma.MaskedArray): - self.classes_ = np.unique(y[np.logical_not(y.mask)]) - else: - self.classes_ = np.unique(y) - - # This label encoder is there for the predict function to be able to return any two classes that were used - # when fitting, for example {-1, 1} or {0, 1}. - self.le_ = LabelEncoder() - self.le_.fit(self.classes_) - self.classes_ = self.le_.classes_ - - assert len(self.classes_) == 2, "DecisionStumpsVoter only supports binary classification" - return self - - def predict(self, X): - """Returns the output of the classifier, on a sample X. - - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - Training vectors, where n_samples is the number of samples and - n_features is the number of features. - - Returns - ------- - predictions : array-like, shape = [n_samples] - Predicted class labels. - - """ - check_is_fitted(self, 'classes_') - return self.le_.inverse_transform(np.argmax(self.predict_proba(X), axis=1)) - - def predict_proba(self, X): - """Compute probabilities of possible outcomes for samples in X. - - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - Training vectors, where n_samples is the number of samples and - n_features is the number of features. - - Returns - ------- - avg : array-like, shape = [n_samples, n_classes] - Weighted average probability for each class per sample. 
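(Editor's aside, not part of the patch.) The predict_proba described above reduces to a hard threshold test on a single attribute; a standalone sketch of the same computation on toy data:

import numpy as np

X = np.array([[0.2], [0.7], [1.5]])
attribute_index, threshold = 0, 1.0

probas = np.zeros((X.shape[0], 2))
positive_class = np.argwhere(X[:, attribute_index] > threshold)
negative_class = np.setdiff1d(range(X.shape[0]), positive_class)
probas[positive_class, 1] = 1.0
probas[negative_class, 0] = 1.0
print(probas)  # [[1. 0.] [1. 0.] [0. 1.]] -- only the last sample exceeds the threshold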
- - """ - check_is_fitted(self, 'classes_') - X = np.asarray(X) - probas = np.zeros((X.shape[0], 2)) - positive_class = np.argwhere(X[:, self.attribute_index] > self.threshold) - negative_class = np.setdiff1d(range(X.shape[0]), positive_class) - probas[positive_class, 1] = 1.0 - probas[negative_class, 0] = 1.0 - - if self.direction == -1: - probas = 1 - probas - - return probas - - def reverse_decision(self): - self.direction *= -1 - - -class ClassifiersGenerator(BaseEstimator, TransformerMixin): - """Base class to create a set of voters using training samples, and then transform a set of examples in - the voters' output space. - - Attributes - ---------- - self_complemented : bool, optional - Whether or not a binary complement voter must be generated for each voter. Defaults to False. - voters : ndarray of voter functions - Once fit, contains the voter functions. - - """ - def __init__(self, self_complemented=False): - super(ClassifiersGenerator, self).__init__() - self.self_complemented = self_complemented - - def fit(self, X, y=None): - """Generates the voters using training samples. - - Parameters - ---------- - X : ndarray of shape (n_samples, n_features) - Input data on which to base the voters. - y : ndarray of shape (n_labeled_samples,), optional - Input labels, usually determines the decision polarity of each voter. - - Returns - ------- - self - - """ - raise NotImplementedError - - def transform(self, X): - """Transforms the input points in a matrix of classification, using previously learned voters. - - Parameters - ---------- - X : ndarray of shape (n_samples, n_features) - Input data to classify. - - Returns - ------- - ndarray of shape (n_samples, n_voters) - The voters' decision on each example. - - """ - check_is_fitted(self, 'estimators_') - return np.array([voter.predict(X) for voter in self.estimators_]).T - -class StumpsClassifiersGenerator(ClassifiersGenerator): - """Decision Stump Voters transformer. - - Parameters - ---------- - n_stumps_per_attribute : int, optional - Determines how many decision stumps will be created for each attribute. Defaults to 10. - No stumps will be created for attributes with only one possible value. - self_complemented : bool, optional - Whether or not a binary complement voter must be generated for each voter. Defaults to False. - - """ - def __init__(self, n_stumps_per_attribute=10, self_complemented=False): - super(StumpsClassifiersGenerator, self).__init__(self_complemented) - self.n_stumps_per_attribute = n_stumps_per_attribute - - def fit(self, X, y): - """Fits Decision Stump voters on a training set. - - Parameters - ---------- - X : ndarray of shape (n_samples, n_features) - Input data on which to base the voters. - y : ndarray of shape (n_labeled_samples,), optional - Only used to ensure that we are in the binary classification setting. 
- - Returns - ------- - self - - """ - minimums = np.min(X, axis=0) - maximums = np.max(X, axis=0) - ranges = (maximums - minimums) / (self.n_stumps_per_attribute + 1) - - self.estimators_ = [DecisionStumpClassifier(i, minimums[i] + ranges[i] * stump_number, 1).fit(X, y) - for i in range(X.shape[1]) for stump_number in range(1, self.n_stumps_per_attribute + 1) - if ranges[i] != 0] - - if self.self_complemented: - self.estimators_ += [DecisionStumpClassifier(i, minimums[i] + ranges[i] * stump_number, -1).fit(X, y) - for i in range(X.shape[1]) for stump_number in range(1, self.n_stumps_per_attribute + 1) - if ranges[i] != 0] - - self.estimators_ = np.asarray(self.estimators_) - return self - -def sign(array): - """Computes the elementwise sign of all elements of an array. The sign function returns -1 if x <=0 and 1 if x > 0. - Note that numpy's sign function can return 0, which is not desirable in most cases in Machine Learning algorithms. - - Parameters - ---------- - array : array-like - Input values. - - Returns - ------- - ndarray - An array with the signs of input elements. - - """ - signs = np.sign(array) - - signs[array == 0] = -1 - return signs - - -def zero_one_loss(y_target, y_estimate, confidences=1): - if len(y_target) == 0: - return 0.0 - return np.mean(y_target != y_estimate) - - -def zero_one_loss_per_example(y_target, y_estimate, confidences=1): - if len(y_target) == 0: - return 0.0 - return (y_target != y_estimate).astype(np.int) - - -class ResultsDataFrame(pd.DataFrame): - """A ResultsDataFrame is a DataFrame with the following information: - - - A 'dataset' column that contains the dataset name - - Hyperparamer columns, named 'hp__HPNAME', where HPNAME is the name of the hyperparameter - - Columns containing informations about that depend on the dataset and hyperparameters, for example the risk. - - """ - @property - def datasets_list(self): - """Returns the sorted list of datasets. - - """ - return sorted(set(self['dataset'])) - - @property - def hyperparameters_list(self): - """Returns a sorted list of hyperparameter names, without the 'hp__' prefix. - - """ - return sorted(column.split('hp__')[1] for column in self.columns if column.startswith('hp__')) - - @property - def hyperparameters_list_with_prefix(self): - return sorted(column for column in self.columns if column.startswith('hp__')) - - @property - def metrics_list(self): - return sorted(column for column in self.columns if not column.startswith('hp__') and column != 'dataset') - - @property - def hyperparameters_with_values(self): - """Returns a dictionary that contains the hyperparameter names (without the 'hp__' prefix), and - associated values that are present in the DataFrame. - - """ - hyperparameters = [column for column in self.columns if column.startswith('hp__')] - - hyperparameters_dict = {} - tmp_dict = self[hyperparameters].to_dict() - - for key, value in iteritems(tmp_dict): - hyperparameters_dict[key.split('hp__')[1]] = list(value.values())[0] if len(value) == 1 else sorted(set(value.values())) - - return hyperparameters_dict - - @property - def hyperparameters_with_values_per_dataset(self): - """Returns a dictionary of dictionaries that contains for each dataset, the hyperparameter names (without the - 'hp__' prefix), and associated values that are present in the DataFrame. 
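(Editor's aside, not part of the patch.) A quick check of the threshold grid built by StumpsClassifiersGenerator.fit above: cut points sit at min + k * (max - min) / (n_stumps + 1) for k = 1..n_stumps, so they never touch the attribute's extremes:

import numpy as np

X = np.array([[0.0], [10.0]])            # one attribute spanning [0, 10]
n_stumps_per_attribute = 4
minimums = np.min(X, axis=0)
maximums = np.max(X, axis=0)
ranges = (maximums - minimums) / (n_stumps_per_attribute + 1)
print([minimums[0] + ranges[0] * k
       for k in range(1, n_stumps_per_attribute + 1)])  # [2.0, 4.0, 6.0, 8.0]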
- - """ - hyperparameters = [column for column in self.columns if column.startswith('hp__')] - - hyperparameters_dict = {} - for dataset in self.datasets_list: - tmp_dict = self[self.dataset == dataset][hyperparameters].to_dict() - hyperparameters_dict[dataset] = {} - - for key, value in iteritems(tmp_dict): - hyperparameters_dict[dataset][key.split('hp__')[1]] = list(value.values())[0] if len(value) == 1 else sorted(value.values()) - - return hyperparameters_dict - - def results_optimizing_metric(self, metric_to_optimize='cv_mean__valid__zero_one_loss', minimize=True, tie_breaking_functions_ordered_dict=None): - function = min if minimize else max - - # We extract all the rows that have the best value for the metric to optimize. - optimal_results = self[self.groupby('dataset', sort=False)[metric_to_optimize].transform(function) == self[metric_to_optimize]] - - # We tie the breaks by applying the tie breaking functions (in the order of the dictionary). If hyperparameters are missing, we simply - # use the median for each hyperparameter, in a fixed (reproduceable) order. - if tie_breaking_functions_ordered_dict is None: - tie_breaking_functions_ordered_dict = OrderedDict() - else: - # Avoid side effects and ensures that the dictionary is an OrderedDict before we add missing hyperparameters. - tie_breaking_functions_ordered_dict = OrderedDict(tie_breaking_functions_ordered_dict.copy()) - - for hyperparameter in sorted(self.hyperparameters_list): - if hyperparameter not in tie_breaking_functions_ordered_dict.keys(): - tie_breaking_functions_ordered_dict[hyperparameter] = np.median - - for hyperparameter, tie_breaking_function in iteritems(tie_breaking_functions_ordered_dict): - optimal_results = optimal_results[optimal_results.groupby('dataset')['hp__' + hyperparameter].transform(partial(get_optimal_value_in_list, tie_breaking_function)) == optimal_results['hp__' + hyperparameter]] - - return ResultsDataFrame(optimal_results) - - def get_dataframe_with_metrics_as_one_column(self, metrics_to_keep=None): - new_dataframe = ResultsDataFrame() - - if metrics_to_keep is None: - metrics_to_keep = self.metrics_list - - for metric in metrics_to_keep: - columns = self.hyperparameters_list_with_prefix + [metric] - if 'dataset' in self: - columns.append('dataset') - - tmp = self.loc[:, columns] - tmp.columns = [c if c != metric else 'value' for c in tmp.columns] - tmp.loc[:, 'metric'] = metric - new_dataframe = new_dataframe.append(tmp, ignore_index=True) - - return new_dataframe - - -def get_optimal_value_in_list(optimum_function, values_list): - """Given a list of values and an optimal value, returns the value from the list that is the closest to the optimum, - given by optimum_function applied to the same list. 
- - >>> get_optimal_value_in_list(np.median, [2, 4, 5, 6]) - 4 + return getInterpretBase(classifier, directory, "CQBoostv2", classifier.final_vote_weights) - """ - values_list = sorted(list(values_list)) - return values_list[np.argmin(np.array([scipy.spatial.distance.euclidean(value, optimum_function(values_list)) for value in values_list]))] diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CQBoostv21.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CQBoostv21.py index a6f9ab442e3a1f809acf091c6e3f3bcf9340281e..b6005e4be4081c1433fcfbab9d40be905bb7de3a 100644 --- a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CQBoostv21.py +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CQBoostv21.py @@ -1,35 +1,28 @@ import scipy import logging -from future.utils import iteritems -from copy import deepcopy -import numpy.ma as ma -from collections import defaultdict, OrderedDict -import pandas as pd -import sys -from functools import partial import numpy as np -from scipy.spatial import distance -from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin +import numpy.ma as ma +from collections import defaultdict from sklearn.utils.validation import check_is_fitted -from sklearn.preprocessing import LabelEncoder -from sklearn.tree import DecisionTreeClassifier -from sklearn.metrics.pairwise import rbf_kernel, linear_kernel -import numpy as np from sklearn.base import BaseEstimator, ClassifierMixin from sklearn.pipeline import Pipeline -from sklearn.model_selection import RandomizedSearchCV -from sklearn.tree import DecisionTreeClassifier -from scipy.stats import randint -import numpy as np +from sklearn.metrics import accuracy_score +import time +import datetime + +from ..Monoview.MonoviewUtils import CustomUniform, CustomRandint +from ..Monoview.BoostUtils import StumpsClassifiersGenerator, sign, getInterpretBase, BaseBoost -class ColumnGenerationClassifierv21(BaseEstimator, ClassifierMixin): - def __init__(self, epsilon=1e-06, n_max_iterations=None, estimators_generator=None, dual_constraint_rhs=0, save_iteration_as_hyperparameter_each=None): +class ColumnGenerationClassifierv21(BaseEstimator, ClassifierMixin, BaseBoost): + def __init__(self, epsilon=1e-06, n_max_iterations=None, estimators_generator=None, dual_constraint_rhs=0, save_iteration_as_hyperparameter_each=None, random_state=42): + super(ColumnGenerationClassifierv21, self).__init__() self.epsilon = epsilon self.n_max_iterations = n_max_iterations self.estimators_generator = estimators_generator self.dual_constraint_rhs = dual_constraint_rhs self.save_iteration_as_hyperparameter_each = save_iteration_as_hyperparameter_each + self.random_state = random_state def fit(self, X, y): if scipy.sparse.issparse(X): @@ -37,201 +30,214 @@ class ColumnGenerationClassifierv21(BaseEstimator, ClassifierMixin): X = np.array(X.todense()) if self.estimators_generator is None: - self.estimators_generator = StumpsClassifiersGenerator(n_stumps_per_attribute=10, self_complemented=True) + self.estimators_generator = StumpsClassifiersGenerator(n_stumps_per_attribute=self.n_stumps, self_complemented=True) + + y[y == 0] = -1 self.estimators_generator.fit(X, y) - classification_matrix = self._binary_classification_matrix(X) + self.classification_matrix = self._binary_classification_matrix(X) + - self.chosen_columns_ = [] self.weights_ = [] self.infos_per_iteration_ = defaultdict(list) - m, n = classification_matrix.shape - self.matrix_to_optimize = 1e-08*np.ones((m,1), 
dtype=float)
-        self.n_total_hypotheses_ = n
+        m, n = self.classification_matrix.shape
+        y_kernel_matrix = np.multiply(y.reshape((len(y), 1)), self.classification_matrix)

-        y_kernel_matrix = np.multiply(y.reshape((len(y), 1)), classification_matrix)
+        # Initialization

-        # Initialization
-        alpha = self._initialize_alphas(m)
-        w = np.array([1.0])
+        w = None

         self.collected_weight_vectors_ = {}
         self.collected_dual_constraint_violations_ = {}

-        for k in range(min(n, self.n_max_iterations if self.n_max_iterations is not None else np.inf)):
+        example_weights = self._initialize_alphas(m).reshape((m, 1))
+
+        self.chosen_columns_ = []
+        self.fobidden_columns = []
+        self.edge_scores = []
+        self.example_weights_ = [example_weights]
+        self.train_accuracies = []
+        self.previous_votes = []
+        self.n_total_hypotheses_ = n
+        self.n_total_examples = m
+        for k in range(min(n, self.n_max_iterations if self.n_max_iterations is not None else np.inf)):
             # Find worst weak hypothesis given alpha.
-            h_values = ma.array(np.squeeze(np.array(alpha.T.dot(y_kernel_matrix).T)), fill_value=-np.inf)
-            h_values[self.chosen_columns_] = ma.masked
-            worst_h_index = ma.argmax(h_values)
-            #logging.info("Adding voter {} to the columns, value = {}".format(worst_h_index, h_values[worst_h_index]))
-
-            # Check for optimal solution. We ensure at least one complete iteration is done as the initialization
-            # values might provide a degenerate initial solution.
-            if h_values[worst_h_index] <= self.dual_constraint_rhs + self.epsilon and len(self.chosen_columns_) > 0:
-                break
+            new_voter_index, criterion = self._find_new_voter(example_weights, y_kernel_matrix, "pseudo_h")

             # Append the weak hypothesis.
-            self.chosen_columns_.append(worst_h_index)
-
-            self.matrix_to_optimize = np.concatenate((np.matmul(self.matrix_to_optimize, w).reshape((m, 1)),
-                                                      y_kernel_matrix[:, self.chosen_columns_[-1]].reshape((m, 1))),
-                                                     axis=1)
-
-            # Solve restricted master for new costs.
-            w, alpha = self._restricted_master_problem(self.matrix_to_optimize, previous_w=w, previous_alpha=alpha)
-
+            self.chosen_columns_.append(new_voter_index)
+            self.fobidden_columns.append(new_voter_index)
+            new_voter_margin = y_kernel_matrix[:, self.chosen_columns_[-1]].reshape((m, 1))
+            self.edge_scores.append(criterion)
+
+            if w is None:
+                # First iteration: the single chosen voter gets all the weight.
+                self.previous_vote = new_voter_margin
+                w = 1
+                self.weights_.append(w)
+                example_weights = self._update_example_weights(example_weights, y_kernel_matrix, m)
+                self.example_weights_.append(example_weights)
+                self.train_accuracies.append(accuracy_score(y, np.sign(self.previous_vote)))
+                continue
+
+            # ---- Solve the two-voter problem (previous vote vs. new voter) analytically.
+            w = self._solve_two_weights_min_c(new_voter_margin, example_weights)
+            if w[0] == "break":
+                self.chosen_columns_.pop()
+                self.break_cause = w[1]
+                break
+            self.previous_vote = np.matmul(np.concatenate((self.previous_vote, new_voter_margin), axis=1),
+                                           w).reshape((m, 1))
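(Editor's aside, not part of the patch.) The _find_new_voter call above scores every unused voter by its weighted edge sum_i d_i * y_i * h_j(x_i) and masks the already-chosen columns; a minimal illustration with two candidate voters:

import numpy as np
import numpy.ma as ma

d = np.array([[0.25], [0.25], [0.25], [0.25]])  # uniform example weights
y = np.array([[1], [1], [-1], [-1]])
H = np.array([[ 1,  1],
              [ 1, -1],
              [-1,  1],
              [ 1, -1]])                         # one column per voter
y_kernel_matrix = y * H                          # entry (i, j) is y_i * h_j(x_i)
edges = ma.array(np.squeeze(d.T.dot(y_kernel_matrix)), fill_value=-np.inf)
edges[[0]] = ma.masked                           # suppose voter 0 was already chosen
print(ma.argmax(edges))                          # -> 1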
             # We collect iteration information for later evaluation.
-        self.weights_.append(w[1])
-
-        if self.save_iteration_as_hyperparameter_each is not None:
-            if (k + 1) % self.save_iteration_as_hyperparameter_each == 0:
-                self.collected_weight_vectors_[k] = deepcopy(w)
-                self.collected_dual_constraint_violations_[k] = h_values[worst_h_index] - self.dual_constraint_rhs
-
-
+            self.weights_.append(w[-1])
+
+            self.weights = np.array(self.weights_)
+            self.final_vote_weights = np.array([np.prod(1 - self.weights[t + 1:]) * self.weights[t]
+                                                if t < self.weights.shape[0] - 1 else self.weights[t]
+                                                for t in range(self.weights.shape[0])])
+            margins = np.squeeze(np.asarray(np.matmul(self.classification_matrix[:, self.chosen_columns_],
+                                                      self.final_vote_weights)))
+            signs_array = np.array([int(x) for x in sign(margins)])
+            self.train_accuracies.append(accuracy_score(y, signs_array))
+
+            # ---- Update the edge: reweight the examples.
+            example_weights = self._update_example_weights(example_weights, y_kernel_matrix, m)
+            self.example_weights_.append(example_weights)
+
+        self.nb_opposed_voters = self.check_opposed_voters()
         self.estimators_generator.estimators_ = self.estimators_generator.estimators_[self.chosen_columns_]
-        self.learner_info_ = {}
-        self.learner_info_.update(n_nonzero_weights=np.sum(np.asarray(self.weights_) > 1e-12))
-        self.learner_info_.update(n_generated_columns=len(self.chosen_columns_))
+        y[y == -1] = 0

         return self

     def predict(self, X):
+        start = time.time()
         check_is_fitted(self, 'weights_')

         if scipy.sparse.issparse(X):
             logging.warning('Converting sparse matrix to dense matrix.')
             X = np.array(X.todense())
-
         classification_matrix = self._binary_classification_matrix(X)
         self.weights_ = np.array(self.weights_)
-        final_vote_weights = np.array(self.weights_) / np.sum(np.array(self.weights_))
-        # final_vote_weights = np.array([np.prod(1-self.weights_[t+1:])*self.weights_[t] if t < self.weights_.shape[0]-1 else self.weights_[t] for t in range(self.weights_.shape[0]) ])
-        margins = np.squeeze(np.asarray(np.matmul(classification_matrix, final_vote_weights)))
+        self.final_vote_weights = np.array([np.prod(1 - self.weights_[t + 1:]) * self.weights_[t]
+                                            if t < self.weights_.shape[0] - 1 else self.weights_[t]
+                                            for t in range(self.weights_.shape[0])])
+        margins = np.squeeze(np.asarray(np.matmul(classification_matrix, self.final_vote_weights)))
         signs_array = np.array([int(x) for x in sign(margins)])
         signs_array[signs_array == -1 ] = 0
+        end = time.time()
+        self.predict_time = end - start
         return signs_array

-    def _binary_classification_matrix(self, X):
-        probas = self._collect_probas(X)
-        predicted_labels = np.argmax(probas, axis=2)
-        predicted_labels[predicted_labels == 0] = -1
-        values = np.max(probas, axis=2)
-        return (predicted_labels * values).T
-
-    def _collect_probas(self, X):
-        return np.asarray([clf.predict_proba(X) for clf in self.estimators_generator.estimators_])
-
-    def _restricted_master_problem(self, y_kernel_matrix):
-        raise NotImplementedError("Restricted master problem not implemented.")
-
-    def _initialize_alphas(self, n_examples):
-        raise NotImplementedError("Alpha weights initialization function is not implemented.")
+    def _find_new_voter(self, example_weights, y_kernel_matrix, type="pseudo_h"):
+        if type == "pseudo_h":
+            # Weighted edge of each candidate voter; already-chosen voters are masked out.
+            pseudo_h_values = ma.array(np.squeeze(np.array(example_weights.T.dot(y_kernel_matrix).T)), fill_value=-np.inf)
+            pseudo_h_values[self.fobidden_columns] = ma.masked
+            worst_h_index = ma.argmax(pseudo_h_values)
+            return worst_h_index, pseudo_h_values[worst_h_index]
+        elif type == "random":
+            new_index = self.random_state.choice(np.arange(self.n_total_hypotheses_))
+            while new_index in self.fobidden_columns:
+                new_index = self.random_state.choice(np.arange(self.n_total_hypotheses_))
+            return new_index, 100  # placeholder criterion value for the random strategy
+
+    def _update_example_weights(self, example_weights, y_kernel_matrix, m):
+        if len(self.weights_) == 1:
+            example_weights[self.previous_vote == -1] *= 2
+            example_weights[self.previous_vote == 1] /= 2
+        else:
+            weights = np.array(self.weights_)
+            current_vote_weights = np.array([np.prod(1 - weights[t + 1:]) * weights[t]
+                                             if t < weights.shape[0] - 1 else weights[t]
+                                             for t in range(weights.shape[0])]).reshape((weights.shape[0], 1))
+            weighted_margin = np.matmul(y_kernel_matrix[:, self.chosen_columns_], current_vote_weights)
+            example_weights = np.multiply(example_weights,
+                                          np.exp((1 - np.sum(weighted_margin, axis=1) /
+                                                  np.sum(weighted_margin, axis=1))).reshape((m, 1)))
+        return example_weights
+
+    def _solve_two_weights_min_c(self, next_column, example_weights):
+        # Minimize the C-bound of the two-voter majority vote (previous vote, new voter):
+        # the derivative in the new voter's weight vanishes on the roots of a quadratic.
+        m = next_column.shape[0]
+        zero_diag = np.ones((m, m)) - np.identity(m)
+
+        weighted_previous_vote = self.previous_vote.reshape((m, 1))
+        weighted_next_column = next_column.reshape((m, 1))
+
+        mat_prev = np.repeat(weighted_previous_vote, m, axis=1) * zero_diag
+        mat_next = np.repeat(weighted_next_column, m, axis=1) * zero_diag
+
+        self.B2 = np.sum((weighted_previous_vote - weighted_next_column) ** 2)
+        self.B1 = np.sum(2 * weighted_next_column * (weighted_previous_vote - 2 * weighted_next_column * weighted_next_column))
+        self.B0 = np.sum(weighted_next_column * weighted_next_column)
+
+        self.A2 = self.B2 + np.sum((mat_prev - mat_next) * np.transpose(mat_prev - mat_next))
+        self.A1 = self.B1 + np.sum(mat_prev * np.transpose(mat_next) - mat_next * np.transpose(mat_prev) - 2 * mat_next * np.transpose(mat_next))
+        self.A0 = self.B0 + np.sum(mat_next * np.transpose(mat_next))
+
+        C2 = (self.A1 * self.B2 - self.A2 * self.B1)
+        C1 = 2 * (self.A0 * self.B2 - self.A2 * self.B0)
+        C0 = self.A0 * self.B1 - self.A1 * self.B0
+
+        if C2 == 0:
+            if C1 == 0:
+                # The derivative is constant: no unique optimum, split the weight evenly.
+                return np.array([0.5, 0.5])
+            else:
+                # The derivative is affine: put all the weight on the new voter.
+                return np.array([0., 1.])

-    def evaluate_metrics(self, X, y, metrics_list=None, functions_list=None):
-        if metrics_list is None:
-            metrics_list = [zero_one_loss, zero_one_loss_per_example]
+        sols = np.roots(np.array([C2, C1, C0]))

-        if functions_list is None:
-            functions_list = []
+        is_acceptable, sol = self._analyze_solutions(sols)
+        if is_acceptable:
+            return np.array([sol, 1 - sol])
+        else:
+            return ["break", sol]

-        # Predict, evaluate metrics.
-        classification_matrix = self._binary_classification_matrix(X)
-        predictions = sign(classification_matrix.dot(self.weights_))
+    def _analyze_solutions(self, sols):
+        if sols.shape[0] == 1:
+            if self._cborn(sols[0]) < self._cborn(sols[0] + 1):
+                best_sol = sols[0]
+            else:
+                return False, " the only solution was a maximum."
+        elif sols.shape[0] == 2:
+            best_sol = self._best_sol(sols)
+        else:
+            return False, " no solution was found."

-        if self.save_iteration_as_hyperparameter_each is None:
-            metrics_results = {}
-            for metric in metrics_list:
-                metrics_results[metric.__name__] = metric(y, predictions)
+        if 0 < best_sol < 1:
+            return True, self._best_sol(sols)

-            metrics_dataframe = ResultsDataFrame([metrics_results])
-            return metrics_dataframe
+        elif best_sol <= 0:
+            return False, " the minimum was below 0."
+        else:
+            return False, " the minimum was over 1."
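(Editor's aside, not part of the patch.) In _solve_two_weights_min_c above, the stationary points of the ratio f(a) = 1 - (A2 a^2 + A1 a + A0)/(B2 a^2 + B1 a + B0) are exactly the roots of C2 a^2 + C1 a + C0 with the coefficients defined there; a standalone check on hypothetical A and B values (the fit code then keeps a root only if it lies in (0, 1)):

import numpy as np

A2, A1, A0 = 1.0, -1.0, 1.0          # hypothetical numerator coefficients
B2, B1, B0 = 1.0, 0.0, 1.0           # hypothetical denominator coefficients

C2 = A1 * B2 - A2 * B1
C1 = 2 * (A0 * B2 - A2 * B0)
C0 = A0 * B1 - A1 * B0

f = lambda a: 1 - (A2 * a ** 2 + A1 * a + A0) / (B2 * a ** 2 + B1 * a + B0)
roots = np.roots([C2, C1, C0])       # stationary points of f, here +1 and -1
print(roots[np.argmin(f(roots))])    # the one minimizing f: -1.0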
-        # If we collected iteration informations to add a hyperparameter, we add an index with the hyperparameter name
-        # and return a ResultsDataFrame containing one row per hyperparameter value.
-        metrics_dataframe = ResultsDataFrame()
-        for t, weights in iteritems(self.collected_weight_vectors_):
-            predictions = sign(classification_matrix[:, :t + 1].dot(weights))
-            metrics_results = {metric.__name__: metric(y, predictions) for metric in metrics_list}
-            for function in functions_list:
-                metrics_results[function.__name__] = function(classification_matrix[:, :t + 1], y, weights)
+    def _cborn(self, sol):
+        return 1 - (self.A2 * sol ** 2 + self.A1 * sol + self.A0) / (self.B2 * sol ** 2 + self.B1 * sol + self.B0)

-            # We add other collected information.
-            metrics_results['chosen_columns'] = self.chosen_columns_[t]
-            metrics_results['dual_constraint_violation'] = self.collected_dual_constraint_violations_[t]
+    def _best_sol(self, sols):
+        values = np.array([self._cborn(sol) for sol in sols])
+        return sols[np.argmin(values)]

-            metrics_dataframe = metrics_dataframe.append(ResultsDataFrame([metrics_results], index=[t]))
+    def _restricted_master_problem(self, y_kernel_matrix):
+        raise NotImplementedError("Restricted master problem not implemented.")

-            metrics_dataframe.index.name = 'hp__n_iterations'
-        return metrics_dataframe

 class CqBoostClassifierv21(ColumnGenerationClassifierv21):
-    def __init__(self, mu=0.001, epsilon=1e-08, n_max_iterations=None, estimators_generator=None, save_iteration_as_hyperparameter_each=None):
+    def __init__(self, mu=0.001, epsilon=1e-08, n_max_iterations=None, estimators_generator=None, save_iteration_as_hyperparameter_each=None, random_state=42):
         super(CqBoostClassifierv21, self).__init__(epsilon, n_max_iterations, estimators_generator, dual_constraint_rhs=0,
-                                                save_iteration_as_hyperparameter_each=save_iteration_as_hyperparameter_each)
-        # TODO: Vérifier la valeur de nu (dual_constraint_rhs) à l'initialisation, mais de toute manière ignorée car
-        # on ne peut pas quitter la boucle principale avec seulement un votant.
+ save_iteration_as_hyperparameter_each=save_iteration_as_hyperparameter_each, random_state=random_state) + self.train_time = 0 self.mu = mu - def _restricted_master_problem(self, y_kernel_matrix, previous_w=None, previous_alpha=None): - n_examples, n_hypotheses = y_kernel_matrix.shape - - m_eye = np.eye(n_examples) - m_ones = np.ones((n_examples, 1)) - - qp_a = np.vstack((np.hstack((-y_kernel_matrix, m_eye)), - np.hstack((np.ones((1, n_hypotheses)), np.zeros((1, n_examples)))))) - - qp_b = np.vstack((np.zeros((n_examples, 1)), - np.array([1.0]).reshape((1, 1)))) - - qp_g = np.vstack((np.hstack((-np.eye(n_hypotheses), np.zeros((n_hypotheses, n_examples)))), - np.hstack((np.zeros((1, n_hypotheses)), - 1.0 / n_examples * m_ones.T)))) - - qp_h = np.vstack((np.zeros((n_hypotheses, 1)), - np.array([-self.mu]).reshape((1, 1)))) - - qp = ConvexProgram() - qp.quadratic_func = 2.0 / n_examples * np.vstack((np.hstack((np.zeros((n_hypotheses, n_hypotheses)), np.zeros((n_hypotheses, n_examples)))), - np.hstack((np.zeros((n_examples, n_hypotheses)), m_eye)))) - - qp.add_equality_constraints(qp_a, qp_b) - qp.add_inequality_constraints(qp_g, qp_h) - - if previous_w is not None: - qp.initial_values = np.append(previous_w, [0]) - - try: - solver_result = qp.solve(abstol=1e-10, reltol=1e-10, feastol=1e-10, return_all_information=True) - w = np.asarray(np.array(solver_result['x']).T[0])[:n_hypotheses] - - # The alphas are the Lagrange multipliers associated with the equality constraints (returned as the y vector in CVXOPT). - dual_variables = np.asarray(np.array(solver_result['y']).T[0]) - alpha = dual_variables[:n_examples] - - # Set the dual constraint right-hand side to be equal to the last lagrange multiplier (nu). - # Hack: do not change nu if the QP didn't fully solve... 
- if solver_result['dual slack'] <= 1e-8: - self.dual_constraint_rhs = dual_variables[-1] - # logging.info('Updating dual constraint rhs: {}'.format(self.dual_constraint_rhs)) - - except: - logging.warning('QP Solving failed at iteration {}.'.format(n_hypotheses)) - if previous_w is not None: - w = np.append(previous_w, [0]) - else: - w = np.array([1.0 / n_hypotheses] * n_hypotheses) - - if previous_alpha is not None: - alpha = previous_alpha - else: - alpha = self._initialize_alphas(n_examples) - - return w, alpha - def _initialize_alphas(self, n_examples): return 1.0 / n_examples * np.ones((n_examples,)) @@ -243,7 +249,7 @@ class CQBoostv21(CqBoostClassifierv21): mu=kwargs['mu'], epsilon=kwargs['epsilon'], n_max_iterations= kwargs['n_max_iterations'], - ) + random_state = random_state) def canProbas(self): """Used to know if the classifier can return label probabilities""" @@ -305,11 +311,14 @@ def canProbas(): def fit(DATASET, CLASS_LABELS, randomState, NB_CORES=1, **kwargs): """Used to fit the monoview classifier with the args stored in kwargs""" + start = time.time() classifier = CqBoostClassifierv21(mu=kwargs['mu'], epsilon=kwargs['epsilon'], - n_max_iterations=kwargs["n_max_iterations"], ) - # random_state=randomState) + n_max_iterations=kwargs["n_max_iterations"], + random_state=randomState) classifier.fit(DATASET, CLASS_LABELS) + end = time.time() + classifier.train_time = end-start return classifier @@ -317,8 +326,8 @@ def paramsToSet(nIter, randomState): """Used for weighted linear early fusion to generate random search sets""" paramsSet = [] for _ in range(nIter): - paramsSet.append({"mu": randomState.choice([0.001, 0.002]), - "epsilon": randomState.choice([1e-08, 2e-08]), + paramsSet.append({"mu": randomState.uniform(1e-02, 10**(-0.5)), + "epsilon": 10**-randomState.randint(1, 15), "n_max_iterations": None}) return paramsSet @@ -337,9 +346,10 @@ def genPipeline(): def genParamsDict(randomState): - return {"classifier__mu": [0.001, 0.002], - "classifier__epsilon": [1e-08, 2e-08], - "classifier__n_max_iterations": [None]} + return {"classifier__mu": CustomUniform(loc=.5, state=2, multiplier='e-'), + "classifier__epsilon": CustomRandint(low=1, high=15, multiplier='e-'), + "classifier__n_max_iterations": [None], + "classifier__random_state":[randomState]} def genBestParams(detector): @@ -350,7 +360,7 @@ def genBestParams(detector): def genParamsFromDetector(detector): nIter = len(detector.cv_results_['param_classifier__mu']) - return [("mu", np.array([0.001 for _ in range(nIter)])), + return [("mu", np.array(detector.cv_results_['param_classifier__mu'])), ("epsilon", np.array(detector.cv_results_['param_classifier__epsilon'])), ("n_max_iterations", np.array(detector.cv_results_['param_classifier__n_max_iterations']))] @@ -365,709 +375,6 @@ def getConfig(config): def getInterpret(classifier, directory): - interpretString = "" - return interpretString - - - - - -def _as_matrix(element): - """ Utility function to convert "anything" to a Numpy matrix. - """ - # If a scalar, return a 1x1 matrix. - if len(np.shape(element)) == 0: - return np.matrix([[element]], dtype=float) - - # If a nd-array vector, return a column matrix. - elif len(np.shape(element)) == 1: - matrix = np.matrix(element, dtype=float) - if np.shape(matrix)[1] != 1: - matrix = matrix.T - return matrix - - return np.matrix(element, dtype=float) - - -def _as_column_matrix(array_like): - """ Utility function to convert any array to a column Numpy matrix. 
- """ - matrix = _as_matrix(array_like) - if 1 not in np.shape(matrix): - raise ValueError("_as_column_vector: input must be a vector") - - if np.shape(matrix)[0] == 1: - matrix = matrix.T - - return matrix - - -def _as_line_matrix(array_like): - """ Utility function to convert any array to a line Numpy matrix. - """ - matrix = _as_matrix(array_like) - if 1 not in np.shape(matrix): - raise ValueError("_as_column_vector: input must be a vector") - - if np.shape(matrix)[1] == 1: - matrix = matrix.T - - return matrix - - -class ConvexProgram(object): - """ - Encapsulates a quadratic program of the following form: - - minimize (1/2)*x'*P*x + q'*x - subject to G*x <= h - A*x = b. - - - or a linear program of the following form: - - minimize c'*x - subject to G*x <= h - A*x = b - """ - def __init__(self): - self._quadratic_func = None - self._linear_func = None - self._inequality_constraints_matrix = None - self._inequality_constraints_values = None - self._equality_constraints_matrix = None - self._equality_constraints_values = None - self._lower_bound_values = None - self._upper_bound_values = None - self._n_variables = None - - @property - def n_variables(self): - return self._n_variables - - @property - def quadratic_func(self): - return self._quadratic_func - - @quadratic_func.setter - def quadratic_func(self, quad_matrix): - quad_matrix = _as_matrix(quad_matrix) - n_lines, n_columns = np.shape(quad_matrix) - assert(n_lines == n_columns) - - if self._linear_func is not None: - assert(np.shape(quad_matrix)[0] == self._n_variables) - else: - self._n_variables = n_lines - - self._quadratic_func = quad_matrix - - @property - def linear_func(self): - return self._linear_func - - @linear_func.setter - def linear_func(self, lin_vector): - if lin_vector is not None: - lin_vector = _as_column_matrix(lin_vector) - - if self._quadratic_func is not None: - assert(np.shape(lin_vector)[0] == self._n_variables) - - else: - self._n_variables = np.shape(lin_vector)[0] - - self._linear_func = lin_vector - - def add_inequality_constraints(self, inequality_matrix, inequality_values): - if inequality_matrix is None: - logging.info("Empty inequality constraint: ignoring!") - return - - self._assert_objective_function_is_set() - - if 1 in np.shape(inequality_matrix) or len(np.shape(inequality_matrix)) == 1: - inequality_matrix = _as_line_matrix(inequality_matrix) - else: - inequality_matrix = _as_matrix(inequality_matrix) - - inequality_values = _as_column_matrix(inequality_values) - assert np.shape(inequality_matrix)[1] == self._n_variables - assert np.shape(inequality_values)[1] == 1 - - if self._inequality_constraints_matrix is None: - self._inequality_constraints_matrix = inequality_matrix - else: - self._inequality_constraints_matrix = np.append(self._inequality_constraints_matrix, - inequality_matrix, axis=0) - - if self._inequality_constraints_values is None: - self._inequality_constraints_values = inequality_values - else: - self._inequality_constraints_values = np.append(self._inequality_constraints_values, - inequality_values, axis=0) - - def add_equality_constraints(self, equality_matrix, equality_values): - if equality_matrix is None: - logging.info("Empty equality constraint: ignoring!") - return - - self._assert_objective_function_is_set() - - if 1 in np.shape(equality_matrix) or len(np.shape(equality_matrix)) == 1: - equality_matrix = _as_line_matrix(equality_matrix) - else: - equality_matrix = _as_matrix(equality_matrix) - - equality_values = _as_matrix(equality_values) - assert 
np.shape(equality_matrix)[1] == self._n_variables - assert np.shape(equality_values)[1] == 1 - - if self._equality_constraints_matrix is None: - self._equality_constraints_matrix = equality_matrix - else: - self._equality_constraints_matrix = np.append(self._equality_constraints_matrix, - equality_matrix, axis=0) - - if self._equality_constraints_values is None: - self._equality_constraints_values = equality_values - else: - self._equality_constraints_values = np.append(self._equality_constraints_values, - equality_values, axis=0) - - def add_lower_bound(self, lower_bound): - if lower_bound is not None: - self._assert_objective_function_is_set() - self._lower_bound_values = np.array([lower_bound] * self._n_variables) - - def add_upper_bound(self, upper_bound): - if upper_bound is not None: - self._assert_objective_function_is_set() - self._upper_bound_values = np.array([upper_bound] * self._n_variables) - - def _convert_bounds_to_inequality_constraints(self): - self._assert_objective_function_is_set() - - if self._lower_bound_values is not None: - c_matrix = [] - for i in range(self._n_variables): - c_line = [0] * self._n_variables - c_line[i] = -1.0 - c_matrix.append(c_line) - - c_vector = _as_column_matrix(self._lower_bound_values) - self._lower_bound_values = None - self.add_inequality_constraints(np.matrix(c_matrix).T, c_vector) - - if self._upper_bound_values is not None: - c_matrix = [] - for i in range(self._n_variables): - c_line = [0] * self._n_variables - c_line[i] = 1.0 - c_matrix.append(c_line) - - c_vector = _as_column_matrix(self._upper_bound_values) - self._upper_bound_values = None - self.add_inequality_constraints(np.matrix(c_matrix).T, c_vector) - - def _convert_to_cvxopt_matrices(self): - from cvxopt import matrix as cvxopt_matrix - - if self._quadratic_func is not None: - self._quadratic_func = cvxopt_matrix(self._quadratic_func) - - if self._linear_func is not None: - self._linear_func = cvxopt_matrix(self._linear_func) - else: - # CVXOPT needs this vector to be set even if it is not used, so we put zeros in it! - self._linear_func = cvxopt_matrix(np.zeros((self._n_variables, 1))) - - if self._inequality_constraints_matrix is not None: - self._inequality_constraints_matrix = cvxopt_matrix(self._inequality_constraints_matrix) - - if self._inequality_constraints_values is not None: - self._inequality_constraints_values = cvxopt_matrix(self._inequality_constraints_values) - - if self._equality_constraints_matrix is not None: - self._equality_constraints_matrix = cvxopt_matrix(self._equality_constraints_matrix) - - if self._equality_constraints_values is not None: - self._equality_constraints_values = cvxopt_matrix(self._equality_constraints_values) - - def _assert_objective_function_is_set(self): - assert self._n_variables is not None - - def solve(self, solver="cvxopt", feastol=1e-7, abstol=1e-7, reltol=1e-6, return_all_information=False): - - # Some solvers are very verbose, and we don't want them to pollute STDOUT or STDERR. 
- original_stdout = sys.stdout - original_stderr = sys.stderr - - ret = None - - # TODO: Repair - # if solver == "cvxopt": - # stdout_logger = logging.getLogger('CVXOPT') - # sl = StreamToLogger(stdout_logger, logging.DEBUG) - # sys.stdout = sl - - # stderr_logger = logging.getLogger('CVXOPT') - # sl = StreamToLogger(stderr_logger, logging.WARNING) - # sys.stderr = sl - - try: - if solver == "cvxopt": - from cvxopt.solvers import qp, lp, options - options['feastol'] = feastol - options['abstol'] = abstol - options['reltol'] = reltol - options['show_progress'] = False - - self._convert_bounds_to_inequality_constraints() - self._convert_to_cvxopt_matrices() - - if self._quadratic_func is not None: - ret = qp(self.quadratic_func, self.linear_func, self._inequality_constraints_matrix, - self._inequality_constraints_values, self._equality_constraints_matrix, - self._equality_constraints_values) - - else: - ret = lp(self.linear_func, - G=self._inequality_constraints_matrix, - h=self._inequality_constraints_values, - A=self._equality_constraints_matrix, - b=self._equality_constraints_values) - - #logging.info("Primal objective value = {}".format(ret['primal objective'])) - #logging.info("Dual objective value = {}".format(ret['dual objective'])) - - if not return_all_information: - ret = np.asarray(np.array(ret['x']).T[0]) - - elif solver == "cplex": - import cplex - p = cplex.Cplex() - p.objective.set_sense(p.objective.sense.minimize) - - # This is ugly. CPLEX wants a list of lists of lists. First dimension represents the lines of the QP - # matrix. Second dimension contains a pair of two elements: the indices of the variables in play (all of - # them...), and the values (columns of the QP matrix). - names = [str(x) for x in range(self._n_variables)] - p.variables.add(names=names) - - if self.quadratic_func is not None: - p_matrix = [] - for line in self._quadratic_func: - p_matrix.append([names, line.tolist()[0]]) - - p.objective.set_quadratic(p_matrix) - - if self.linear_func is not None: - p.objective.set_linear(zip(names, - np.asarray(self.linear_func.T).reshape(self.n_variables,).tolist())) - - if self._inequality_constraints_matrix is not None: - inequality_linear = [] - for line in self._inequality_constraints_matrix: - inequality_linear.append([names, line.tolist()[0]]) - p.linear_constraints.add(lin_expr=inequality_linear, - rhs=np.asarray(self._inequality_constraints_values.T).tolist()[0], - senses="L"*len(self._inequality_constraints_values)) - - if self._equality_constraints_matrix is not None: - equality_linear = [] - for line in self._equality_constraints_matrix: - equality_linear.append([names, line.tolist()[0]]) - p.linear_constraints.add(lin_expr=equality_linear, - rhs=np.asarray(self._equality_constraints_values.T).tolist()[0], - senses="E"*len(self._equality_constraints_values)) - - if self._lower_bound_values is not None: - p.variables.set_lower_bounds(zip(names, self._lower_bound_values)) - - if self._upper_bound_values is not None: - p.variables.set_upper_bounds(zip(names, self._upper_bound_values)) - - p.solve() - - logging.info("Solution status = {} : {}".format(p.solution.get_status(), - p.solution.status[p.solution.get_status()])) - logging.info("Solution value = {}".format(p.solution.get_objective_value())) - - if not return_all_information: - ret = np.array(p.solution.get_values()) - else: - ret = {'primal': np.array(p.solution.get_values()), - 'dual': np.array(p.solution.get_dual_values())} - - elif solver == "pycpx": - # This shows how easy it is to use pycpx. 
However, it is much slower (as it is more versatile!). - - import pycpx - model = pycpx.CPlexModel(verbosity=2) - q = model.new(self.n_variables) - - if self._inequality_constraints_matrix is not None: - model.constrain(self._inequality_constraints_matrix * q <= self._inequality_constraints_values) - if self._equality_constraints_matrix is not None: - model.constrain(self._equality_constraints_matrix * q == self._equality_constraints_values) - if self._lower_bound_values is not None: - model.constrain(q >= self._lower_bound_values) - if self._upper_bound_values is not None: - model.constrain(q <= self._upper_bound_values) - - value = model.minimize(0.5 * q.T * self._quadratic_func * q + self.linear_func.T * q) - - logging.info("Solution value = {}".format(value)) - - if not return_all_information: - ret = np.array(model[q]) - else: - ret = model - - except: - raise - - finally: - sys.stdout = original_stdout - sys.stderr = original_stderr - - return ret - - - - - - -class DecisionStumpClassifier(BaseEstimator, ClassifierMixin): - """Generic Attribute Threshold Binary Classifier - - Attributes - ---------- - attribute_index : int - The attribute to consider for the classification. - threshold : float - The threshold value for classification rule. - direction : int, optional - A multiplicative constant (1 or -1) to choose the "direction" of the stump. Defaults to 1. If -1, the stump - will predict the "negative" class (generally -1 or 0), and if 1, the stump will predict the second class (generally 1). - - """ - def __init__(self, attribute_index, threshold, direction=1): - super(DecisionStumpClassifier, self).__init__() - self.attribute_index = attribute_index - self.threshold = threshold - self.direction = direction - - def fit(self, X, y): - # Only verify that we are in the binary classification setting, with support for transductive learning. - if isinstance(y, np.ma.MaskedArray): - self.classes_ = np.unique(y[np.logical_not(y.mask)]) - else: - self.classes_ = np.unique(y) - - # This label encoder is there for the predict function to be able to return any two classes that were used - # when fitting, for example {-1, 1} or {0, 1}. - self.le_ = LabelEncoder() - self.le_.fit(self.classes_) - self.classes_ = self.le_.classes_ - - assert len(self.classes_) == 2, "DecisionStumpsVoter only supports binary classification" - return self - - def predict(self, X): - """Returns the output of the classifier, on a sample X. - - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - Training vectors, where n_samples is the number of samples and - n_features is the number of features. - - Returns - ------- - predictions : array-like, shape = [n_samples] - Predicted class labels. - - """ - check_is_fitted(self, 'classes_') - return self.le_.inverse_transform(np.argmax(self.predict_proba(X), axis=1)) - - def predict_proba(self, X): - """Compute probabilities of possible outcomes for samples in X. - - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - Training vectors, where n_samples is the number of samples and - n_features is the number of features. - - Returns - ------- - avg : array-like, shape = [n_samples, n_classes] - Weighted average probability for each class per sample. 
- - """ - check_is_fitted(self, 'classes_') - X = np.asarray(X) - probas = np.zeros((X.shape[0], 2)) - positive_class = np.argwhere(X[:, self.attribute_index] > self.threshold) - negative_class = np.setdiff1d(range(X.shape[0]), positive_class) - probas[positive_class, 1] = 1.0 - probas[negative_class, 0] = 1.0 - - if self.direction == -1: - probas = 1 - probas - - return probas - - def reverse_decision(self): - self.direction *= -1 - - -class ClassifiersGenerator(BaseEstimator, TransformerMixin): - """Base class to create a set of voters using training samples, and then transform a set of examples in - the voters' output space. - - Attributes - ---------- - self_complemented : bool, optional - Whether or not a binary complement voter must be generated for each voter. Defaults to False. - voters : ndarray of voter functions - Once fit, contains the voter functions. - - """ - def __init__(self, self_complemented=False): - super(ClassifiersGenerator, self).__init__() - self.self_complemented = self_complemented - - def fit(self, X, y=None): - """Generates the voters using training samples. - - Parameters - ---------- - X : ndarray of shape (n_samples, n_features) - Input data on which to base the voters. - y : ndarray of shape (n_labeled_samples,), optional - Input labels, usually determines the decision polarity of each voter. - - Returns - ------- - self - - """ - raise NotImplementedError - - def transform(self, X): - """Transforms the input points in a matrix of classification, using previously learned voters. - - Parameters - ---------- - X : ndarray of shape (n_samples, n_features) - Input data to classify. - - Returns - ------- - ndarray of shape (n_samples, n_voters) - The voters' decision on each example. - - """ - check_is_fitted(self, 'estimators_') - return np.array([voter.predict(X) for voter in self.estimators_]).T - -class StumpsClassifiersGenerator(ClassifiersGenerator): - """Decision Stump Voters transformer. - - Parameters - ---------- - n_stumps_per_attribute : int, optional - Determines how many decision stumps will be created for each attribute. Defaults to 10. - No stumps will be created for attributes with only one possible value. - self_complemented : bool, optional - Whether or not a binary complement voter must be generated for each voter. Defaults to False. - - """ - def __init__(self, n_stumps_per_attribute=10, self_complemented=False): - super(StumpsClassifiersGenerator, self).__init__(self_complemented) - self.n_stumps_per_attribute = n_stumps_per_attribute - - def fit(self, X, y): - """Fits Decision Stump voters on a training set. - - Parameters - ---------- - X : ndarray of shape (n_samples, n_features) - Input data on which to base the voters. - y : ndarray of shape (n_labeled_samples,), optional - Only used to ensure that we are in the binary classification setting. 
- - Returns - ------- - self - - """ - minimums = np.min(X, axis=0) - maximums = np.max(X, axis=0) - ranges = (maximums - minimums) / (self.n_stumps_per_attribute + 1) - - self.estimators_ = [DecisionStumpClassifier(i, minimums[i] + ranges[i] * stump_number, 1).fit(X, y) - for i in range(X.shape[1]) for stump_number in range(1, self.n_stumps_per_attribute + 1) - if ranges[i] != 0] - - if self.self_complemented: - self.estimators_ += [DecisionStumpClassifier(i, minimums[i] + ranges[i] * stump_number, -1).fit(X, y) - for i in range(X.shape[1]) for stump_number in range(1, self.n_stumps_per_attribute + 1) - if ranges[i] != 0] - - self.estimators_ = np.asarray(self.estimators_) - return self - -def sign(array): - """Computes the elementwise sign of all elements of an array. The sign function returns -1 if x <=0 and 1 if x > 0. - Note that numpy's sign function can return 0, which is not desirable in most cases in Machine Learning algorithms. - - Parameters - ---------- - array : array-like - Input values. - - Returns - ------- - ndarray - An array with the signs of input elements. - - """ - signs = np.sign(array) - - signs[array == 0] = -1 - return signs - - -def zero_one_loss(y_target, y_estimate, confidences=1): - if len(y_target) == 0: - return 0.0 - return np.mean(y_target != y_estimate) - - -def zero_one_loss_per_example(y_target, y_estimate, confidences=1): - if len(y_target) == 0: - return 0.0 - return (y_target != y_estimate).astype(np.int) - - -class ResultsDataFrame(pd.DataFrame): - """A ResultsDataFrame is a DataFrame with the following information: - - - A 'dataset' column that contains the dataset name - - Hyperparamer columns, named 'hp__HPNAME', where HPNAME is the name of the hyperparameter - - Columns containing informations about that depend on the dataset and hyperparameters, for example the risk. - - """ - @property - def datasets_list(self): - """Returns the sorted list of datasets. - - """ - return sorted(set(self['dataset'])) - - @property - def hyperparameters_list(self): - """Returns a sorted list of hyperparameter names, without the 'hp__' prefix. - - """ - return sorted(column.split('hp__')[1] for column in self.columns if column.startswith('hp__')) - - @property - def hyperparameters_list_with_prefix(self): - return sorted(column for column in self.columns if column.startswith('hp__')) - - @property - def metrics_list(self): - return sorted(column for column in self.columns if not column.startswith('hp__') and column != 'dataset') - - @property - def hyperparameters_with_values(self): - """Returns a dictionary that contains the hyperparameter names (without the 'hp__' prefix), and - associated values that are present in the DataFrame. - - """ - hyperparameters = [column for column in self.columns if column.startswith('hp__')] - - hyperparameters_dict = {} - tmp_dict = self[hyperparameters].to_dict() - - for key, value in iteritems(tmp_dict): - hyperparameters_dict[key.split('hp__')[1]] = list(value.values())[0] if len(value) == 1 else sorted(set(value.values())) - - return hyperparameters_dict - - @property - def hyperparameters_with_values_per_dataset(self): - """Returns a dictionary of dictionaries that contains for each dataset, the hyperparameter names (without the - 'hp__' prefix), and associated values that are present in the DataFrame. 
- - """ - hyperparameters = [column for column in self.columns if column.startswith('hp__')] - - hyperparameters_dict = {} - for dataset in self.datasets_list: - tmp_dict = self[self.dataset == dataset][hyperparameters].to_dict() - hyperparameters_dict[dataset] = {} - - for key, value in iteritems(tmp_dict): - hyperparameters_dict[dataset][key.split('hp__')[1]] = list(value.values())[0] if len(value) == 1 else sorted(value.values()) - - return hyperparameters_dict - - def results_optimizing_metric(self, metric_to_optimize='cv_mean__valid__zero_one_loss', minimize=True, tie_breaking_functions_ordered_dict=None): - function = min if minimize else max - - # We extract all the rows that have the best value for the metric to optimize. - optimal_results = self[self.groupby('dataset', sort=False)[metric_to_optimize].transform(function) == self[metric_to_optimize]] - - # We tie the breaks by applying the tie breaking functions (in the order of the dictionary). If hyperparameters are missing, we simply - # use the median for each hyperparameter, in a fixed (reproduceable) order. - if tie_breaking_functions_ordered_dict is None: - tie_breaking_functions_ordered_dict = OrderedDict() - else: - # Avoid side effects and ensures that the dictionary is an OrderedDict before we add missing hyperparameters. - tie_breaking_functions_ordered_dict = OrderedDict(tie_breaking_functions_ordered_dict.copy()) - - for hyperparameter in sorted(self.hyperparameters_list): - if hyperparameter not in tie_breaking_functions_ordered_dict.keys(): - tie_breaking_functions_ordered_dict[hyperparameter] = np.median - - for hyperparameter, tie_breaking_function in iteritems(tie_breaking_functions_ordered_dict): - optimal_results = optimal_results[optimal_results.groupby('dataset')['hp__' + hyperparameter].transform(partial(get_optimal_value_in_list, tie_breaking_function)) == optimal_results['hp__' + hyperparameter]] - - return ResultsDataFrame(optimal_results) - - def get_dataframe_with_metrics_as_one_column(self, metrics_to_keep=None): - new_dataframe = ResultsDataFrame() - - if metrics_to_keep is None: - metrics_to_keep = self.metrics_list - - for metric in metrics_to_keep: - columns = self.hyperparameters_list_with_prefix + [metric] - if 'dataset' in self: - columns.append('dataset') - - tmp = self.loc[:, columns] - tmp.columns = [c if c != metric else 'value' for c in tmp.columns] - tmp.loc[:, 'metric'] = metric - new_dataframe = new_dataframe.append(tmp, ignore_index=True) - - return new_dataframe - - -def get_optimal_value_in_list(optimum_function, values_list): - """Given a list of values and an optimal value, returns the value from the list that is the closest to the optimum, - given by optimum_function applied to the same list. 
+ return getInterpretBase(classifier, directory, "CQBoostv21", classifier.final_vote_weights, classifier.break_cause) - >>> get_optimal_value_in_list(np.median, [2, 4, 5, 6]) - 4 - """ - values_list = sorted(list(values_list)) - return values_list[np.argmin(np.array([scipy.spatial.distance.euclidean(value, optimum_function(values_list)) for value in values_list]))] diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/QarBoost.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/QarBoost.py new file mode 100644 index 0000000000000000000000000000000000000000..ea91110525e97cbb2509928bfa1a23bac5ae9716 --- /dev/null +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/QarBoost.py @@ -0,0 +1,403 @@ +import scipy +import logging +import numpy as np +import numpy.ma as ma +from collections import defaultdict +import math +from sklearn.utils.validation import check_is_fitted +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.pipeline import Pipeline +from sklearn.metrics import accuracy_score +import time +import datetime + +from ..Monoview.MonoviewUtils import CustomUniform, CustomRandint +from ..Monoview.BoostUtils import StumpsClassifiersGenerator, sign, BaseBoost, getInterpretBase + + +class ColumnGenerationClassifierQar(BaseEstimator, ClassifierMixin, BaseBoost): + def __init__(self, epsilon=1e-06, n_max_iterations=None, estimators_generator=None, dual_constraint_rhs=0, save_iteration_as_hyperparameter_each=None, random_state=42): + super(ColumnGenerationClassifierQar, self).__init__() + self.epsilon = epsilon + self.n_max_iterations = n_max_iterations + self.estimators_generator = estimators_generator + self.dual_constraint_rhs = dual_constraint_rhs + self.save_iteration_as_hyperparameter_each = save_iteration_as_hyperparameter_each + self.random_state = random_state + + def fit(self, X, y): + if scipy.sparse.issparse(X): + logging.info('Converting to dense matrix.') + X = np.array(X.todense()) + + if self.estimators_generator is None: + self.estimators_generator = StumpsClassifiersGenerator(n_stumps_per_attribute=self.n_stumps, self_complemented=True) + + y[y == 0] = -1 + + self.estimators_generator.fit(X, y) + self.classification_matrix = self._binary_classification_matrix(X) + + + self.weights_ = [] + self.infos_per_iteration_ = defaultdict(list) + + m, n = self.classification_matrix.shape + y_kernel_matrix = np.multiply(y.reshape((len(y), 1)), self.classification_matrix) + + # Initialization + + self.collected_weight_vectors_ = {} + self.collected_dual_constraint_violations_ = {} + + self.example_weights = self._initialize_alphas(m).reshape((m,1)) + + self.chosen_columns_ = [] + self.fobidden_columns = [] + self.edge_scores = [] + self.epsilons = [] + self.example_weights_ = [self.example_weights] + self.train_accuracies = [] + self.previous_votes = [] + + self.n_total_hypotheses_ = n + self.n_total_examples = m + + for k in range(min(n, self.n_max_iterations if self.n_max_iterations is not None else np.inf)): + # To choose the first voter, we select the one that has the best margin. 
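+            # (Concretely, _find_best_margin below takes the argmax over voters h of
+            # sum_i y_i * h(x_i), i.e. the columns of y_kernel_matrix summed over examples.)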
+            if k == 0:
+                first_voter_index = self._find_best_margin(y_kernel_matrix)
+                self.chosen_columns_.append(first_voter_index)
+
+                self.previous_vote = self.classification_matrix[:, first_voter_index].reshape((m, 1))
+                self.weighted_sum = self.classification_matrix[:, first_voter_index].reshape((m, 1))
+
+                epsilon = self._compute_epsilon()
+                self.epsilons.append(epsilon)
+                self.q = math.log((1 - epsilon) / epsilon)
+                self.weights_.append(self.q)
+
+                self._update_example_weights(y)
+                self.example_weights_.append(self.example_weights)
+                self.train_accuracies.append(accuracy_score(y, np.sign(self.previous_vote)))
+                continue
+
+            # Find the best weak hypothesis given example_weights: the one whose
+            # combination with the previous vote reaches the lowest minimum C-bound.
+            sol, new_voter_index = self._find_new_voter(y_kernel_matrix, y)
+            if isinstance(sol, str):
+                self.break_cause = " no more hypotheses were able to improve the boosted vote."
+                break
+
+            # Append the weak hypothesis.
+            self.chosen_columns_.append(new_voter_index)
+            self.weighted_sum = np.matmul(np.concatenate((self.previous_vote, self.classification_matrix[:, new_voter_index].reshape((m, 1))), axis=1),
+                                          sol).reshape((m, 1))
+
+            # Generate the weight of the new voter.
+            epsilon = self._compute_epsilon()
+            self.epsilons.append(epsilon)
+            if epsilon == 0. or math.log((1 - epsilon) / epsilon) == math.inf:
+                self.chosen_columns_.pop()
+                self.break_cause = " epsilon was too small."
+                break
+            self.q = math.log((1 - epsilon) / epsilon)
+            self.weights_.append(self.q)
+
+            # Update the distribution on the examples.
+            self._update_example_weights(y)
+            self.example_weights_.append(self.example_weights)
+
+            # Update the "previous vote" to prepare the next iteration.
+            self.previous_vote = np.matmul(self.classification_matrix[:, self.chosen_columns_],
+                                           np.array(self.weights_).reshape((k + 1, 1))).reshape((m, 1))
+            self.previous_votes.append(self.previous_vote)
+            self.train_accuracies.append(accuracy_score(y, np.sign(self.previous_vote)))
+
+        self.nb_opposed_voters = self.check_opposed_voters()
+        self.estimators_generator.estimators_ = self.estimators_generator.estimators_[self.chosen_columns_]
+        self.weights_ = np.array(self.weights_)
+
+        self.weights_ /= np.sum(self.weights_)
+        y[y == -1] = 0
+
+        return self
+
+    def predict(self, X):
+        start = time.time()
+        check_is_fitted(self, 'weights_')
+        if scipy.sparse.issparse(X):
+            logging.warning('Converting sparse matrix to dense matrix.')
+            X = np.array(X.todense())
+        classification_matrix = self._binary_classification_matrix(X)
+        margins = np.squeeze(np.asarray(np.matmul(classification_matrix, self.weights_)))
+        signs_array = np.array([int(x) for x in sign(margins)])
+        signs_array[signs_array == -1] = 0
+        end = time.time()
+        self.predict_time = end - start
+        return signs_array
+
+    def _compute_epsilon(self):
+        """Update the epsilon variable (weighted error of the current weighted vote)."""
+        ones_matrix = np.zeros(self.weighted_sum.shape)
+        ones_matrix[self.weighted_sum < 0] = 1
+        epsilon = (1.0 / self.n_total_examples) * np.sum(self.example_weights * ones_matrix, axis=0)
+        return epsilon
+
+    def _find_best_margin(self, y_kernel_matrix):
+        """Used only on the first iteration, to select the voter with the largest margin."""
+        pseudo_h_values = ma.array(np.sum(y_kernel_matrix, axis=0), fill_value=-np.inf)
+        pseudo_h_values[self.fobidden_columns] = ma.masked
+        best_h_index = ma.argmax(pseudo_h_values)
+        return best_h_index
+
+    def _find_new_voter(self, y_kernel_matrix, y):
+        """Solve the two-voter min C-bound problem for each potential new voter,
+        and select the one that reaches the smallest minimum."""
+        c_borns = []
+        possible_sols = []
+        indices = []
+        for hypothese_index, hypothese in enumerate(y_kernel_matrix.transpose()):
+            causes = []
+            if hypothese_index not in self.chosen_columns_:
+                w = self._solve_two_weights_min_c(hypothese, y)
+                if w[0] != "break":
+                    c_borns.append(self._cbound(w[0]))
+                    possible_sols.append(w)
+                    indices.append(hypothese_index)
+                else:
+                    causes.append(w[1])
+        if c_borns:
+            min_c_born_index = ma.argmin(c_borns)
+            selected_sol = possible_sols[min_c_born_index]
+            selected_voter_index = indices[min_c_born_index]
+            return selected_sol, selected_voter_index
+        else:
+            return "break", "no remaining voter was usable."
+
+    def _update_example_weights(self, y):
+        new_weights = self.example_weights * np.exp(-self.q * y.reshape((self.n_total_examples, 1)) * self.weighted_sum)
+        self.example_weights = new_weights / np.sum(new_weights)
+
+    def _solve_two_weights_min_c(self, next_column, y):
+        """Solve the min C-bound problem for two voters and return the best 2-weights array."""
+        m = next_column.shape[0]
+        zero_diag = np.ones((m, m)) - np.identity(m)
+
+        weighted_previous_sum = np.multiply(np.multiply(y.reshape((m, 1)), self.previous_vote.reshape((m, 1))), self.example_weights.reshape((m, 1)))
+        weighted_next_column = np.multiply(next_column.reshape((m, 1)), self.example_weights.reshape((m, 1)))
+
+        mat_prev = np.repeat(weighted_previous_sum, m, axis=1) * zero_diag
+        mat_next = np.repeat(weighted_next_column, m, axis=1) * zero_diag
+
+        self.B2 = np.sum((weighted_previous_sum - weighted_next_column) ** 2)
+        self.B1 = np.sum(2 * weighted_next_column * (weighted_previous_sum - 2 * weighted_next_column * weighted_next_column))
+        self.B0 = np.sum(weighted_next_column * weighted_next_column)
+
+        self.A2 = self.B2 + np.sum((mat_prev - mat_next) * np.transpose(mat_prev - mat_next))
+        self.A1 = self.B1 + np.sum(mat_prev * np.transpose(mat_next) - mat_next * np.transpose(mat_prev) - 2 * mat_next * np.transpose(mat_next))
+        self.A0 = self.B0 + np.sum(mat_next * np.transpose(mat_next))
+        C2 = (self.A1 * self.B2 - self.A2 * self.B1)
+        C1 = 2 * (self.A0 * self.B2 - self.A2 * self.B0)
+        C0 = self.A0 * self.B1 - self.A1 * self.B0
+
+        if C2 == 0:
+            if C1 == 0:
+                return np.array([0.5, 0.5])
+            elif abs(C1) > 0:
+                return np.array([0., 1.])
+            else:
+                return ["break", "the derivative was constant."]
+        try:
+            sols = np.roots(np.array([C2, C1, C0]))
+        except Exception:
+            return ["break", "nan"]
+
+        is_acceptable, sol = self._analyze_solutions(sols)
+        if is_acceptable:
+            return np.array([sol, 1 - sol])
+        else:
+            return ["break", sol]
+
+    def _analyze_solutions(self, sols):
+        """Check that the solution found by np.roots is acceptable under our constraints
+        (real, a minimum, and between 0 and 1)."""
+        for sol_index, sol in enumerate(sols):
+            if isinstance(sol, complex):
+                sols[sol_index] = -1
+        if sols.shape[0] == 1:
+            if self._cbound(sols[0]) < self._cbound(sols[0] + 1):
+                best_sol = sols[0]
+            else:
+                return False, " the only solution was a maximum."
+        elif sols.shape[0] == 2:
+            best_sol = self._best_sol(sols)
+        else:
+            return False, " no solution was found."
+
+        if 0 < best_sol < 1:
+            return True, self._best_sol(sols)
+        elif best_sol <= 0:
+            return False, " the minimum was below 0."
+        else:
+            return False, " the minimum was over 1."
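+
+    # Illustrative note on the root-finding step above (a sketch, not executed
+    # anywhere): writing N(a) = A2*a^2 + A1*a + A0 and D(a) = B2*a^2 + B1*a + B0,
+    # the C-bound is 1 - N(a)/D(a), and its stationary points satisfy
+    # N'(a)*D(a) - N(a)*D'(a) = 0, whose zero set is exactly that of
+    #     (A1*B2 - A2*B1)*a^2 + 2*(A0*B2 - A2*B0)*a + (A0*B1 - A1*B0) = 0,
+    # i.e. the polynomial [C2, C1, C0] handed to np.roots in
+    # _solve_two_weights_min_c. _analyze_solutions then keeps a real root in
+    # (0, 1) reaching the lowest C-bound, and the pair (a, 1 - a) weights the
+    # previous vote and the candidate voter respectively.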
+
+    def _cbound(self, sol):
+        """Compute the objective function (the C-bound of the two-voter combination)."""
+        return 1 - (self.A2 * sol ** 2 + self.A1 * sol + self.A0) / (self.B2 * sol ** 2 + self.B1 * sol + self.B0)
+
+    def _best_sol(self, sols):
+        values = np.array([self._cbound(sol) for sol in sols])
+        return sols[np.argmin(values)]
+
+
+class QarBoostClassifier(ColumnGenerationClassifierQar):
+    def __init__(self, mu=0.001, epsilon=1e-08, n_max_iterations=None, estimators_generator=None, save_iteration_as_hyperparameter_each=None, random_state=42):
+        super(QarBoostClassifier, self).__init__(epsilon, n_max_iterations, estimators_generator, dual_constraint_rhs=0,
+                                                 save_iteration_as_hyperparameter_each=save_iteration_as_hyperparameter_each, random_state=random_state)
+        self.mu = mu
+        self.train_time = 0
+
+    def _initialize_alphas(self, n_examples):
+        return 1.0 / n_examples * np.ones((n_examples,))
+
+
+class QarBoost(QarBoostClassifier):
+
+    def __init__(self, random_state, **kwargs):
+        super(QarBoost, self).__init__(
+            mu=kwargs['mu'],
+            epsilon=kwargs['epsilon'],
+            n_max_iterations=kwargs['n_max_iterations'],
+            random_state=random_state)
+
+    def canProbas(self):
+        """Used to know if the classifier can return label probabilities"""
+        return False
+
+    def paramsToSrt(self, nIter=1):
+        """Used for weighted linear early fusion to generate random search sets"""
+        paramsSet = []
+        for _ in range(nIter):
+            paramsSet.append({"mu": 0.001,
+                              "epsilon": 1e-08,
+                              "n_max_iterations": None})
+        return paramsSet
+
+    def getKWARGS(self, args):
+        """Used to format kwargs for the parsed args"""
+        kwargsDict = {}
+        kwargsDict['mu'] = 0.001
+        kwargsDict['epsilon'] = 1e-08
+        kwargsDict['n_max_iterations'] = None
+        return kwargsDict
+
+    def genPipeline(self):
+        return Pipeline([('classifier', QarBoostClassifier())])
+
+    def genParamsDict(self, randomState):
+        return {"classifier__mu": [0.001],
+                "classifier__epsilon": [1e-08],
+                "classifier__n_max_iterations": [None]}
+
+    def genBestParams(self, detector):
+        return {"mu": detector.best_params_["classifier__mu"],
+                "epsilon": detector.best_params_["classifier__epsilon"],
+                "n_max_iterations": detector.best_params_["classifier__n_max_iterations"]}
+
+    def genParamsFromDetector(self, detector):
+        nIter = len(detector.cv_results_['param_classifier__mu'])
+        return [("mu", np.array([0.001 for _ in range(nIter)])),
+                ("epsilon", np.array(detector.cv_results_['param_classifier__epsilon'])),
+                ("n_max_iterations", np.array(detector.cv_results_['param_classifier__n_max_iterations']))]
+
+    def getConfig(self, config):
+        if type(config) is not dict:  # Used in late fusion when config is a classifier
+            return "\n\t\t- QarBoost with mu : " + str(config.mu) + ", epsilon : " + str(
+                config.epsilon) + ", n_max_iterations : " + str(config.n_max_iterations)
+        else:
+            return "\n\t\t- QarBoost with mu : " + str(config["mu"]) + ", epsilon : " + str(
+                config["epsilon"]) + ", n_max_iterations : " + str(config["n_max_iterations"])
+
+    def getInterpret(self, classifier, directory):
+        interpretString = ""
+        return interpretString
+
+
+def canProbas():
+    return False
+
+
+def fit(DATASET, CLASS_LABELS, randomState, NB_CORES=1, **kwargs):
+    """Used to fit the monoview classifier with the args stored in kwargs"""
+    start = time.time()
+    classifier = QarBoostClassifier(mu=kwargs['mu'],
+                                    epsilon=kwargs['epsilon'],
+                                    n_max_iterations=kwargs["n_max_iterations"],
+                                    random_state=randomState)
+    classifier.fit(DATASET, CLASS_LABELS)
+    end = time.time()
+    classifier.train_time = end - start
+    return classifier
+
+
+def paramsToSet(nIter, 
randomState): + """Used for weighted linear early fusion to generate random search sets""" + paramsSet = [] + for _ in range(nIter): + paramsSet.append({"mu": randomState.uniform(1e-02, 10**(-0.5)), + "epsilon": 10**-randomState.randint(1, 15), + "n_max_iterations": None}) + return paramsSet + + +def getKWARGS(args): + """Used to format kwargs for the parsed args""" + kwargsDict = {} + kwargsDict['mu'] = args.QarB_mu + kwargsDict['epsilon'] = args.QarB_epsilon + kwargsDict['n_max_iterations'] = None + return kwargsDict + + +def genPipeline(): + return Pipeline([('classifier', QarBoostClassifier())]) + + +def genParamsDict(randomState): + return {"classifier__mu": CustomUniform(loc=.5, state=2, multiplier='e-'), + "classifier__epsilon": CustomRandint(low=1, high=15, multiplier='e-'), + "classifier__n_max_iterations": [None], + "classifier__random_state":[randomState]} + + +def genBestParams(detector): + return {"mu": detector.best_params_["classifier__mu"], + "epsilon": detector.best_params_["classifier__epsilon"], + "n_max_iterations": detector.best_params_["classifier__n_max_iterations"]} + + +def genParamsFromDetector(detector): + nIter = len(detector.cv_results_['param_classifier__mu']) + return [("mu", np.array(detector.cv_results_['param_classifier__mu'])), + ("epsilon", np.array(detector.cv_results_['param_classifier__epsilon'])), + ("n_max_iterations", np.array(detector.cv_results_['param_classifier__n_max_iterations']))] + + +def getConfig(config): + if type(config) is not dict: # Used in late fusion when config is a classifier + return "\n\t\t- QarBoost with mu : " + str(config.mu) + ", epsilon : " + str( + config.epsilon) + ", n_max_iterations : " + str(config.n_max_iterations) + else: + return "\n\t\t- QarBoost with mu : " + str(config["mu"]) + ", epsilon : " + str( + config["epsilon"]) + ", n_max_iterations : " + str(config["n_max_iterations"]) + + +def getInterpret(classifier, directory): + return getInterpretBase(classifier, directory, "QarBoost", classifier.weights_, classifier.break_cause) + diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/QarBoostNC.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/QarBoostNC.py new file mode 100644 index 0000000000000000000000000000000000000000..e3600ff85aa875b44b114215b758598aca55fff2 --- /dev/null +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/QarBoostNC.py @@ -0,0 +1,403 @@ +import scipy +import logging +import numpy as np +import numpy.ma as ma +from collections import defaultdict +import math +from sklearn.utils.validation import check_is_fitted +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.pipeline import Pipeline +from sklearn.metrics import accuracy_score +import time +import datetime + +from ..Monoview.MonoviewUtils import CustomUniform, CustomRandint +from ..Monoview.BoostUtils import StumpsClassifiersGenerator, sign, BaseBoost, getInterpretBase + + +class ColumnGenerationClassifierQarNC(BaseEstimator, ClassifierMixin, BaseBoost): + def __init__(self, epsilon=1e-06, n_max_iterations=None, estimators_generator=None, dual_constraint_rhs=0, save_iteration_as_hyperparameter_each=None, random_state=42): + super(ColumnGenerationClassifierQarNC, self).__init__() + self.epsilon = epsilon + self.n_max_iterations = n_max_iterations + self.estimators_generator = estimators_generator + self.dual_constraint_rhs = dual_constraint_rhs + self.save_iteration_as_hyperparameter_each = save_iteration_as_hyperparameter_each + self.random_state = 
random_state + + def fit(self, X, y): + if scipy.sparse.issparse(X): + logging.info('Converting to dense matrix.') + X = np.array(X.todense()) + + if self.estimators_generator is None: + self.estimators_generator = StumpsClassifiersGenerator(n_stumps_per_attribute=self.n_stumps, self_complemented=False) + + y[y == 0] = -1 + + self.estimators_generator.fit(X, y) + self.classification_matrix = self._binary_classification_matrix(X) + + + self.weights_ = [] + self.infos_per_iteration_ = defaultdict(list) + + m, n = self.classification_matrix.shape + y_kernel_matrix = np.multiply(y.reshape((len(y), 1)), self.classification_matrix) + + # Initialization + + self.collected_weight_vectors_ = {} + self.collected_dual_constraint_violations_ = {} + + self.example_weights = self._initialize_alphas(m).reshape((m,1)) + + self.chosen_columns_ = [] + self.fobidden_columns = [] + self.edge_scores = [] + self.epsilons = [] + self.example_weights_ = [self.example_weights] + self.train_accuracies = [] + self.previous_votes = [] + + self.n_total_hypotheses_ = n + self.n_total_examples = m + + for k in range(min(n, self.n_max_iterations if self.n_max_iterations is not None else np.inf)): + # To choose the first voter, we select the one that has the best margin. + if k == 0: + first_voter_index = self._find_best_margin(y_kernel_matrix) + self.chosen_columns_.append(first_voter_index) + + self.previous_vote = self.classification_matrix[:, first_voter_index].reshape((m,1)) + self.weighted_sum = self.classification_matrix[:, first_voter_index].reshape((m,1)) + + epsilon = self._compute_epsilon() + self.epsilons.append(epsilon) + self.q = math.log((1-epsilon)/epsilon) + self.weights_.append(self.q) + + self._update_example_weights(y) + self.example_weights_.append(self.example_weights) + self.train_accuracies.append(accuracy_score(y, np.sign(self.previous_vote))) + continue + + # Find best weak hypothesis given example_weights. Select the one that has the lowest minimum + # C-bound with the previous vote + sol, new_voter_index = self._find_new_voter(y_kernel_matrix, y) + if type(sol) == str: + self.break_cause = " no more hypothesis were able to improve the boosted vote." + break + + # Append the weak hypothesis. + self.chosen_columns_.append(new_voter_index) + self.weighted_sum = np.matmul(np.concatenate((self.previous_vote, self.classification_matrix[:, new_voter_index].reshape((m,1))), axis=1), + sol).reshape((m,1)) + + # Generate the new weight for the new voter + epsilon = self._compute_epsilon() + self.epsilons.append(epsilon) + if epsilon == 0. or math.log((1 - epsilon) / epsilon) == math.inf: + self.chosen_columns_.pop() + self.break_cause = " epsilon was too small." + break + self.q = math.log((1 - epsilon) / epsilon) + self.weights_.append(self.q) + + # Update the distribution on the examples. 
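+            # (As implemented in _update_example_weights below: each weight follows
+            # w_i <- w_i * exp(-q * y_i * F(x_i)) and is then renormalized, so the
+            # examples on which the weighted sum F disagrees with y_i gain weight.)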
+ self._update_example_weights(y) + self.example_weights_.append(self.example_weights) + + # Update the "previous vote" to prepare for the next iteration + self.previous_vote = np.matmul(self.classification_matrix[:, self.chosen_columns_], + np.array(self.weights_).reshape((k + 1, 1))).reshape((m, 1)) + self.previous_votes.append(self.previous_vote) + self.train_accuracies.append(accuracy_score(y, np.sign(self.previous_vote))) + + self.nb_opposed_voters = self.check_opposed_voters() + self.estimators_generator.estimators_ = self.estimators_generator.estimators_[self.chosen_columns_] + self.weights_ = np.array(self.weights_) + + self.weights_/=np.sum(self.weights_) + y[y == -1] = 0 + + return self + + def predict(self, X): + start = time.time() + check_is_fitted(self, 'weights_') + if scipy.sparse.issparse(X): + logging.warning('Converting sparse matrix to dense matrix.') + X = np.array(X.todense()) + classification_matrix = self._binary_classification_matrix(X) + margins = np.squeeze(np.asarray(np.matmul(classification_matrix, self.weights_))) + signs_array = np.array([int(x) for x in sign(margins)]) + signs_array[signs_array == -1] = 0 + end = time.time() + self.predict_time = end - start + return signs_array + + def _compute_epsilon(self,): + """Updating the \epsilon varaible""" + ones_matrix = np.zeros(self.weighted_sum.shape) + ones_matrix[self.weighted_sum < 0] = 1 + epsilon = (1.0/self.n_total_examples)*np.sum(self.example_weights*ones_matrix, axis=0) + return epsilon + + def _find_best_margin(self, y_kernel_matrix): + """Used only on the first iteration to select the voter with the largest margin""" + pseudo_h_values = ma.array(np.sum(y_kernel_matrix, axis=0), fill_value=-np.inf) + pseudo_h_values[self.fobidden_columns] = ma.masked + worst_h_index = ma.argmax(pseudo_h_values) + return worst_h_index + + def _find_new_voter(self, y_kernel_matrix, y): + """Here, we solve the two_voters_mincq_problem for each potential new voter, + and select the one that has the smallest minimum""" + c_borns = [] + possible_sols = [] + indices = [] + for hypothese_index, hypothese in enumerate(y_kernel_matrix.transpose()): + causes = [] + if hypothese_index not in self.chosen_columns_: + w = self._solve_two_weights_min_c(hypothese, y) + if w[0] != "break": + c_borns.append(self._cbound(w[0])) + possible_sols.append(w) + indices.append(hypothese_index) + else: + causes.append(w[1]) + if c_borns: + min_c_born_index = ma.argmin(c_borns) + selected_sol = possible_sols[min_c_born_index] + selected_voter_index = indices[min_c_born_index] + return selected_sol, selected_voter_index + else: + return "break", "smthng" + + def _update_example_weights(self, y): + new_weights = self.example_weights*np.exp(-self.q*y.reshape((self.n_total_examples, 1))*self.weighted_sum) + self.example_weights = new_weights/np.sum(new_weights) + + def _solve_two_weights_min_c(self, next_column, y): + """Here we solve the min C-bound problem for two voters and return the best 2-weights array""" + m = next_column.shape[0] + zero_diag = np.ones((m, m)) - np.identity(m) + + weighted_previous_sum = np.multiply(np.multiply(y.reshape((m, 1)), self.previous_vote.reshape((m, 1))), self.example_weights.reshape((m,1))) + weighted_next_column = np.multiply(next_column.reshape((m,1)), self.example_weights.reshape((m,1))) + + mat_prev = np.repeat(weighted_previous_sum, m, axis=1) * zero_diag + mat_next = np.repeat(weighted_next_column, m, axis=1) * zero_diag + + self.B2 = np.sum((weighted_previous_sum - weighted_next_column) ** 2) + self.B1 = 
np.sum(2 * weighted_next_column * (weighted_previous_sum - 2 * weighted_next_column * weighted_next_column))
+        self.B0 = np.sum(weighted_next_column * weighted_next_column)
+
+        self.A2 = self.B2 + np.sum((mat_prev - mat_next) * np.transpose(mat_prev - mat_next))
+        self.A1 = self.B1 + np.sum(mat_prev * np.transpose(mat_next) - mat_next * np.transpose(mat_prev) - 2 * mat_next * np.transpose(mat_next))
+        self.A0 = self.B0 + np.sum(mat_next * np.transpose(mat_next))
+        C2 = (self.A1 * self.B2 - self.A2 * self.B1)
+        C1 = 2 * (self.A0 * self.B2 - self.A2 * self.B0)
+        C0 = self.A0 * self.B1 - self.A1 * self.B0
+
+        if C2 == 0:
+            if C1 == 0:
+                return np.array([0.5, 0.5])
+            elif abs(C1) > 0:
+                return np.array([0., 1.])
+            else:
+                return ["break", "the derivative was constant."]
+        try:
+            sols = np.roots(np.array([C2, C1, C0]))
+        except Exception:
+            return ["break", "nan"]
+
+        is_acceptable, sol = self._analyze_solutions(sols)
+        if is_acceptable:
+            return np.array([sol, 1 - sol])
+        else:
+            return ["break", sol]
+
+    def _analyze_solutions(self, sols):
+        """Check that the solution found by np.roots is acceptable under our constraints
+        (real, a minimum, and between 0 and 1)."""
+        for sol_index, sol in enumerate(sols):
+            if isinstance(sol, complex):
+                sols[sol_index] = -1
+        if sols.shape[0] == 1:
+            if self._cbound(sols[0]) < self._cbound(sols[0] + 1):
+                best_sol = sols[0]
+            else:
+                return False, " the only solution was a maximum."
+        elif sols.shape[0] == 2:
+            best_sol = self._best_sol(sols)
+        else:
+            return False, " no solution was found."
+
+        if 0 < best_sol < 1:
+            return True, self._best_sol(sols)
+        elif best_sol <= 0:
+            return False, " the minimum was below 0."
+        else:
+            return False, " the minimum was over 1."
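+
+    # Note on label conventions in this class: fit() maps labels from {0, 1} to
+    # {-1, 1} before boosting and restores them before returning, and predict()
+    # maps the sign of the margin (classification_matrix . weights_) back to
+    # {0, 1}, keeping the classifier compatible with the platform's label encoding.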
+
+    def _cbound(self, sol):
+        """Compute the objective function (the C-bound of the two-voter combination)."""
+        return 1 - (self.A2 * sol ** 2 + self.A1 * sol + self.A0) / (self.B2 * sol ** 2 + self.B1 * sol + self.B0)
+
+    def _best_sol(self, sols):
+        values = np.array([self._cbound(sol) for sol in sols])
+        return sols[np.argmin(values)]
+
+
+class QarBoostNCClassifier(ColumnGenerationClassifierQarNC):
+    def __init__(self, mu=0.001, epsilon=1e-08, n_max_iterations=None, estimators_generator=None, save_iteration_as_hyperparameter_each=None, random_state=42):
+        super(QarBoostNCClassifier, self).__init__(epsilon, n_max_iterations, estimators_generator, dual_constraint_rhs=0,
+                                                   save_iteration_as_hyperparameter_each=save_iteration_as_hyperparameter_each, random_state=random_state)
+        self.mu = mu
+        self.train_time = 0
+
+    def _initialize_alphas(self, n_examples):
+        return 1.0 / n_examples * np.ones((n_examples,))
+
+
+class QarBoostNC(QarBoostNCClassifier):
+
+    def __init__(self, random_state, **kwargs):
+        super(QarBoostNC, self).__init__(
+            mu=kwargs['mu'],
+            epsilon=kwargs['epsilon'],
+            n_max_iterations=kwargs['n_max_iterations'],
+            random_state=random_state)
+
+    def canProbas(self):
+        """Used to know if the classifier can return label probabilities"""
+        return False
+
+    def paramsToSrt(self, nIter=1):
+        """Used for weighted linear early fusion to generate random search sets"""
+        paramsSet = []
+        for _ in range(nIter):
+            paramsSet.append({"mu": 0.001,
+                              "epsilon": 1e-08,
+                              "n_max_iterations": None})
+        return paramsSet
+
+    def getKWARGS(self, args):
+        """Used to format kwargs for the parsed args"""
+        kwargsDict = {}
+        kwargsDict['mu'] = 0.001
+        kwargsDict['epsilon'] = 1e-08
+        kwargsDict['n_max_iterations'] = None
+        return kwargsDict
+
+    def genPipeline(self):
+        return Pipeline([('classifier', QarBoostNCClassifier())])
+
+    def genParamsDict(self, randomState):
+        return {"classifier__mu": [0.001],
+                "classifier__epsilon": [1e-08],
+                "classifier__n_max_iterations": [None]}
+
+    def genBestParams(self, detector):
+        return {"mu": detector.best_params_["classifier__mu"],
+                "epsilon": detector.best_params_["classifier__epsilon"],
+                "n_max_iterations": detector.best_params_["classifier__n_max_iterations"]}
+
+    def genParamsFromDetector(self, detector):
+        nIter = len(detector.cv_results_['param_classifier__mu'])
+        return [("mu", np.array([0.001 for _ in range(nIter)])),
+                ("epsilon", np.array(detector.cv_results_['param_classifier__epsilon'])),
+                ("n_max_iterations", np.array(detector.cv_results_['param_classifier__n_max_iterations']))]
+
+    def getConfig(self, config):
+        if type(config) is not dict:  # Used in late fusion when config is a classifier
+            return "\n\t\t- QarBoost with mu : " + str(config.mu) + ", epsilon : " + str(
+                config.epsilon) + ", n_max_iterations : " + str(config.n_max_iterations)
+        else:
+            return "\n\t\t- QarBoost with mu : " + str(config["mu"]) + ", epsilon : " + str(
+                config["epsilon"]) + ", n_max_iterations : " + str(config["n_max_iterations"])
+
+    def getInterpret(self, classifier, directory):
+        interpretString = ""
+        return interpretString
+
+
+def canProbas():
+    return False
+
+
+def fit(DATASET, CLASS_LABELS, randomState, NB_CORES=1, **kwargs):
+    """Used to fit the monoview classifier with the args stored in kwargs"""
+    start = time.time()
+    classifier = QarBoostNCClassifier(mu=kwargs['mu'],
+                                      epsilon=kwargs['epsilon'],
+                                      n_max_iterations=kwargs["n_max_iterations"],
+                                      random_state=randomState)
+    classifier.fit(DATASET, CLASS_LABELS)
+    end = time.time()
+    classifier.train_time = end - start
+    return classifier
+
+
+def 
paramsToSet(nIter, randomState): + """Used for weighted linear early fusion to generate random search sets""" + paramsSet = [] + for _ in range(nIter): + paramsSet.append({"mu": randomState.uniform(1e-02, 10**(-0.5)), + "epsilon": 10**-randomState.randint(1, 15), + "n_max_iterations": None}) + return paramsSet + + +def getKWARGS(args): + """Used to format kwargs for the parsed args""" + kwargsDict = {} + kwargsDict['mu'] = args.QarBNC_mu + kwargsDict['epsilon'] = args.QarBNC_epsilon + kwargsDict['n_max_iterations'] = None + return kwargsDict + + +def genPipeline(): + return Pipeline([('classifier', QarBoostNCClassifier())]) + + +def genParamsDict(randomState): + return {"classifier__mu": CustomUniform(loc=.5, state=2, multiplier='e-'), + "classifier__epsilon": CustomRandint(low=1, high=15, multiplier='e-'), + "classifier__n_max_iterations": [None], + "classifier__random_state":[randomState]} + + +def genBestParams(detector): + return {"mu": detector.best_params_["classifier__mu"], + "epsilon": detector.best_params_["classifier__epsilon"], + "n_max_iterations": detector.best_params_["classifier__n_max_iterations"]} + + +def genParamsFromDetector(detector): + nIter = len(detector.cv_results_['param_classifier__mu']) + return [("mu", np.array(detector.cv_results_['param_classifier__mu'])), + ("epsilon", np.array(detector.cv_results_['param_classifier__epsilon'])), + ("n_max_iterations", np.array(detector.cv_results_['param_classifier__n_max_iterations']))] + + +def getConfig(config): + if type(config) is not dict: # Used in late fusion when config is a classifier + return "\n\t\t- QarBoost with mu : " + str(config.mu) + ", epsilon : " + str( + config.epsilon) + ", n_max_iterations : " + str(config.n_max_iterations) + else: + return "\n\t\t- QarBoost with mu : " + str(config["mu"]) + ", epsilon : " + str( + config["epsilon"]) + ", n_max_iterations : " + str(config["n_max_iterations"]) + + +def getInterpret(classifier, directory): + return getInterpretBase(classifier, directory, "QarBoost", classifier.weights_, classifier.break_cause) + diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/QarBoostNC2.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/QarBoostNC2.py new file mode 100644 index 0000000000000000000000000000000000000000..a474b714ccec154d93e1137ed2f56b28a1e6197a --- /dev/null +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/QarBoostNC2.py @@ -0,0 +1,402 @@ +import scipy +import logging +import numpy as np +import numpy.ma as ma +from collections import defaultdict +import math +from sklearn.utils.validation import check_is_fitted +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.pipeline import Pipeline +from sklearn.metrics import accuracy_score +import time +import datetime + +from ..Monoview.MonoviewUtils import CustomUniform, CustomRandint +from ..Monoview.BoostUtils import StumpsClassifiersGenerator, sign, BaseBoost, getInterpretBase + + +class ColumnGenerationClassifierQarNC2(BaseEstimator, ClassifierMixin, BaseBoost): + def __init__(self, epsilon=1e-06, n_max_iterations=None, estimators_generator=None, dual_constraint_rhs=0, save_iteration_as_hyperparameter_each=None, random_state=42): + super(ColumnGenerationClassifierQarNC2, self).__init__() + self.epsilon = epsilon + self.n_max_iterations = n_max_iterations + self.estimators_generator = estimators_generator + self.dual_constraint_rhs = dual_constraint_rhs + self.save_iteration_as_hyperparameter_each = save_iteration_as_hyperparameter_each + 
self.random_state = random_state + + def fit(self, X, y): + if scipy.sparse.issparse(X): + logging.info('Converting to dense matrix.') + X = np.array(X.todense()) + + if self.estimators_generator is None: + self.estimators_generator = StumpsClassifiersGenerator(n_stumps_per_attribute=self.n_stumps, self_complemented=False) + + y[y == 0] = -1 + + self.estimators_generator.fit(X, y) + self.classification_matrix = self._binary_classification_matrix(X) + + + self.weights_ = [] + self.infos_per_iteration_ = defaultdict(list) + + m, n = self.classification_matrix.shape + y_kernel_matrix = np.multiply(y.reshape((len(y), 1)), self.classification_matrix) + + # Initialization + + self.collected_weight_vectors_ = {} + self.collected_dual_constraint_violations_ = {} + + self.example_weights = self._initialize_alphas(m).reshape((m,1)) + + self.chosen_columns_ = [] + self.fobidden_columns = [] + self.edge_scores = [] + self.epsilons = [] + self.example_weights_ = [self.example_weights] + self.train_accuracies = [] + self.previous_votes = [] + + self.n_total_hypotheses_ = n + self.n_total_examples = m + + for k in range(min(n, self.n_max_iterations if self.n_max_iterations is not None else np.inf)): + # To choose the first voter, we select the one that has the best margin. + if k == 0: + first_voter_index = self._find_best_margin(y_kernel_matrix) + self.chosen_columns_.append(first_voter_index) + + self.previous_vote = self.classification_matrix[:, first_voter_index].reshape((m,1)) + self.weighted_sum = self.classification_matrix[:, first_voter_index].reshape((m,1)) + + epsilon = self._compute_epsilon() + self.epsilons.append(epsilon) + self.q = math.log((1-epsilon)/epsilon) + self.weights_.append(self.q) + + self._update_example_weights(y) + self.example_weights_.append(self.example_weights) + self.train_accuracies.append(accuracy_score(y, np.sign(self.previous_vote))) + continue + + # Find best weak hypothesis given example_weights. Select the one that has the lowest minimum + # C-bound with the previous vote + sol, new_voter_index = self._find_new_voter(y_kernel_matrix, y) + if type(sol) == str: + self.break_cause = " no more hypothesis were able to improve the boosted vote." + break + + # Append the weak hypothesis. + self.chosen_columns_.append(new_voter_index) + self.weighted_sum = np.matmul(np.concatenate((self.previous_vote, self.classification_matrix[:, new_voter_index].reshape((m,1))), axis=1), + sol).reshape((m,1)) + + # Generate the new weight for the new voter + epsilon = self._compute_epsilon() + self.epsilons.append(epsilon) + if epsilon == 0. or math.log((1 - epsilon) / epsilon) == math.inf: + self.chosen_columns_.pop() + self.break_cause = " epsilon was too small." + break + self.q = math.log((1 - epsilon) / epsilon) + self.weights_.append(self.q) + + # Update the distribution on the examples. 
+ self._update_example_weights(y) + self.example_weights_.append(self.example_weights) + + # Update the "previous vote" to prepare for the next iteration + self.previous_vote = np.matmul(self.classification_matrix[:, self.chosen_columns_], + np.array(self.weights_).reshape((k + 1, 1))).reshape((m, 1)) + self.previous_votes.append(self.previous_vote) + self.train_accuracies.append(accuracy_score(y, np.sign(self.previous_vote))) + + self.nb_opposed_voters = self.check_opposed_voters() + self.estimators_generator.estimators_ = self.estimators_generator.estimators_[self.chosen_columns_] + self.weights_ = np.array(self.weights_) + + self.weights_/=np.sum(self.weights_) + y[y == -1] = 0 + + return self + + def predict(self, X): + start = time.time() + check_is_fitted(self, 'weights_') + if scipy.sparse.issparse(X): + logging.warning('Converting sparse matrix to dense matrix.') + X = np.array(X.todense()) + classification_matrix = self._binary_classification_matrix(X) + margins = np.squeeze(np.asarray(np.matmul(classification_matrix, self.weights_))) + signs_array = np.array([int(x) for x in sign(margins)]) + signs_array[signs_array == -1] = 0 + end = time.time() + self.predict_time = end - start + return signs_array + + def _compute_epsilon(self,): + """Updating the \epsilon varaible""" + ones_matrix = np.zeros(self.weighted_sum.shape) + ones_matrix[self.weighted_sum < 0] = 1 + epsilon = (1.0/self.n_total_examples)*np.sum(self.example_weights*ones_matrix, axis=0) + return epsilon + + def _find_best_margin(self, y_kernel_matrix): + """Used only on the first iteration to select the voter with the largest margin""" + pseudo_h_values = ma.array(np.sum(y_kernel_matrix, axis=0), fill_value=-np.inf) + pseudo_h_values[self.fobidden_columns] = ma.masked + worst_h_index = ma.argmax(pseudo_h_values) + return worst_h_index + + def _find_new_voter(self, y_kernel_matrix, y): + """Here, we solve the two_voters_mincq_problem for each potential new voter, + and select the one that has the smallest minimum""" + c_borns = [] + possible_sols = [] + indices = [] + for hypothese_index, hypothese in enumerate(y_kernel_matrix.transpose()): + causes = [] + w = self._solve_two_weights_min_c(hypothese, y) + if w[0] != "break": + c_borns.append(self._cbound(w[0])) + possible_sols.append(w) + indices.append(hypothese_index) + else: + causes.append(w[1]) + if c_borns: + min_c_born_index = ma.argmin(c_borns) + selected_sol = possible_sols[min_c_born_index] + selected_voter_index = indices[min_c_born_index] + return selected_sol, selected_voter_index + else: + return "break", "smthng" + + def _update_example_weights(self, y): + new_weights = self.example_weights*np.exp(-self.q*y.reshape((self.n_total_examples, 1))*self.weighted_sum) + self.example_weights = new_weights/np.sum(new_weights) + + def _solve_two_weights_min_c(self, next_column, y): + """Here we solve the min C-bound problem for two voters and return the best 2-weights array""" + m = next_column.shape[0] + zero_diag = np.ones((m, m)) - np.identity(m) + + weighted_previous_sum = np.multiply(np.multiply(y.reshape((m, 1)), self.previous_vote.reshape((m, 1))), self.example_weights.reshape((m,1))) + weighted_next_column = np.multiply(next_column.reshape((m,1)), self.example_weights.reshape((m,1))) + + mat_prev = np.repeat(weighted_previous_sum, m, axis=1) * zero_diag + mat_next = np.repeat(weighted_next_column, m, axis=1) * zero_diag + + self.B2 = np.sum((weighted_previous_sum - weighted_next_column) ** 2) + self.B1 = np.sum(2 * weighted_next_column * 
(weighted_previous_sum - 2 * weighted_next_column * weighted_next_column))
+        self.B0 = np.sum(weighted_next_column * weighted_next_column)
+
+        self.A2 = self.B2 + np.sum((mat_prev - mat_next) * np.transpose(mat_prev - mat_next))
+        self.A1 = self.B1 + np.sum(mat_prev * np.transpose(mat_next) - mat_next * np.transpose(mat_prev) - 2 * mat_next * np.transpose(mat_next))
+        self.A0 = self.B0 + np.sum(mat_next * np.transpose(mat_next))
+        C2 = (self.A1 * self.B2 - self.A2 * self.B1)
+        C1 = 2 * (self.A0 * self.B2 - self.A2 * self.B0)
+        C0 = self.A0 * self.B1 - self.A1 * self.B0
+
+        if C2 == 0:
+            if C1 == 0:
+                return np.array([0.5, 0.5])
+            elif abs(C1) > 0:
+                return np.array([0., 1.])
+            else:
+                return ["break", "the derivative was constant."]
+        try:
+            sols = np.roots(np.array([C2, C1, C0]))
+        except Exception:
+            return ["break", "nan"]
+
+        is_acceptable, sol = self._analyze_solutions(sols)
+        if is_acceptable:
+            return np.array([sol, 1 - sol])
+        else:
+            return ["break", sol]
+
+    def _analyze_solutions(self, sols):
+        """Check that the solution found by np.roots is acceptable under our constraints
+        (real, a minimum, and between 0 and 1)."""
+        for sol_index, sol in enumerate(sols):
+            if isinstance(sol, complex):
+                sols[sol_index] = -1
+        if sols.shape[0] == 1:
+            if self._cbound(sols[0]) < self._cbound(sols[0] + 1):
+                best_sol = sols[0]
+            else:
+                return False, " the only solution was a maximum."
+        elif sols.shape[0] == 2:
+            best_sol = self._best_sol(sols)
+        else:
+            return False, " no solution was found."
+
+        if 0 < best_sol < 1:
+            return True, self._best_sol(sols)
+        elif best_sol <= 0:
+            return False, " the minimum was below 0."
+        else:
+            return False, " the minimum was over 1."
+
+    def _cbound(self, sol):
+        """Compute the objective function (the C-bound of the two-voter combination)."""
+        return 1 - (self.A2 * sol ** 2 + self.A1 * sol + self.A0) / (self.B2 * sol ** 2 + self.B1 * sol + self.B0)
+
+    def _best_sol(self, sols):
+        values = np.array([self._cbound(sol) for sol in sols])
+        return sols[np.argmin(values)]
+
+
+class QarBoostNC2Classifier(ColumnGenerationClassifierQarNC2):
+    def __init__(self, mu=0.001, epsilon=1e-08, n_max_iterations=None, estimators_generator=None, save_iteration_as_hyperparameter_each=None, random_state=42):
+        super(QarBoostNC2Classifier, self).__init__(epsilon, n_max_iterations, estimators_generator, dual_constraint_rhs=0,
+                                                    save_iteration_as_hyperparameter_each=save_iteration_as_hyperparameter_each, random_state=random_state)
+        self.mu = mu
+        self.train_time = 0
+
+    def _initialize_alphas(self, n_examples):
+        return 1.0 / n_examples * np.ones((n_examples,))
+
+
+class QarBoostNC2(QarBoostNC2Classifier):
+
+    def __init__(self, random_state, **kwargs):
+        super(QarBoostNC2, self).__init__(
+            mu=kwargs['mu'],
+            epsilon=kwargs['epsilon'],
+            n_max_iterations=kwargs['n_max_iterations'],
+            random_state=random_state)
+
+    def canProbas(self):
+        """Used to know if the classifier can return label probabilities"""
+        return False
+
+    def paramsToSrt(self, nIter=1):
+        """Used for weighted linear early fusion to generate random search sets"""
+        paramsSet = []
+        for _ in range(nIter):
+            paramsSet.append({"mu": 0.001,
+                              "epsilon": 1e-08,
+                              "n_max_iterations": None})
+        return paramsSet
+
+    def getKWARGS(self, args):
+        """Used to format kwargs for the parsed args"""
+        kwargsDict = {}
+        kwargsDict['mu'] = 0.001
+        kwargsDict['epsilon'] = 1e-08
+        kwargsDict['n_max_iterations'] = None
+        return kwargsDict
+
+    def genPipeline(self):
+        return Pipeline([('classifier', QarBoostNC2Classifier())])
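+
+    # Hypothetical usage of the hyper-parameter helpers defined in this wrapper
+    # (a sketch; the actual driver lives in the platform's search utilities):
+    #     from sklearn.model_selection import RandomizedSearchCV
+    #     detector = RandomizedSearchCV(self.genPipeline(),
+    #                                   param_distributions=self.genParamsDict(randomState),
+    #                                   n_iter=nIter, random_state=randomState)
+    #     detector.fit(X, y)
+    #     bestParams = self.genBestParams(detector)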
+
+    def genParamsDict(self, randomState):
+        return {"classifier__mu": [0.001],
+                "classifier__epsilon": [1e-08],
+                "classifier__n_max_iterations": [None]}
+
+    def genBestParams(self, detector):
+        return {"mu": detector.best_params_["classifier__mu"],
+                "epsilon": detector.best_params_["classifier__epsilon"],
+                "n_max_iterations": detector.best_params_["classifier__n_max_iterations"]}
+
+    def genParamsFromDetector(self, detector):
+        nIter = len(detector.cv_results_['param_classifier__mu'])
+        return [("mu", np.array([0.001 for _ in range(nIter)])),
+                ("epsilon", np.array(detector.cv_results_['param_classifier__epsilon'])),
+                ("n_max_iterations", np.array(detector.cv_results_['param_classifier__n_max_iterations']))]
+
+    def getConfig(self, config):
+        if type(config) is not dict:  # Used in late fusion when config is a classifier
+            return "\n\t\t- QarBoost with mu : " + str(config.mu) + ", epsilon : " + str(
+                config.epsilon) + ", n_max_iterations : " + str(config.n_max_iterations)
+        else:
+            return "\n\t\t- QarBoost with mu : " + str(config["mu"]) + ", epsilon : " + str(
+                config["epsilon"]) + ", n_max_iterations : " + str(config["n_max_iterations"])
+
+    def getInterpret(self, classifier, directory):
+        interpretString = ""
+        return interpretString
+
+
+def canProbas():
+    return False
+
+
+def fit(DATASET, CLASS_LABELS, randomState, NB_CORES=1, **kwargs):
+    """Used to fit the monoview classifier with the args stored in kwargs"""
+    start = time.time()
+    classifier = QarBoostNC2Classifier(mu=kwargs['mu'],
+                                       epsilon=kwargs['epsilon'],
+                                       n_max_iterations=kwargs["n_max_iterations"],
+                                       random_state=randomState)
+    classifier.fit(DATASET, CLASS_LABELS)
+    end = time.time()
+    classifier.train_time = end - start
+    return classifier
+
+
+def paramsToSet(nIter, randomState):
+    """Used for weighted linear early fusion to generate random search sets"""
+    paramsSet = []
+    for _ in range(nIter):
+        paramsSet.append({"mu": randomState.uniform(1e-02, 10**(-0.5)),
+                          "epsilon": 10**-randomState.randint(1, 15),
+                          "n_max_iterations": None})
+    return paramsSet
+
+
+def getKWARGS(args):
+    """Used to format kwargs for the parsed args"""
+    kwargsDict = {}
+    kwargsDict['mu'] = args.QarBNC2_mu
+    kwargsDict['epsilon'] = args.QarBNC2_epsilon
+    kwargsDict['n_max_iterations'] = None
+    return kwargsDict
+
+
+def genPipeline():
+    return Pipeline([('classifier', QarBoostNC2Classifier())])
+
+
+def genParamsDict(randomState):
+    return {"classifier__mu": CustomUniform(loc=.5, state=2, multiplier='e-'),
+            "classifier__epsilon": CustomRandint(low=1, high=15, multiplier='e-'),
+            "classifier__n_max_iterations": [None],
+            "classifier__random_state": [randomState]}
+
+
+def genBestParams(detector):
+    return {"mu": detector.best_params_["classifier__mu"],
+            "epsilon": detector.best_params_["classifier__epsilon"],
+            "n_max_iterations": detector.best_params_["classifier__n_max_iterations"]}
+
+
+def genParamsFromDetector(detector):
+    nIter = len(detector.cv_results_['param_classifier__mu'])
+    return [("mu", np.array(detector.cv_results_['param_classifier__mu'])),
+            ("epsilon", np.array(detector.cv_results_['param_classifier__epsilon'])),
+            ("n_max_iterations", np.array(detector.cv_results_['param_classifier__n_max_iterations']))]
+
+
+def getConfig(config):
+    if type(config) is not dict:  # Used in late fusion when config is a classifier
+        return "\n\t\t- QarBoost with mu : " + str(config.mu) + ", epsilon : " + str(
+            config.epsilon) + ", n_max_iterations : " + str(config.n_max_iterations)
+    else:
+        return "\n\t\t- QarBoost with mu : " + 
str(config["mu"]) + ", epsilon : " + str( + config["epsilon"]) + ", n_max_iterations : " + str(config["n_max_iterations"]) + + +def getInterpret(classifier, directory): + return getInterpretBase(classifier, directory, "QarBoost", classifier.weights_, classifier.break_cause) + diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/QarBoostv2.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/QarBoostv2.py new file mode 100644 index 0000000000000000000000000000000000000000..c68094b72169745ff6bde65f4ad68a0955bded57 --- /dev/null +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/QarBoostv2.py @@ -0,0 +1,403 @@ +import scipy +import logging +import numpy as np +import numpy.ma as ma +from collections import defaultdict +import math +from sklearn.utils.validation import check_is_fitted +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.pipeline import Pipeline +from sklearn.metrics import accuracy_score +import time +import datetime + +from ..Monoview.MonoviewUtils import CustomUniform, CustomRandint +from..Monoview.BoostUtils import StumpsClassifiersGenerator, sign, BaseBoost, getInterpretBase + + +class ColumnGenerationClassifierQar2(BaseEstimator, ClassifierMixin, BaseBoost): + def __init__(self, epsilon=1e-06, n_max_iterations=None, estimators_generator=None, dual_constraint_rhs=0, save_iteration_as_hyperparameter_each=None, random_state=42): + super(ColumnGenerationClassifierQar2, self).__init__() + self.epsilon = epsilon + self.n_max_iterations = n_max_iterations + self.estimators_generator = estimators_generator + self.dual_constraint_rhs = dual_constraint_rhs + self.save_iteration_as_hyperparameter_each = save_iteration_as_hyperparameter_each + self.random_state = random_state + + def fit(self, X, y): + if scipy.sparse.issparse(X): + logging.info('Converting to dense matrix.') + X = np.array(X.todense()) + + if self.estimators_generator is None: + self.estimators_generator = StumpsClassifiersGenerator(n_stumps_per_attribute=self.n_stumps, self_complemented=True) + + y[y == 0] = -1 + + self.estimators_generator.fit(X, y) + self.classification_matrix = self._binary_classification_matrix(X) + + + self.weights_ = [] + self.infos_per_iteration_ = defaultdict(list) + + m, n = self.classification_matrix.shape + y_kernel_matrix = np.multiply(y.reshape((len(y), 1)), self.classification_matrix) + + # Initialization + + self.collected_weight_vectors_ = {} + self.collected_dual_constraint_violations_ = {} + + self.example_weights = self._initialize_alphas(m).reshape((m,1)) + + self.chosen_columns_ = [] + self.fobidden_columns = [] + self.edge_scores = [] + self.epsilons = [] + self.example_weights_ = [self.example_weights] + self.train_accuracies = [] + self.previous_votes = [] + + self.n_total_hypotheses_ = n + self.n_total_examples = m + + for k in range(min(n, self.n_max_iterations if self.n_max_iterations is not None else np.inf)): + # To choose the first voter, we select the one that has the best margin. 
+            if k == 0:
+                first_voter_index = self._find_best_margin(y_kernel_matrix)
+                self.chosen_columns_.append(first_voter_index)
+
+                self.previous_vote = self.classification_matrix[:, first_voter_index].reshape((m, 1))
+                self.weighted_sum = self.classification_matrix[:, first_voter_index].reshape((m, 1))
+
+                epsilon = self._compute_epsilon()
+                self.epsilons.append(epsilon)
+                self.q = math.log((1 - epsilon) / epsilon)
+                self.weights_.append(self.q)
+
+                self._update_example_weights(y)
+                self.example_weights_.append(self.example_weights)
+
+                self.train_accuracies.append(accuracy_score(y, np.sign(self.previous_vote)))
+                continue
+
+            # Find the best weak hypothesis given example_weights: the one that attains the
+            # lowest minimal C-bound when combined with the previous vote.
+            sol, new_voter_index = self._find_new_voter(y_kernel_matrix, y)
+            if isinstance(sol, str):
+                self.break_cause = " no more hypotheses were able to improve the boosted vote."
+                break
+
+            # Append the weak hypothesis.
+            self.chosen_columns_.append(new_voter_index)
+            self.weighted_sum = np.matmul(np.concatenate((self.previous_vote, self.classification_matrix[:, new_voter_index].reshape((m, 1))), axis=1),
+                                          sol).reshape((m, 1))
+
+            # Generate the new weight for the new voter.
+            epsilon = self._compute_epsilon()
+            self.epsilons.append(epsilon)
+            if epsilon == 0. or math.log((1 - epsilon) / epsilon) == math.inf:
+                self.chosen_columns_.pop()
+                self.break_cause = " epsilon was too small."
+                break
+            self.q = math.log((1 - epsilon) / epsilon)
+            self.weights_.append(self.q)
+
+            # Update the distribution on the examples.
+            self._update_example_weights(y)
+            self.example_weights_.append(self.example_weights)
+
+            # Update the "previous vote" to prepare the next iteration.
+            self.previous_vote = np.matmul(self.classification_matrix[:, self.chosen_columns_],
+                                           np.array(self.weights_).reshape((k + 1, 1))).reshape((m, 1))
+
+            self.train_accuracies.append(accuracy_score(y, np.sign(self.previous_vote)))
+
+        self.nb_opposed_voters = self.check_opposed_voters()
+        self.estimators_generator.estimators_ = self.estimators_generator.estimators_[self.chosen_columns_]
+        self.weights_ = np.array(self.weights_)
+
+        self.weights_ /= np.sum(self.weights_)
+        y[y == -1] = 0
+
+        return self
+
+    def predict(self, X):
+        start = time.time()
+        check_is_fitted(self, 'weights_')
+        if scipy.sparse.issparse(X):
+            logging.warning('Converting sparse matrix to dense matrix.')
+            X = np.array(X.todense())
+        classification_matrix = self._binary_classification_matrix(X)
+        margins = np.squeeze(np.asarray(np.matmul(classification_matrix, self.weights_)))
+        signs_array = np.array([int(x) for x in sign(margins)])
+        signs_array[signs_array == -1] = 0
+        end = time.time()
+        self.predict_time = end - start
+        return signs_array
+
+    def _compute_epsilon(self):
+        """Compute the epsilon used to weight the latest vote."""
+        ones_matrix = np.zeros(self.weighted_sum.shape)
+        ones_matrix[self.weighted_sum < 0] = 1
+        epsilon = (1.0 / self.n_total_examples) * np.sum(self.example_weights * ones_matrix, axis=0)
+        return epsilon
+
+    def _find_best_margin(self, y_kernel_matrix):
+        """Used only on the first iteration, to select the voter with the largest margin."""
+        pseudo_h_values = ma.array(np.sum(y_kernel_matrix, axis=0), fill_value=-np.inf)
+        pseudo_h_values[self.fobidden_columns] = ma.masked
+        worst_h_index = ma.argmax(pseudo_h_values)
+        return worst_h_index
+
+    def _find_new_voter(self, y_kernel_matrix, y):
+        """Solve the two-voters min-C-bound problem for each potential new voter,
+        and select the one that attains the smallest minimum."""
+        c_borns = []
+        possible_sols = []
+        indices = []
+        for hypothese_index, hypothese in enumerate(y_kernel_matrix.transpose()):
+            causes = []
+            w = self._solve_two_weights_min_c(hypothese, y)
+            if w[0] != "break":
+                c_borns.append(self._cbound(w[0]))
+                possible_sols.append(w)
+                indices.append(hypothese_index)
+            else:
+                causes.append(w[1])
+        if c_borns:
+            min_c_born_index = ma.argmin(c_borns)
+            selected_sol = possible_sols[min_c_born_index]
+            selected_voter_index = indices[min_c_born_index]
+            return selected_sol, selected_voter_index
+        else:
+            return "break", " no voter yielded an acceptable weighting."
+
+    def _update_example_weights(self, y):
+        new_weights = self.example_weights * np.exp(-self.q * y.reshape((self.n_total_examples, 1)) * self.weighted_sum)
+        self.example_weights = new_weights / np.sum(new_weights)
+
+    def _solve_two_weights_min_c(self, next_column, y):
+        """Solve the min C-bound problem for two voters and return the best 2-weights array."""
+        m = next_column.shape[0]
+        zero_diag = np.ones((m, m)) - np.identity(m)
+
+        weighted_previous_sum = np.multiply(np.multiply(y.reshape((m, 1)), self.previous_vote.reshape((m, 1))), self.example_weights.reshape((m, 1)))
+        weighted_next_column = np.multiply(next_column.reshape((m, 1)), self.example_weights.reshape((m, 1)))
+
+        mat_prev = np.repeat(weighted_previous_sum, m, axis=1) * zero_diag
+        mat_next = np.repeat(weighted_next_column, m, axis=1) * zero_diag
+
+        self.B2 = np.sum((weighted_previous_sum - weighted_next_column) ** 2)
+        self.B1 = np.sum(2 * weighted_next_column * (weighted_previous_sum - 2 * weighted_next_column * weighted_next_column))
+        self.B0 = np.sum(weighted_next_column * weighted_next_column)
+
+        self.A2 = self.B2 + np.sum((mat_prev - mat_next) * np.transpose(mat_prev - mat_next))
+        self.A1 = self.B1 + np.sum(mat_prev * np.transpose(mat_next) - mat_next * np.transpose(mat_prev) - 2 * mat_next * np.transpose(mat_next))
+        self.A0 = self.B0 + np.sum(mat_next * np.transpose(mat_next))
+
+        # The stationary points of the C-bound ratio are the roots of C2*a**2 + C1*a + C0.
+        C2 = (self.A1 * self.B2 - self.A2 * self.B1)
+        C1 = 2 * (self.A0 * self.B2 - self.A2 * self.B0)
+        C0 = self.A0 * self.B1 - self.A1 * self.B0
+
+        if C2 == 0:
+            if C1 == 0:
+                # The derivative is constant: no interior optimum, keep equal weights.
+                return np.array([0.5, 0.5])
+            else:
+                # The derivative is affine: put all the weight on the new voter.
+                return np.array([0., 1.])
+        try:
+            sols = np.roots(np.array([C2, C1, C0]))
+        except Exception:
+            # np.roots fails when a coefficient is nan or inf.
+            return ["break", "nan"]
+
+        is_acceptable, sol = self._analyze_solutions(sols)
+        if is_acceptable:
+            return np.array([sol, 1 - sol])
+        else:
+            return ["break", sol]
+
+    def _analyze_solutions(self, sols):
+        """Check that the solution found by np.roots is acceptable under our constraints
+        (real, a minimum and between 0 and 1)."""
+        # np.roots returns a complex dtype as soon as one root is complex; map complex
+        # roots to -1 (rejected below) and keep the real part of the others.
+        if np.iscomplexobj(sols):
+            sols = np.where(np.abs(sols.imag) > 1e-12, -1., sols.real)
+        if sols.shape[0] == 1:
+            # Crude curvature check: keep the root only if the C-bound increases to its right.
+            if self._cbound(sols[0]) < self._cbound(sols[0] + 1):
+                best_sol = sols[0]
+            else:
+                return False, " the only solution was a maximum."
+        elif sols.shape[0] == 2:
+            best_sol = self._best_sol(sols)
+        else:
+            return False, " no solutions were found."
+
+        if 0 < best_sol < 1:
+            return True, self._best_sol(sols)
+        elif best_sol <= 0:
+            return False, " the minimum was below 0."
+        else:
+            return False, " the minimum was over 1."
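+
+    # For intuition, a minimal numeric sketch (with made-up coefficients, not produced by the
+    # algorithm) of the two-voter weighting solved above: the candidate weights are the roots
+    # of C2*a**2 + C1*a + C0, and the admissible root (real, in (0, 1)) with the lowest
+    # C-bound is kept; the previous vote gets alpha, the new voter 1 - alpha.
+    #
+    #     A2, A1, A0 = 2.0, -1.0, 1.0   # hypothetical numerator coefficients
+    #     B2, B1, B0 = 3.0, 1.0, 2.0    # hypothetical denominator coefficients
+    #     C2 = A1 * B2 - A2 * B1        # -5.0
+    #     C1 = 2 * (A0 * B2 - A2 * B0)  # -2.0
+    #     C0 = A0 * B1 - A1 * B0        # 3.0
+    #     sols = np.roots(np.array([C2, C1, C0]))    # roots 0.6 and -1.0
+    #     cbound = lambda a: 1 - (A2*a**2 + A1*a + A0) / (B2*a**2 + B1*a + B0)
+    #     admissible = [s.real for s in sols if abs(s.imag) < 1e-12 and 0 < s.real < 1]
+    #     alpha = min(admissible, key=cbound)        # 0.6
+    #     weights = np.array([alpha, 1 - alpha])     # [0.6, 0.4]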
+
+    def _cbound(self, sol):
+        """Compute the objective function (the C-bound) at sol."""
+        return 1 - (self.A2 * sol ** 2 + self.A1 * sol + self.A0) / (self.B2 * sol ** 2 + self.B1 * sol + self.B0)
+
+    def _best_sol(self, sols):
+        """Return the candidate root with the lowest C-bound."""
+        values = np.array([self._cbound(sol) for sol in sols])
+        return sols[np.argmin(values)]
+
+
+class QarBoostClassifierv2(ColumnGenerationClassifierQar2):
+    def __init__(self, mu=0.001, epsilon=1e-08, n_max_iterations=None, estimators_generator=None, save_iteration_as_hyperparameter_each=None, random_state=42):
+        super(QarBoostClassifierv2, self).__init__(epsilon, n_max_iterations, estimators_generator, dual_constraint_rhs=0,
+                                                   save_iteration_as_hyperparameter_each=save_iteration_as_hyperparameter_each, random_state=random_state)
+        self.mu = mu
+        self.train_time = 0
+
+    def _initialize_alphas(self, n_examples):
+        return 1.0 / n_examples * np.ones((n_examples,))
+
+
+class QarBoostv2(QarBoostClassifierv2):
+
+    def __init__(self, random_state, **kwargs):
+        super(QarBoostv2, self).__init__(
+            mu=kwargs['mu'],
+            epsilon=kwargs['epsilon'],
+            n_max_iterations=kwargs['n_max_iterations'],
+            random_state=random_state)
+
+    def canProbas(self):
+        """Used to know if the classifier can return label probabilities"""
+        return False
+
+    def paramsToSet(self, nIter=1):
+        """Used for weighted linear early fusion to generate random search sets"""
+        paramsSet = []
+        for _ in range(nIter):
+            paramsSet.append({"mu": 0.001,
+                              "epsilon": 1e-08,
+                              "n_max_iterations": None})
+        return paramsSet
+
+    def getKWARGS(self, args):
+        """Used to format kwargs for the parsed args"""
+        kwargsDict = {}
+        kwargsDict['mu'] = 0.001
+        kwargsDict['epsilon'] = 1e-08
+        kwargsDict['n_max_iterations'] = None
+        return kwargsDict
+
+    def genPipeline(self):
+        return Pipeline([('classifier', QarBoostClassifierv2())])
+
+    def genParamsDict(self, randomState):
+        return {"classifier__mu": [0.001],
+                "classifier__epsilon": [1e-08],
+                "classifier__n_max_iterations": [None]}
+
+    def genBestParams(self, detector):
+        return {"mu": detector.best_params_["classifier__mu"],
+                "epsilon": detector.best_params_["classifier__epsilon"],
+                "n_max_iterations": detector.best_params_["classifier__n_max_iterations"]}
+
+    def genParamsFromDetector(self, detector):
+        nIter = len(detector.cv_results_['param_classifier__mu'])
+        return [("mu", np.array([0.001 for _ in range(nIter)])),
+                ("epsilon", np.array(detector.cv_results_['param_classifier__epsilon'])),
+                ("n_max_iterations", np.array(detector.cv_results_['param_classifier__n_max_iterations']))]
+
+    def getConfig(self, config):
+        if type(config) is not dict:  # Used in late fusion when config is a classifier
+            return "\n\t\t- QarBoost with mu : " + str(config.mu) + ", epsilon : " + str(
+                config.epsilon) + ", n_max_iterations : " + str(config.n_max_iterations)
+        else:
+            return "\n\t\t- QarBoost with mu : " + str(config["mu"]) + ", epsilon : " + str(
+                config["epsilon"]) + ", n_max_iterations : " + str(config["n_max_iterations"])
+
+    def getInterpret(self, classifier, directory):
+        interpretString = ""
+        return interpretString
+
+
+def canProbas():
+    return False
+
+
+def fit(DATASET, CLASS_LABELS, randomState, NB_CORES=1, **kwargs):
+    """Used to fit the monoview classifier with the args stored in kwargs"""
+    start = time.time()
+    classifier = QarBoostClassifierv2(mu=kwargs['mu'],
+                                      epsilon=kwargs['epsilon'],
+                                      n_max_iterations=kwargs["n_max_iterations"],
+                                      random_state=randomState)
+    classifier.fit(DATASET, CLASS_LABELS)
+    end = time.time()
+    classifier.train_time = end - start
+    return classifier
+
+
+def 
paramsToSet(nIter, randomState): + """Used for weighted linear early fusion to generate random search sets""" + paramsSet = [] + for _ in range(nIter): + paramsSet.append({"mu": randomState.uniform(1e-02, 10**(-0.5)), + "epsilon": 10**-randomState.randint(1, 15), + "n_max_iterations": None}) + return paramsSet + + +def getKWARGS(args): + """Used to format kwargs for the parsed args""" + kwargsDict = {} + kwargsDict['mu'] = args.QarB2_mu + kwargsDict['epsilon'] = args.QarB2_epsilon + kwargsDict['n_max_iterations'] = None + return kwargsDict + + +def genPipeline(): + return Pipeline([('classifier', QarBoostClassifierv2())]) + + +def genParamsDict(randomState): + return {"classifier__mu": CustomUniform(loc=.5, state=2, multiplier='e-'), + "classifier__epsilon": CustomRandint(low=1, high=15, multiplier='e-'), + "classifier__n_max_iterations": [None], + "classifier__random_state":[randomState]} + + +def genBestParams(detector): + return {"mu": detector.best_params_["classifier__mu"], + "epsilon": detector.best_params_["classifier__epsilon"], + "n_max_iterations": detector.best_params_["classifier__n_max_iterations"]} + + +def genParamsFromDetector(detector): + nIter = len(detector.cv_results_['param_classifier__mu']) + return [("mu", np.array(detector.cv_results_['param_classifier__mu'])), + ("epsilon", np.array(detector.cv_results_['param_classifier__epsilon'])), + ("n_max_iterations", np.array(detector.cv_results_['param_classifier__n_max_iterations']))] + + +def getConfig(config): + if type(config) is not dict: # Used in late fusion when config is a classifier + return "\n\t\t- QarBoost with mu : " + str(config.mu) + ", epsilon : " + str( + config.epsilon) + ", n_max_iterations : " + str(config.n_max_iterations) + else: + return "\n\t\t- QarBoost with mu : " + str(config["mu"]) + ", epsilon : " + str( + config["epsilon"]) + ", n_max_iterations : " + str(config["n_max_iterations"]) + + +def getInterpret(classifier, directory): + break_cause = " of multiple reasons." 
+ return getInterpretBase(classifier, directory, "QarBoostv2", classifier.weights_, break_cause) + diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/QarBoostv3.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/QarBoostv3.py new file mode 100644 index 0000000000000000000000000000000000000000..a5e9b0f1ea5e38aae1c962e35c6ebf48996d8fc3 --- /dev/null +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/QarBoostv3.py @@ -0,0 +1,404 @@ +import scipy +import logging +import numpy as np +import numpy.ma as ma +from collections import defaultdict +import math +from sklearn.utils.validation import check_is_fitted +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.pipeline import Pipeline +from sklearn.metrics import accuracy_score +import time +import datetime + +from ..Monoview.MonoviewUtils import CustomUniform, CustomRandint +from ..Monoview.BoostUtils import StumpsClassifiersGenerator, sign, BaseBoost, getInterpretBase + + +class ColumnGenerationClassifierQar3(BaseEstimator, ClassifierMixin, BaseBoost): + def __init__(self, epsilon=1e-06, n_max_iterations=None, estimators_generator=None, dual_constraint_rhs=0, save_iteration_as_hyperparameter_each=None, random_state=42): + super(ColumnGenerationClassifierQar3, self).__init__() + self.epsilon = epsilon + self.n_max_iterations = n_max_iterations + self.estimators_generator = estimators_generator + self.dual_constraint_rhs = dual_constraint_rhs + self.save_iteration_as_hyperparameter_each = save_iteration_as_hyperparameter_each + self.random_state = random_state + + def fit(self, X, y): + if scipy.sparse.issparse(X): + logging.info('Converting to dense matrix.') + X = np.array(X.todense()) + + if self.estimators_generator is None: + self.estimators_generator = StumpsClassifiersGenerator(n_stumps_per_attribute=self.n_stumps, self_complemented=True) + + y[y == 0] = -1 + + self.estimators_generator.fit(X, y) + self.classification_matrix = self._binary_classification_matrix(X) + + + self.weights_ = [] + self.infos_per_iteration_ = defaultdict(list) + + m, n = self.classification_matrix.shape + y_kernel_matrix = np.multiply(y.reshape((len(y), 1)), self.classification_matrix) + + # Initialization + + self.collected_weight_vectors_ = {} + self.collected_dual_constraint_violations_ = {} + + self.example_weights = self._initialize_alphas(m).reshape((m,1)) + + self.chosen_columns_ = [] + self.fobidden_columns = [] + self.edge_scores = [] + self.epsilons = [] + self.example_weights_ = [self.example_weights] + self.train_accuracies = [] + self.previous_votes = [] + + self.n_total_hypotheses_ = n + self.n_total_examples = m + + for k in range(min(n, self.n_max_iterations if self.n_max_iterations is not None else np.inf)): + # To choose the first voter, we select the one that has the best margin. 
+            if k == 0:
+                first_voter_index = self._find_best_margin(y_kernel_matrix)
+                self.chosen_columns_.append(first_voter_index)
+
+                self.previous_vote = self.classification_matrix[:, first_voter_index].reshape((m, 1))
+                self.previous_votes.append(self.previous_vote)
+                self.new_voter = self.classification_matrix[:, first_voter_index].reshape((m, 1))
+
+                epsilon = self._compute_epsilon()
+                self.epsilons.append(epsilon)
+                self.q = math.log((1 - epsilon) / epsilon)
+                self.weights_.append(self.q)
+
+                self._update_example_weights(y)
+                self.example_weights_.append(self.example_weights)
+                self.train_accuracies.append(accuracy_score(y, np.sign(self.previous_vote)))
+                continue
+
+            # Find the best weak hypothesis given example_weights: the one that attains the
+            # lowest minimal C-bound when combined with the previous vote.
+            sol, new_voter_index = self._find_new_voter(y_kernel_matrix, y)
+            if isinstance(sol, str):
+                self.break_cause = " no more hypotheses were able to improve the boosted vote."
+                break
+
+            # Append the weak hypothesis.
+            self.chosen_columns_.append(new_voter_index)
+            # self.weighted_sum = np.matmul(np.concatenate((self.previous_vote, self.classification_matrix[:, new_voter_index].reshape((m,1))), axis=1),
+            #                               sol).reshape((m,1))
+            self.new_voter = self.classification_matrix[:, new_voter_index].reshape((m, 1))
+
+            # Generate the new weight for the new voter.
+            epsilon = self._compute_epsilon()
+            self.epsilons.append(epsilon)
+            if epsilon == 0. or math.log((1 - epsilon) / epsilon) == math.inf:
+                self.chosen_columns_.pop()
+                self.break_cause = " epsilon was too small."
+                break
+            self.q = math.log((1 - epsilon) / epsilon)
+            self.weights_.append(self.q)
+
+            # Update the distribution on the examples.
+            self._update_example_weights(y)
+            self.example_weights_.append(self.example_weights)
+
+            # Update the "previous vote" to prepare the next iteration.
+            self.previous_vote = np.matmul(self.classification_matrix[:, self.chosen_columns_],
+                                           np.array(self.weights_).reshape((k + 1, 1))).reshape((m, 1))
+            self.previous_votes.append(self.previous_vote)
+            self.train_accuracies.append(accuracy_score(y, np.sign(self.previous_vote)))
+
+        self.nb_opposed_voters = self.check_opposed_voters()
+        self.estimators_generator.estimators_ = self.estimators_generator.estimators_[self.chosen_columns_]
+        self.weights_ = np.array(self.weights_)
+
+        self.weights_ /= np.sum(self.weights_)
+        y[y == -1] = 0
+
+        return self
+
+    def predict(self, X):
+        start = time.time()
+        check_is_fitted(self, 'weights_')
+        if scipy.sparse.issparse(X):
+            logging.warning('Converting sparse matrix to dense matrix.')
+            X = np.array(X.todense())
+        classification_matrix = self._binary_classification_matrix(X)
+        margins = np.squeeze(np.asarray(np.matmul(classification_matrix, self.weights_)))
+        signs_array = np.array([int(x) for x in sign(margins)])
+        signs_array[signs_array == -1] = 0
+        end = time.time()
+        self.predict_time = end - start
+        return signs_array
+
+    def _compute_epsilon(self):
+        """Compute the epsilon used to weight the new voter."""
+        ones_matrix = np.zeros(self.new_voter.shape)
+        ones_matrix[self.new_voter < 0] = 1
+        epsilon = (1.0 / self.n_total_examples) * np.sum(self.example_weights * ones_matrix, axis=0)
+        return epsilon
+
+    def _find_best_margin(self, y_kernel_matrix):
+        """Used only on the first iteration, to select the voter with the largest margin."""
+        pseudo_h_values = ma.array(np.sum(y_kernel_matrix, axis=0), fill_value=-np.inf)
+        pseudo_h_values[self.fobidden_columns] = ma.masked
+        worst_h_index = ma.argmax(pseudo_h_values)
+        return worst_h_index
+
+    def _find_new_voter(self, y_kernel_matrix, y):
+        """Solve the two-voters min-C-bound problem for each potential new voter,
+        and select the one that attains the smallest minimum."""
+        c_borns = []
+        possible_sols = []
+        indices = []
+        for hypothese_index, hypothese in enumerate(y_kernel_matrix.transpose()):
+            causes = []
+            if hypothese_index not in self.chosen_columns_:
+                w = self._solve_two_weights_min_c(hypothese, y)
+                if w[0] != "break":
+                    c_borns.append(self._cbound(w[0]))
+                    possible_sols.append(w)
+                    indices.append(hypothese_index)
+                else:
+                    causes.append(w[1])
+        if c_borns:
+            min_c_born_index = ma.argmin(c_borns)
+            selected_sol = possible_sols[min_c_born_index]
+            selected_voter_index = indices[min_c_born_index]
+            return selected_sol, selected_voter_index
+        else:
+            return "break", " no voter yielded an acceptable weighting."
+
+    def _update_example_weights(self, y):
+        new_weights = self.example_weights * np.exp(-self.q * y.reshape((self.n_total_examples, 1)) * self.new_voter)
+        self.example_weights = new_weights / np.sum(new_weights)
+
+    def _solve_two_weights_min_c(self, next_column, y):
+        """Solve the min C-bound problem for two voters and return the best 2-weights array."""
+        m = next_column.shape[0]
+        zero_diag = np.ones((m, m)) - np.identity(m)
+
+        weighted_previous_sum = np.multiply(np.multiply(y.reshape((m, 1)), self.previous_vote.reshape((m, 1))), self.example_weights.reshape((m, 1)))
+        weighted_next_column = np.multiply(next_column.reshape((m, 1)), self.example_weights.reshape((m, 1)))
+
+        mat_prev = np.repeat(weighted_previous_sum, m, axis=1) * zero_diag
+        mat_next = np.repeat(weighted_next_column, m, axis=1) * zero_diag
+
+        self.B2 = np.sum((weighted_previous_sum - weighted_next_column) ** 2)
+        self.B1 = np.sum(2 * weighted_next_column * (weighted_previous_sum - 2 * weighted_next_column * weighted_next_column))
+        self.B0 = np.sum(weighted_next_column * weighted_next_column)
+
+        self.A2 = self.B2 + np.sum((mat_prev - mat_next) * np.transpose(mat_prev - mat_next))
+        self.A1 = self.B1 + np.sum(mat_prev * np.transpose(mat_next) - mat_next * np.transpose(mat_prev) - 2 * mat_next * np.transpose(mat_next))
+        self.A0 = self.B0 + np.sum(mat_next * np.transpose(mat_next))
+
+        # The stationary points of the C-bound ratio are the roots of C2*a**2 + C1*a + C0.
+        C2 = (self.A1 * self.B2 - self.A2 * self.B1)
+        C1 = 2 * (self.A0 * self.B2 - self.A2 * self.B0)
+        C0 = self.A0 * self.B1 - self.A1 * self.B0
+
+        if C2 == 0:
+            if C1 == 0:
+                # The derivative is constant: no interior optimum, keep equal weights.
+                return np.array([0.5, 0.5])
+            else:
+                # The derivative is affine: put all the weight on the new voter.
+                return np.array([0., 1.])
+        try:
+            sols = np.roots(np.array([C2, C1, C0]))
+        except Exception:
+            # np.roots fails when a coefficient is nan or inf.
+            return ["break", "nan"]
+
+        is_acceptable, sol = self._analyze_solutions(sols)
+        if is_acceptable:
+            return np.array([sol, 1 - sol])
+        else:
+            return ["break", sol]
+
+    def _analyze_solutions(self, sols):
+        """Check that the solution found by np.roots is acceptable under our constraints
+        (real, a minimum and between 0 and 1)."""
+        # np.roots returns a complex dtype as soon as one root is complex; map complex
+        # roots to -1 (rejected below) and keep the real part of the others.
+        if np.iscomplexobj(sols):
+            sols = np.where(np.abs(sols.imag) > 1e-12, -1., sols.real)
+        if sols.shape[0] == 1:
+            # Crude curvature check: keep the root only if the C-bound increases to its right.
+            if self._cbound(sols[0]) < self._cbound(sols[0] + 1):
+                best_sol = sols[0]
+            else:
+                return False, " the only solution was a maximum."
+        elif sols.shape[0] == 2:
+            best_sol = self._best_sol(sols)
+        else:
+            return False, " no solutions were found."
+
+        if 0 < best_sol < 1:
+            return True, self._best_sol(sols)
+        elif best_sol <= 0:
+            return False, " the minimum was below 0."
+        else:
+            return False, " the minimum was over 1."
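+
+    # For intuition: the voter weights computed in fit() follow the usual boosting rule
+    # q = ln((1 - epsilon) / epsilon), sketched here with made-up error values:
+    #
+    #     import math
+    #     for eps in (0.1, 0.25, 0.49):
+    #         print(eps, round(math.log((1 - eps) / eps), 3))
+    #     # 0.1  -> 2.197  (confident voter, large weight)
+    #     # 0.25 -> 1.099
+    #     # 0.49 -> 0.04   (barely better than chance, near-zero weight)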
+
+    def _cbound(self, sol):
+        """Compute the objective function (the C-bound) at sol."""
+        return 1 - (self.A2 * sol ** 2 + self.A1 * sol + self.A0) / (self.B2 * sol ** 2 + self.B1 * sol + self.B0)
+
+    def _best_sol(self, sols):
+        """Return the candidate root with the lowest C-bound."""
+        values = np.array([self._cbound(sol) for sol in sols])
+        return sols[np.argmin(values)]
+
+
+class QarBoostClassifier3(ColumnGenerationClassifierQar3):
+    def __init__(self, mu=0.001, epsilon=1e-08, n_max_iterations=None, estimators_generator=None, save_iteration_as_hyperparameter_each=None, random_state=42):
+        super(QarBoostClassifier3, self).__init__(epsilon, n_max_iterations, estimators_generator, dual_constraint_rhs=0,
+                                                  save_iteration_as_hyperparameter_each=save_iteration_as_hyperparameter_each, random_state=random_state)
+        self.mu = mu
+        self.train_time = 0
+
+    def _initialize_alphas(self, n_examples):
+        return 1.0 / n_examples * np.ones((n_examples,))
+
+
+class QarBoostv3(QarBoostClassifier3):
+
+    def __init__(self, random_state, **kwargs):
+        super(QarBoostv3, self).__init__(
+            mu=kwargs['mu'],
+            epsilon=kwargs['epsilon'],
+            n_max_iterations=kwargs['n_max_iterations'],
+            random_state=random_state)
+
+    def canProbas(self):
+        """Used to know if the classifier can return label probabilities"""
+        return False
+
+    def paramsToSet(self, nIter=1):
+        """Used for weighted linear early fusion to generate random search sets"""
+        paramsSet = []
+        for _ in range(nIter):
+            paramsSet.append({"mu": 0.001,
+                              "epsilon": 1e-08,
+                              "n_max_iterations": None})
+        return paramsSet
+
+    def getKWARGS(self, args):
+        """Used to format kwargs for the parsed args"""
+        kwargsDict = {}
+        kwargsDict['mu'] = 0.001
+        kwargsDict['epsilon'] = 1e-08
+        kwargsDict['n_max_iterations'] = None
+        return kwargsDict
+
+    def genPipeline(self):
+        return Pipeline([('classifier', QarBoostClassifier3())])
+
+    def genParamsDict(self, randomState):
+        return {"classifier__mu": [0.001],
+                "classifier__epsilon": [1e-08],
+                "classifier__n_max_iterations": [None]}
+
+    def genBestParams(self, detector):
+        return {"mu": detector.best_params_["classifier__mu"],
+                "epsilon": detector.best_params_["classifier__epsilon"],
+                "n_max_iterations": detector.best_params_["classifier__n_max_iterations"]}
+
+    def genParamsFromDetector(self, detector):
+        nIter = len(detector.cv_results_['param_classifier__mu'])
+        return [("mu", np.array([0.001 for _ in range(nIter)])),
+                ("epsilon", np.array(detector.cv_results_['param_classifier__epsilon'])),
+                ("n_max_iterations", np.array(detector.cv_results_['param_classifier__n_max_iterations']))]
+
+    def getConfig(self, config):
+        if type(config) is not dict:  # Used in late fusion when config is a classifier
+            return "\n\t\t- QarBoost with mu : " + str(config.mu) + ", epsilon : " + str(
+                config.epsilon) + ", n_max_iterations : " + str(config.n_max_iterations)
+        else:
+            return "\n\t\t- QarBoost with mu : " + str(config["mu"]) + ", epsilon : " + str(
+                config["epsilon"]) + ", n_max_iterations : " + str(config["n_max_iterations"])
+
+    def getInterpret(self, classifier, directory):
+        interpretString = ""
+        return interpretString
+
+
+def canProbas():
+    return False
+
+
+def fit(DATASET, CLASS_LABELS, randomState, NB_CORES=1, **kwargs):
+    """Used to fit the monoview classifier with the args stored in kwargs"""
+    start = time.time()
+    classifier = QarBoostClassifier3(mu=kwargs['mu'],
+                                     epsilon=kwargs['epsilon'],
+                                     n_max_iterations=kwargs["n_max_iterations"],
+                                     random_state=randomState)
+    classifier.fit(DATASET, CLASS_LABELS)
+    end = time.time()
+    classifier.train_time = end - start
+    return classifier
+
+
+def 
paramsToSet(nIter, randomState):
+    """Used for weighted linear early fusion to generate random search sets"""
+    paramsSet = []
+    for _ in range(nIter):
+        paramsSet.append({"mu": randomState.uniform(1e-02, 10**(-0.5)),
+                          "epsilon": 10**-randomState.randint(1, 15),
+                          "n_max_iterations": None})
+    return paramsSet
+
+
+def getKWARGS(args):
+    """Used to format kwargs for the parsed args"""
+    kwargsDict = {}
+    kwargsDict['mu'] = args.QarB3_mu
+    kwargsDict['epsilon'] = args.QarB3_epsilon
+    kwargsDict['n_max_iterations'] = None
+    return kwargsDict
+
+
+def genPipeline():
+    return Pipeline([('classifier', QarBoostClassifier3())])
+
+
+def genParamsDict(randomState):
+    return {"classifier__mu": CustomUniform(loc=.5, state=2, multiplier='e-'),
+            "classifier__epsilon": CustomRandint(low=1, high=15, multiplier='e-'),
+            "classifier__n_max_iterations": [None],
+            "classifier__random_state": [randomState]}
+
+
+def genBestParams(detector):
+    return {"mu": detector.best_params_["classifier__mu"],
+            "epsilon": detector.best_params_["classifier__epsilon"],
+            "n_max_iterations": detector.best_params_["classifier__n_max_iterations"]}
+
+
+def genParamsFromDetector(detector):
+    nIter = len(detector.cv_results_['param_classifier__mu'])
+    return [("mu", np.array(detector.cv_results_['param_classifier__mu'])),
+            ("epsilon", np.array(detector.cv_results_['param_classifier__epsilon'])),
+            ("n_max_iterations", np.array(detector.cv_results_['param_classifier__n_max_iterations']))]
+
+
+def getConfig(config):
+    if type(config) is not dict:  # Used in late fusion when config is a classifier
+        return "\n\t\t- QarBoost with mu : " + str(config.mu) + ", epsilon : " + str(
+            config.epsilon) + ", n_max_iterations : " + str(config.n_max_iterations)
+    else:
+        return "\n\t\t- QarBoost with mu : " + str(config["mu"]) + ", epsilon : " + str(
+            config["epsilon"]) + ", n_max_iterations : " + str(config["n_max_iterations"])
+
+
+def getInterpret(classifier, directory):
+    # break_cause is only set when boosting stops early; fall back to a generic message otherwise.
+    break_cause = getattr(classifier, "break_cause", " the maximum number of iterations was reached.")
+    return getInterpretBase(classifier, directory, "QarBoostv3", classifier.weights_, break_cause)
diff --git a/multiview_platform/MonoMultiViewClassifiers/utils/execution.py b/multiview_platform/MonoMultiViewClassifiers/utils/execution.py
index 811758e53032155bf8737ff3aa72cda64376d58b..961091a21b401c1a12111f66544a310c5d2b9fd3 100644
--- a/multiview_platform/MonoMultiViewClassifiers/utils/execution.py
+++ b/multiview_platform/MonoMultiViewClassifiers/utils/execution.py
@@ -41,6 +41,8 @@ def parseTheArgs(arguments):
     groupStandard.add_argument('--machine', metavar='STRING', action='store',
                                help='Type of machine on which the script runs', default="PC")
     groupStandard.add_argument('-full', action='store_true', help='Use option to use full dataset and no labels or view filtering')
+    groupStandard.add_argument('-debug', action='store_true',
+                               help='Use option to debug implemented algorithms')
 
 
     groupClass = parser.add_argument_group('Classification arguments')
@@ -164,6 +166,37 @@ def parseTheArgs(arguments):
     groupCQBoostv21.add_argument('--CQB21_epsilon', metavar='FLOAT', type=float, action='store',
                                  help='Set the epsilon parameter for CQBoostv2', default=1e-08)
 
+    groupQarBoost = parser.add_argument_group('QarBoost arguments')
+    groupQarBoost.add_argument('--QarB_mu', metavar='FLOAT', type=float, action='store',
+                               help='Set the mu parameter for QarBoost', default=0.001)
+    groupQarBoost.add_argument('--QarB_epsilon', metavar='FLOAT', type=float, action='store',
+                               help='Set the epsilon parameter for QarBoost', default=1e-08)
+
+    groupQarBoostv2 = 
parser.add_argument_group('QarBoostv2 arguments') + groupQarBoostv2.add_argument('--QarB2_mu', metavar='FLOAT', type=float, action='store', + help='Set the mu parameter for QarBoostv2', default=0.001) + groupQarBoostv2.add_argument('--QarB2_epsilon', metavar='FLOAT', type=float, action='store', + help='Set the epsilon parameter for QarBoostv2', default=1e-08) + + groupQarBoostv3 = parser.add_argument_group('QarBoostv3 arguments') + groupQarBoostv3.add_argument('--QarB3_mu', metavar='FLOAT', type=float, action='store', + help='Set the mu parameter for QarBoostv3', default=0.001) + groupQarBoostv3.add_argument('--QarB3_epsilon', metavar='FLOAT', type=float, action='store', + help='Set the epsilon parameter for QarBoostv3', default=1e-08) + + groupQarBoostNC = parser.add_argument_group('QarBoostNC arguments') + groupQarBoostNC.add_argument('--QarBNC_mu', metavar='FLOAT', type=float, action='store', + help='Set the mu parameter for QarBoostNC', default=0.001) + groupQarBoostNC.add_argument('--QarBNC_epsilon', metavar='FLOAT', type=float, action='store', + help='Set the epsilon parameter for QarBoostNC', default=1e-08) + + groupQarBoostNC2 = parser.add_argument_group('QarBoostNC2 arguments') + groupQarBoostNC2.add_argument('--QarBNC2_mu', metavar='FLOAT', type=float, action='store', + help='Set the mu parameter for QarBoostNC2', default=0.001) + groupQarBoostNC2.add_argument('--QarBNC2_epsilon', metavar='FLOAT', type=float, action='store', + help='Set the epsilon parameter for QarBoostNC2', default=1e-08) + + groupMumbo = parser.add_argument_group('Mumbo arguments') groupMumbo.add_argument('--MU_types', metavar='STRING', action='store', nargs="+", help='Determine which monoview classifier to use with Mumbo', @@ -322,7 +355,7 @@ def getDatabaseFunction(name, type): return getDatabase -def initLogFile(name, views, CL_type, log): +def initLogFile(name, views, CL_type, log, debug): r"""Used to init the directory where the preds will be stored and the log file. First this function will check if the result directory already exists (only one per minute is allowed). @@ -345,7 +378,10 @@ def initLogFile(name, views, CL_type, log): resultsDirectory : string Reference to the main results directory for the benchmark. """ - resultDirectory = "../Results/" + name + "/started_" + time.strftime("%Y_%m_%d-%H_%M") + "/" + if debug: + resultDirectory = "../Results/" + name + "/debug_started_" + time.strftime("%Y_%m_%d-%H_%M_%S") + "/" + else: + resultDirectory = "../Results/" + name + "/started_" + time.strftime("%Y_%m_%d-%H_%M") + "/" logFileName = time.strftime("%Y_%m_%d-%H_%M") + "-" + ''.join(CL_type) + "-" + "_".join( views) + "-" + name + "-LOG" if os.path.exists(os.path.dirname(resultDirectory)):
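
A minimal sketch of the debug-aware result-directory naming introduced in initLogFile above
(the dataset name "Plausible" is a made-up placeholder): debug runs are timestamped to the
second, so several debug runs can coexist within the same minute, while regular runs keep the
one-per-minute convention.

    import time

    name, debug = "Plausible", True  # hypothetical values
    if debug:
        resultDirectory = "../Results/" + name + "/debug_started_" + time.strftime("%Y_%m_%d-%H_%M_%S") + "/"
    else:
        resultDirectory = "../Results/" + name + "/started_" + time.strftime("%Y_%m_%d-%H_%M") + "/"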