From 8006699437845f7bd20b9b409e86f2fb39ba658e Mon Sep 17 00:00:00 2001 From: Baptiste Bauvin <baptiste.bauvin@lis-lab.fr> Date: Tue, 5 May 2020 10:15:39 -0400 Subject: [PATCH] Added classifiers --- .../additions/BoostUtils.py | 946 ++++++++++++++++++ .../additions/CBBoostUtils.py | 532 ++++++++++ .../additions/CQBoostUtils.py | 335 +++++++ .../additions/MinCQUtils.py | 321 ++++++ .../monoview_classifiers/cb_boost.py | 102 ++ .../monoview_classifiers/cq_boost.py | 76 ++ .../monoview_classifiers/imbalance_bagging.py | 29 + .../monoview_classifiers/scm.py | 93 ++ .../additions/kernel_learning.py | 103 ++ .../multiview_classifiers/lp_norm_mkl.py | 40 + .../multiview_classifiers/mucombo.py | 48 + .../multiview_classifiers/mumbo.py | 105 ++ .../multiview_classifiers/mvml.py | 522 ++++++++++ 13 files changed, 3252 insertions(+) create mode 100644 summit/multiview_platform/monoview_classifiers/additions/BoostUtils.py create mode 100644 summit/multiview_platform/monoview_classifiers/additions/CBBoostUtils.py create mode 100644 summit/multiview_platform/monoview_classifiers/additions/CQBoostUtils.py create mode 100644 summit/multiview_platform/monoview_classifiers/additions/MinCQUtils.py create mode 100644 summit/multiview_platform/monoview_classifiers/cb_boost.py create mode 100644 summit/multiview_platform/monoview_classifiers/cq_boost.py create mode 100644 summit/multiview_platform/monoview_classifiers/imbalance_bagging.py create mode 100644 summit/multiview_platform/monoview_classifiers/scm.py create mode 100644 summit/multiview_platform/multiview_classifiers/additions/kernel_learning.py create mode 100644 summit/multiview_platform/multiview_classifiers/lp_norm_mkl.py create mode 100644 summit/multiview_platform/multiview_classifiers/mucombo.py create mode 100644 summit/multiview_platform/multiview_classifiers/mumbo.py create mode 100644 summit/multiview_platform/multiview_classifiers/mvml.py diff --git a/summit/multiview_platform/monoview_classifiers/additions/BoostUtils.py b/summit/multiview_platform/monoview_classifiers/additions/BoostUtils.py new file mode 100644 index 00000000..707eb66f --- /dev/null +++ b/summit/multiview_platform/monoview_classifiers/additions/BoostUtils.py @@ -0,0 +1,946 @@ +import datetime +import sys + +import matplotlib.pyplot as plt +import numpy as np +from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin +from sklearn.preprocessing import LabelEncoder +from sklearn.tree import DecisionTreeClassifier +from sklearn.utils.validation import check_is_fitted + + +class DecisionStumpClassifier(BaseEstimator, ClassifierMixin): + """Generic Attribute Threshold Binary Classifier + + Attributes + ---------- + attribute_index : int + The attribute to consider for the classification. + threshold : float + The threshold value for classification rule. + direction : int, optional + A multiplicative constant (1 or -1) to choose the "direction" of the stump. Defaults to 1. If -1, the stump + will predict the "negative" class (generally -1 or 0), and if 1, the stump will predict the second class (generally 1). + + """ + + def __init__(self, attribute_index, threshold, direction=1): + super(DecisionStumpClassifier, self).__init__() + self.attribute_index = attribute_index + self.threshold = threshold + self.direction = direction + + def fit(self, X, y): + # Only verify that we are in the binary classification setting, with support for transductive learning. 
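+        # Masked entries of y mark unlabeled (transductive) examples; the
+        # classes are therefore inferred from the labeled entries only.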
+ if isinstance(y, np.ma.MaskedArray): + self.classes_ = np.unique(y[np.logical_not(y.mask)]) + else: + self.classes_ = np.unique(y) + + # This label encoder is there for the predict function to be able to return any two classes that were used + # when fitting, for example {-1, 1} or {0, 1}. + self.le_ = LabelEncoder() + self.le_.fit(self.classes_) + self.classes_ = self.le_.classes_ + + if not len(self.classes_) == 2: + raise ValueError( + 'DecisionStumpsVoter only supports binary classification') + # assert len(self.classes_) == 2, "DecisionStumpsVoter only supports binary classification" + return self + + def predict(self, X): + """Returns the output of the classifier, on a sample X. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Training vectors, where n_samples is the number of samples and + n_features is the number of features. + + Returns + ------- + predictions : array-like, shape = [n_samples] + Predicted class labels. + + """ + check_is_fitted(self, 'classes_') + return self.le_.inverse_transform( + np.argmax(self.predict_proba(X), axis=1)) + + def predict_proba(self, X): + """Compute probabilities of possible outcomes for samples in X. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Training vectors, where n_samples is the number of samples and + n_features is the number of features. + + Returns + ------- + avg : array-like, shape = [n_samples, n_classes] + Weighted average probability for each class per sample. + + """ + check_is_fitted(self, 'classes_') + X = np.asarray(X) + probas = np.zeros((X.shape[0], 2)) + positive_class = np.argwhere( + X[:, self.attribute_index] > self.threshold) + negative_class = np.setdiff1d(range(X.shape[0]), positive_class) + probas[positive_class, 1] = 1.0 + probas[negative_class, 0] = 1.0 + + if self.direction == -1: + probas = 1 - probas + + return probas + + def predict_proba_t(self, X): + """Compute probabilities of possible outcomes for samples in X. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Training vectors, where n_samples is the number of samples and + n_features is the number of features. + + Returns + ------- + avg : array-like, shape = [n_samples, n_classes] + Weighted average probability for each class per sample. + + """ + + X = np.ones(X.shape) + check_is_fitted(self, 'classes_') + X = np.asarray(X) + probas = np.zeros((X.shape[0], 2)) + positive_class = np.argwhere( + X[:, self.attribute_index] > self.threshold) + negative_class = np.setdiff1d(range(X.shape[0]), positive_class) + probas[positive_class, 1] = 1.0 + probas[negative_class, 0] = 1.0 + + if self.direction == -1: + probas = 1 - probas + + return probas + + def reverse_decision(self): + self.direction *= -1 + + +class ClassifiersGenerator(BaseEstimator, TransformerMixin): + """Base class to create a set of voters using training samples, and then transform a set of examples in + the voters' output space. + + Attributes + ---------- + self_complemented : bool, optional + Whether or not a binary complement voter must be generated for each voter. Defaults to False. + voters : ndarray of voter functions + Once fit, contains the voter functions. + + """ + + def __init__(self, self_complemented=False): + super(ClassifiersGenerator, self).__init__() + self.self_complemented = self_complemented + + def fit(self, X, y=None): + """Generates the voters using training samples. 
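+
+        This base implementation is abstract; concrete generators such as
+        StumpsClassifiersGenerator and TreeClassifiersGenerator override it.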
+ + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Input data on which to base the voters. + y : ndarray of shape (n_labeled_samples,), optional + Input labels, usually determines the decision polarity of each voter. + + Returns + ------- + self + + """ + raise NotImplementedError + + def transform(self, X): + """Transforms the input points in a matrix of classification, using previously learned voters. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Input data to classify. + + Returns + ------- + ndarray of shape (n_samples, n_voters) + The voters' decision on each example. + + """ + check_is_fitted(self, 'estimators_') + return np.array([voter.predict(X) for voter in self.estimators_]).T + + +# class TreesClassifiersGenerator(ClassifiersGenerator): +# """A generator to widen the voter's pool of our boosting algorithms. +# """ +# +# def __init__(self, n_stumps_per_attribute=10, self_complemented=False, check_diff=True, max_depth=3): +# super(TreesClassifiersGenerator, self).__init__(self_complemented) +# self.n_stumps_per_attribute = n_stumps_per_attribute +# self.check_diff = check_diff +# self.max_depth = max_depth +# +# def fit(self, X, y=None): + +class TreeClassifiersGenerator(ClassifiersGenerator): + + def __init__(self, random_state=42, max_depth=2, self_complemented=True, + criterion="gini", splitter="best", n_trees=100, + distribution_type="uniform", low=0, high=10, + attributes_ratio=0.6, examples_ratio=0.95): + super(TreeClassifiersGenerator, self).__init__(self_complemented) + self.max_depth = max_depth + self.criterion = criterion + self.splitter = splitter + self.n_trees = n_trees + if type(random_state) is int: + self.random_state = np.random.RandomState(random_state) + else: + self.random_state = random_state + self.distribution_type = distribution_type + self.low = low + self.high = high + self.attributes_ratio = attributes_ratio + self.examples_ratio = examples_ratio + + def fit(self, X, y=None): + estimators_ = [] + self.attribute_indices = np.array( + [self.sub_sample_attributes(X) for _ in range(self.n_trees)]) + self.example_indices = np.array( + [self.sub_sample_examples(X) for _ in range(self.n_trees)]) + for i in range(self.n_trees): + estimators_.append(DecisionTreeClassifier(criterion=self.criterion, + splitter=self.splitter, + max_depth=self.max_depth).fit( + X[:, self.attribute_indices[i, :]][self.example_indices[i], :], + y[self.example_indices[i, :]])) + self.estimators_ = np.asarray(estimators_) + return self + + def sub_sample_attributes(self, X): + n_attributes = X.shape[1] + attributes_indices = np.arange(n_attributes) + kept_indices = self.random_state.choice(attributes_indices, size=int( + self.attributes_ratio * n_attributes), replace=True) + return kept_indices + + def sub_sample_examples(self, X): + n_examples = X.shape[0] + examples_indices = np.arange(n_examples) + kept_indices = self.random_state.choice(examples_indices, size=int( + self.examples_ratio * n_examples), replace=True) + return kept_indices + + def choose(self, chosen_columns): + self.estimators_ = self.estimators_[chosen_columns] + self.attribute_indices = self.attribute_indices[chosen_columns, :] + self.example_indices = self.example_indices[chosen_columns, :] + + +class StumpsClassifiersGenerator(ClassifiersGenerator): + """Decision Stump Voters transformer. + + Parameters + ---------- + n_stumps_per_attribute : int, optional + Determines how many decision stumps will be created for each attribute. Defaults to 10. 
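+        If self_complemented is True, a complement voter is also built for
+        each stump, doubling the effective number of voters per attribute.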
+ No stumps will be created for attributes with only one possible value. + self_complemented : bool, optional + Whether or not a binary complement voter must be generated for each voter. Defaults to False. + + """ + + def __init__(self, n_stumps_per_attribute=10, self_complemented=False, + check_diff=False): + super(StumpsClassifiersGenerator, self).__init__(self_complemented) + self.n_stumps_per_attribute = n_stumps_per_attribute + self.check_diff = check_diff + + def fit(self, X, y=None): + """Fits Decision Stump voters on a training set. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Input data on which to base the voters. + y : ndarray of shape (n_labeled_samples,), optional + Only used to ensure that we are in the binary classification setting. + + Returns + ------- + self + + """ + minimums = np.min(X, axis=0) + maximums = np.max(X, axis=0) + if y.ndim > 1: + y = np.reshape(y, (y.shape[0],)) + ranges = (maximums - minimums) / (self.n_stumps_per_attribute + 1) + if self.check_diff: + nb_differents = [np.unique(col) for col in np.transpose(X)] + self.estimators_ = [] + for i in range(X.shape[1]): + nb_different = nb_differents[i].shape[0] + different = nb_differents[i] + if nb_different - 1 < self.n_stumps_per_attribute: + self.estimators_ += [DecisionStumpClassifier(i, + (different[ + stump_number] + + different[ + stump_number + 1]) / 2, + 1).fit(X, y) + for stump_number in + range(int(nb_different) - 1)] + if self.self_complemented: + self.estimators_ += [DecisionStumpClassifier(i, + (different[ + stump_number] + + different[ + stump_number + 1]) / 2, + -1).fit(X, + y) + for stump_number in + range(int(nb_different) - 1)] + else: + self.estimators_ += [DecisionStumpClassifier(i, + minimums[i] + + ranges[ + i] * stump_number, + 1).fit(X, y) + for stump_number in range(1, + self.n_stumps_per_attribute + 1) + if ranges[i] != 0] + + if self.self_complemented: + self.estimators_ += [DecisionStumpClassifier(i, + minimums[ + i] + + ranges[ + i] * stump_number, + -1).fit(X, + y) + for stump_number in range(1, + self.n_stumps_per_attribute + 1) + if ranges[i] != 0] + else: + self.estimators_ = [DecisionStumpClassifier(i, minimums[i] + ranges[ + i] * stump_number, 1).fit(X, y) + for i in range(X.shape[1]) for stump_number in + range(1, self.n_stumps_per_attribute + 1) + if ranges[i] != 0] + + if self.self_complemented: + self.estimators_ += [DecisionStumpClassifier(i, minimums[i] + + ranges[ + i] * stump_number, + -1).fit(X, y) + for i in range(X.shape[1]) for stump_number + in + range(1, self.n_stumps_per_attribute + 1) + if ranges[i] != 0] + self.estimators_ = np.asarray(self.estimators_) + return self + + def choose(self, chosen_columns): + self.estimators_ = self.estimators_[chosen_columns] + + +def _as_matrix(element): + """ Utility function to convert "anything" to a Numpy matrix. + """ + # If a scalar, return a 1x1 matrix. + if len(np.shape(element)) == 0: + return np.matrix([[element]], dtype=float) + + # If a nd-array vector, return a column matrix. + elif len(np.shape(element)) == 1: + matrix = np.matrix(element, dtype=float) + if np.shape(matrix)[1] != 1: + matrix = matrix.T + return matrix + + return np.matrix(element, dtype=float) + + +def _as_column_matrix(array_like): + """ Utility function to convert any array to a column Numpy matrix. 
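+
+    For example, a flat list comes back as a 3x1 matrix:
+
+    >>> _as_column_matrix([1, 2, 3]).shape
+    (3, 1)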
+ """ + matrix = _as_matrix(array_like) + if 1 not in np.shape(matrix): + raise ValueError("_as_column_vector: input must be a vector") + + if np.shape(matrix)[0] == 1: + matrix = matrix.T + + return matrix + + +def _as_line_matrix(array_like): + """ Utility function to convert any array to a line Numpy matrix. + """ + matrix = _as_matrix(array_like) + if 1 not in np.shape(matrix): + raise ValueError("_as_column_vector: input must be a vector") + + if np.shape(matrix)[1] == 1: + matrix = matrix.T + + return matrix + + +def sign(array): + """Computes the elementwise sign of all elements of an array. The sign function returns -1 if x <=0 and 1 if x > 0. + Note that numpy's sign function can return 0, which is not desirable in most cases in Machine Learning algorithms. + + Parameters + ---------- + array : array-like + Input values. + + Returns + ------- + ndarray + An array with the signs of input elements. + + """ + signs = np.sign(array) + + signs[array == 0] = -1 + return signs + + +class ConvexProgram(object): + """ + Encapsulates a quadratic program of the following form: + + minimize (1/2)*x'*P*x + q'*x + subject to G*x <= h + A*x = b. + + + or a linear program of the following form: + + minimize c'*x + subject to G*x <= h + A*x = b + """ + + def __init__(self): + self._quadratic_func = None + self._linear_func = None + self._inequality_constraints_matrix = None + self._inequality_constraints_values = None + self._equality_constraints_matrix = None + self._equality_constraints_values = None + self._lower_bound_values = None + self._upper_bound_values = None + self._n_variables = None + + @property + def n_variables(self): + return self._n_variables + + @property + def quadratic_func(self): + return self._quadratic_func + + @quadratic_func.setter + def quadratic_func(self, quad_matrix): + quad_matrix = _as_matrix(quad_matrix) + n_lines, n_columns = np.shape(quad_matrix) + assert (n_lines == n_columns) + + if self._linear_func is not None: + assert (np.shape(quad_matrix)[0] == self._n_variables) + else: + self._n_variables = n_lines + + self._quadratic_func = quad_matrix + + @property + def linear_func(self): + return self._linear_func + + @linear_func.setter + def linear_func(self, lin_vector): + if lin_vector is not None: + lin_vector = _as_column_matrix(lin_vector) + + if self._quadratic_func is not None: + assert (np.shape(lin_vector)[0] == self._n_variables) + + else: + self._n_variables = np.shape(lin_vector)[0] + + self._linear_func = lin_vector + + def add_inequality_constraints(self, inequality_matrix, inequality_values): + if inequality_matrix is None: + return + + self._assert_objective_function_is_set() + + if 1 in np.shape(inequality_matrix) or len( + np.shape(inequality_matrix)) == 1: + inequality_matrix = _as_line_matrix(inequality_matrix) + else: + inequality_matrix = _as_matrix(inequality_matrix) + + inequality_values = _as_column_matrix(inequality_values) + assert np.shape(inequality_matrix)[1] == self._n_variables + assert np.shape(inequality_values)[1] == 1 + + if self._inequality_constraints_matrix is None: + self._inequality_constraints_matrix = inequality_matrix + else: + self._inequality_constraints_matrix = np.append( + self._inequality_constraints_matrix, + inequality_matrix, axis=0) + + if self._inequality_constraints_values is None: + self._inequality_constraints_values = inequality_values + else: + self._inequality_constraints_values = np.append( + self._inequality_constraints_values, + inequality_values, axis=0) + + def add_equality_constraints(self, 
equality_matrix, equality_values): + if equality_matrix is None: + return + + self._assert_objective_function_is_set() + + if 1 in np.shape(equality_matrix) or len( + np.shape(equality_matrix)) == 1: + equality_matrix = _as_line_matrix(equality_matrix) + else: + equality_matrix = _as_matrix(equality_matrix) + + equality_values = _as_matrix(equality_values) + assert np.shape(equality_matrix)[1] == self._n_variables + assert np.shape(equality_values)[1] == 1 + + if self._equality_constraints_matrix is None: + self._equality_constraints_matrix = equality_matrix + else: + self._equality_constraints_matrix = np.append( + self._equality_constraints_matrix, + equality_matrix, axis=0) + + if self._equality_constraints_values is None: + self._equality_constraints_values = equality_values + else: + self._equality_constraints_values = np.append( + self._equality_constraints_values, + equality_values, axis=0) + + def add_lower_bound(self, lower_bound): + if lower_bound is not None: + self._assert_objective_function_is_set() + self._lower_bound_values = np.array( + [lower_bound] * self._n_variables) + + def add_upper_bound(self, upper_bound): + if upper_bound is not None: + self._assert_objective_function_is_set() + self._upper_bound_values = np.array( + [upper_bound] * self._n_variables) + + def _convert_bounds_to_inequality_constraints(self): + self._assert_objective_function_is_set() + + if self._lower_bound_values is not None: + c_matrix = [] + for i in range(self._n_variables): + c_line = [0] * self._n_variables + c_line[i] = -1.0 + c_matrix.append(c_line) + + c_vector = _as_column_matrix(self._lower_bound_values) + self._lower_bound_values = None + self.add_inequality_constraints(np.matrix(c_matrix).T, c_vector) + + if self._upper_bound_values is not None: + c_matrix = [] + for i in range(self._n_variables): + c_line = [0] * self._n_variables + c_line[i] = 1.0 + c_matrix.append(c_line) + + c_vector = _as_column_matrix(self._upper_bound_values) + self._upper_bound_values = None + self.add_inequality_constraints(np.matrix(c_matrix).T, c_vector) + + def _convert_to_cvxopt_matrices(self): + from cvxopt import matrix as cvxopt_matrix + + if self._quadratic_func is not None: + self._quadratic_func = cvxopt_matrix(self._quadratic_func) + + if self._linear_func is not None: + self._linear_func = cvxopt_matrix(self._linear_func) + else: + # CVXOPT needs this vector to be set even if it is not used, so we put zeros in it! + self._linear_func = cvxopt_matrix(np.zeros((self._n_variables, 1))) + + if self._inequality_constraints_matrix is not None: + self._inequality_constraints_matrix = cvxopt_matrix( + self._inequality_constraints_matrix) + + if self._inequality_constraints_values is not None: + self._inequality_constraints_values = cvxopt_matrix( + self._inequality_constraints_values) + + if self._equality_constraints_matrix is not None: + self._equality_constraints_matrix = cvxopt_matrix( + self._equality_constraints_matrix) + + if self._equality_constraints_values is not None: + self._equality_constraints_values = cvxopt_matrix( + self._equality_constraints_values) + + def _assert_objective_function_is_set(self): + assert self._n_variables is not None + + def solve(self, solver="cvxopt", feastol=1e-7, abstol=1e-7, reltol=1e-6, + return_all_information=False): + + # Some solvers are very verbose, and we don't want them to pollute STDOUT or STDERR. 
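+        # The streams are saved here and unconditionally restored in the
+        # ``finally`` clause below; the redirection itself is currently
+        # disabled (see the TODO).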
+ original_stdout = sys.stdout + original_stderr = sys.stderr + + ret = None + + # TODO: Repair + # if solver == "cvxopt": + # stdout_logger = logging.getLogger('CVXOPT') + # sl = StreamToLogger(stdout_logger, logging.DEBUG) + # sys.stdout = sl + + # stderr_logger = logging.getLogger('CVXOPT') + # sl = StreamToLogger(stderr_logger, logging.WARNING) + # sys.stderr = sl + + try: + if solver == "cvxopt": + from cvxopt.solvers import qp, lp, options + options['feastol'] = feastol + options['abstol'] = abstol + options['reltol'] = reltol + options['show_progress'] = False + + self._convert_bounds_to_inequality_constraints() + self._convert_to_cvxopt_matrices() + + if self._quadratic_func is not None: + ret = qp(self.quadratic_func, self.linear_func, + self._inequality_constraints_matrix, + self._inequality_constraints_values, + self._equality_constraints_matrix, + self._equality_constraints_values) + + else: + ret = lp(self.linear_func, + G=self._inequality_constraints_matrix, + h=self._inequality_constraints_values, + A=self._equality_constraints_matrix, + b=self._equality_constraints_values) + + # logging.info("Primal objective value = {}".format(ret['primal objective'])) + # logging.info("Dual objective value = {}".format(ret['dual objective'])) + + if not return_all_information: + ret = np.asarray(np.array(ret['x']).T[0]) + + elif solver == "cplex": + import cplex + p = cplex.Cplex() + p.objective.set_sense(p.objective.sense.minimize) + + # This is ugly. CPLEX wants a list of lists of lists. First dimension represents the lines of the QP + # matrix. Second dimension contains a pair of two elements: the indices of the variables in play (all of + # them...), and the values (columns of the QP matrix). + names = [str(x) for x in range(self._n_variables)] + p.variables.add(names=names) + + if self.quadratic_func is not None: + p_matrix = [] + for line in self._quadratic_func: + p_matrix.append([names, line.tolist()[0]]) + + p.objective.set_quadratic(p_matrix) + + if self.linear_func is not None: + p.objective.set_linear(zip(names, + np.asarray( + self.linear_func.T).reshape( + self.n_variables, ).tolist())) + + if self._inequality_constraints_matrix is not None: + inequality_linear = [] + for line in self._inequality_constraints_matrix: + inequality_linear.append([names, line.tolist()[0]]) + p.linear_constraints.add(lin_expr=inequality_linear, + rhs=np.asarray( + self._inequality_constraints_values.T).tolist()[ + 0], + senses="L" * len( + self._inequality_constraints_values)) + + if self._equality_constraints_matrix is not None: + equality_linear = [] + for line in self._equality_constraints_matrix: + equality_linear.append([names, line.tolist()[0]]) + p.linear_constraints.add(lin_expr=equality_linear, + rhs=np.asarray( + self._equality_constraints_values.T).tolist()[ + 0], + senses="E" * len( + self._equality_constraints_values)) + + if self._lower_bound_values is not None: + p.variables.set_lower_bounds( + zip(names, self._lower_bound_values)) + + if self._upper_bound_values is not None: + p.variables.set_upper_bounds( + zip(names, self._upper_bound_values)) + + p.solve() + + if not return_all_information: + ret = np.array(p.solution.get_values()) + else: + ret = {'primal': np.array(p.solution.get_values()), + 'dual': np.array(p.solution.get_dual_values())} + + elif solver == "pycpx": + # This shows how easy it is to use pycpx. However, it is much slower (as it is more versatile!). 
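+                # Unlike the cplex branch above, no manual matrix packing is
+                # needed: the stored matrices are combined symbolically below.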
+ + import pycpx + model = pycpx.CPlexModel(verbosity=2) + q = model.new(self.n_variables) + + if self._inequality_constraints_matrix is not None: + model.constrain( + self._inequality_constraints_matrix * q <= self._inequality_constraints_values) + if self._equality_constraints_matrix is not None: + model.constrain( + self._equality_constraints_matrix * q == self._equality_constraints_values) + if self._lower_bound_values is not None: + model.constrain(q >= self._lower_bound_values) + if self._upper_bound_values is not None: + model.constrain(q <= self._upper_bound_values) + + value = model.minimize( + 0.5 * q.T * self._quadratic_func * q + self.linear_func.T * q) + + if not return_all_information: + ret = np.array(model[q]) + else: + ret = model + + except: + raise + + finally: + sys.stdout = original_stdout + sys.stderr = original_stderr + + return ret + + def _as_matrix(element): + """ Utility function to convert "anything" to a Numpy matrix. + """ + # If a scalar, return a 1x1 matrix. + if len(np.shape(element)) == 0: + return np.matrix([[element]], dtype=float) + + # If a nd-array vector, return a column matrix. + elif len(np.shape(element)) == 1: + matrix = np.matrix(element, dtype=float) + if np.shape(matrix)[1] != 1: + matrix = matrix.T + return matrix + + return np.matrix(element, dtype=float) + + def _as_column_matrix(array_like): + """ Utility function to convert any array to a column Numpy matrix. + """ + matrix = _as_matrix(array_like) + if 1 not in np.shape(matrix): + raise ValueError("_as_column_vector: input must be a vector") + + if np.shape(matrix)[0] == 1: + matrix = matrix.T + + return matrix + + def _as_line_matrix(array_like): + """ Utility function to convert any array to a line Numpy matrix. + """ + matrix = _as_matrix(array_like) + if 1 not in np.shape(matrix): + raise ValueError("_as_column_vector: input must be a vector") + + if np.shape(matrix)[1] == 1: + matrix = matrix.T + + return matrix + + def sign(array): + """Computes the elementwise sign of all elements of an array. The sign function returns -1 if x <=0 and 1 if x > 0. + Note that numpy's sign function can return 0, which is not desirable in most cases in Machine Learning algorithms. + + Parameters + ---------- + array : array-like + Input values. + + Returns + ------- + ndarray + An array with the signs of input elements. 
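+
+        Examples
+        --------
+        Zeros are mapped to -1 rather than 0:
+
+        >>> sign(np.array([-2., 0., 3.]))
+        array([-1., -1.,  1.])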
+ + """ + signs = np.sign(array) + + signs[array == 0] = -1 + return signs + + +def get_accuracy_graph(plotted_data, classifier_name, file_name, + name="Accuracies", bounds=None, bound_name=None, + boosting_bound=None, set="train", zero_to_one=True): + if type(name) is not str: + name = " ".join(name.get_config().strip().split(" ")[:2]) + f, ax = plt.subplots(nrows=1, ncols=1) + if zero_to_one: + ax.set_ylim(bottom=0.0, top=1.0) + ax.set_title(name + " during " + set + " for " + classifier_name) + x = np.arange(len(plotted_data)) + scat = ax.scatter(x, np.array(plotted_data), marker=".") + if bounds: + if boosting_bound: + scat2 = ax.scatter(x, boosting_bound, marker=".") + scat3 = ax.scatter(x, np.array(bounds), marker=".", ) + ax.legend((scat, scat2, scat3), + (name, "Boosting bound", bound_name)) + else: + scat2 = ax.scatter(x, np.array(bounds), marker=".", ) + ax.legend((scat, scat2), + (name, bound_name)) + # plt.tight_layout() + else: + ax.legend((scat,), (name,)) + f.savefig(file_name, transparent=True) + plt.close() + + +class BaseBoost(object): + + def _collect_probas(self, X, sub_sampled=False): + if self.estimators_generator.__class__.__name__ == "TreeClassifiersGenerator": + return np.asarray([clf.predict_proba(X[:, attribute_indices]) for + clf, attribute_indices in + zip(self.estimators_generator.estimators_, + self.estimators_generator.attribute_indices)]) + else: + return np.asarray([clf.predict_proba(X) for clf in + self.estimators_generator.estimators_]) + + def _binary_classification_matrix(self, X): + probas = self._collect_probas(X) + predicted_labels = np.argmax(probas, axis=2) + predicted_labels[predicted_labels == 0] = -1 + values = np.max(probas, axis=2) + return (predicted_labels * values).T + + def _initialize_alphas(self, n_examples): + raise NotImplementedError( + "Alpha weights initialization function is not implemented.") + + def check_opposed_voters(self, ): + nb_opposed = 0 + oppposed = [] + for column in self.classification_matrix[:, + self.chosen_columns_].transpose(): + for chosen_col in self.chosen_columns_: + if (-column.reshape((self.n_total_examples, + 1)) == self.classification_matrix[:, + chosen_col].reshape( + (self.n_total_examples, 1))).all(): + nb_opposed += 1 + break + return int(nb_opposed / 2) + + +def getInterpretBase(classifier, directory, classifier_name, weights, + break_cause=" the dual constrail was not violated"): + interpretString = "\t " + classifier_name + " permformed classification with weights : \n" + # weights_sort = np.argsort(-weights) + weights_sort = np.arange(weights.shape[0]) + interpretString += np.array2string(weights[weights_sort], precision=4, + separator=',', suppress_small=True) + interpretString += "\n \t It generated {} columns by attributes and used {} iterations to converge, and selected {} couple(s) of opposed voters".format( + classifier.n_stumps, + len(weights_sort), classifier.nb_opposed_voters) + if max(weights) > 0.50: + interpretString += "\n \t The vote is useless in this context : voter nb {} is a dictator of weight > 0.50".format( + classifier.chosen_columns_[np.argmax(np.array(weights))]) + if len(weights_sort) == classifier.n_max_iterations or len( + weights) == classifier.n_total_hypotheses_: + if len(weights) == classifier.n_max_iterations: + interpretString += ", and used all available iterations, " + else: + interpretString += "." + if len(weights) == classifier.n_total_hypotheses_: + interpretString += ", and all the voters have been used." + else: + interpretString += "." 
+    else:
+        pass
+        # interpretString += ", and the loop was broken because "+break_cause
+    interpretString += "\n\t Selected voters : \n"
+    interpretString += np.array2string(
+        np.array(classifier.chosen_columns_)[weights_sort])
+    interpretString += "\n\t Trained in " + str(datetime.timedelta(
+        seconds=classifier.train_time)) + " and predicted in " + str(
+        datetime.timedelta(seconds=classifier.predict_time)) + "."
+    interpretString += "\n\t Selected columns : \n"
+    interpretString += np.array2string(
+        classifier.classification_matrix[:, classifier.chosen_columns_],
+        precision=4,
+        separator=',', suppress_small=True)
+    np.savetxt(directory + "voters.csv",
+               classifier.classification_matrix[:, classifier.chosen_columns_],
+               delimiter=',')
+    np.savetxt(directory + "weights.csv", classifier.weights_, delimiter=',')
+    np.savetxt(directory + "times.csv",
+               np.array([classifier.train_time, classifier.predict_time]),
+               delimiter=',')
+    np.savetxt(directory + "times_iter.csv",
+               np.array([classifier.train_time, len(weights_sort)]),
+               delimiter=',')
+    np.savetxt(directory + "sparsity.csv", np.array([len(weights_sort)]),
+               delimiter=',')
+    get_accuracy_graph(classifier.train_metrics, classifier_name,
+                       directory + 'metrics.png', classifier.plotted_metric,
+                       classifier.bounds, "Boosting bound")
+    return interpretString
diff --git a/summit/multiview_platform/monoview_classifiers/additions/CBBoostUtils.py b/summit/multiview_platform/monoview_classifiers/additions/CBBoostUtils.py
new file mode 100644
index 00000000..aa46e13e
--- /dev/null
+++ b/summit/multiview_platform/monoview_classifiers/additions/CBBoostUtils.py
@@ -0,0 +1,532 @@
+import logging
+import math
+import time
+
+import numpy as np
+import numpy.ma as ma
+import scipy
+from sklearn.base import BaseEstimator, ClassifierMixin
+from sklearn.utils.validation import check_is_fitted
+
+from .BoostUtils import StumpsClassifiersGenerator, sign, BaseBoost, \
+    getInterpretBase, get_accuracy_graph, TreeClassifiersGenerator
+from ...monoview.monoview_utils import change_label_to_minus
+from ... import metrics
+
+
+# Used for CBBoost
+
+class CBBoostClassifier(BaseEstimator, ClassifierMixin, BaseBoost):
+    def __init__(self, n_max_iterations=100, estimators_generator="Stumps",
+                 random_state=42, self_complemented=True, twice_the_same=True,
+                 random_start=False, n_stumps=1, c_bound_sol=True,
+                 plotted_metric=metrics.zero_one_loss, save_train_data=False,
+                 test_graph=True, mincq_tracking=False):
+        super(CBBoostClassifier, self).__init__()
+        r"""
+
+        Parameters
+        ----------
+        n_max_iterations : int
+            Maximum number of iterations for the boosting algorithm.
+        estimators_generator : str or ClassifiersGenerator
+            The generator used to build the pool of hypotheses: either
+            "Stumps", "Trees", or an already-built generator instance.
+        random_state : np.random.RandomState or int
+            The random state, used to make runs reproducible.
+        self_complemented : bool
+            If True, in the hypotheses generation process, for each
+            hypothesis, its complement will be generated too.
+        twice_the_same : bool
+            If True, the algorithm will be allowed to select twice the same
+            hypothesis in the boosting process.
+        n_stumps : int
+            The number of hypotheses generated per data attribute.
+        c_bound_sol : bool
+            If True, the weight of each new voter is obtained by minimizing
+            the C-bound in closed form.
+        plotted_metric : Metric module
+            The metric that will be plotted for each iteration of boosting.
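+        random_start : bool
+            If True, the first voter is drawn at random among the voters with
+            a positive margin, instead of the one with the best weighted
+            margin.
+        save_train_data : bool
+            If True, the training data restricted to the chosen voters is
+            stored after fitting, for later analysis.
+        mincq_tracking : bool
+            If True, MinCq is fitted at each iteration so that its optimal
+            C-bound can be compared to the greedy one (for analysis only).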
+ """ + if type(random_state) is int: + self.random_state = np.random.RandomState(random_state) + else: + self.random_state = random_state + self.train_time = 0 + self.train_shape = None + self.step_decisions = None + self.step_prod = None + self.n_max_iterations = n_max_iterations + self.estimators_generator = estimators_generator + self.self_complemented = self_complemented + self.twice_the_same = twice_the_same + self.random_start = random_start + self.plotted_metric = plotted_metric + self.n_stumps = n_stumps + self.c_bound_sol = c_bound_sol + self.save_train_data = save_train_data + self.test_graph = test_graph + self.printed_args_name_list = ["n_max_iterations", "self_complemented", + "twice_the_same", + "random_start", + "n_stumps",] + self.mincq_tracking = mincq_tracking + + def fit(self, X, y): + formatted_X, formatted_y = self.format_X_y(X, y) + + self.init_info_containers() + + # Initialize the weak classifiers ensemble + m, n, y_kernel_matrix = self.init_hypotheses(formatted_X, formatted_y) + + start = time.time() + self.n_total_hypotheses_ = n + self.n_total_examples = m + + # Initialize the majority vote + self.init_boosting(m, formatted_y, y_kernel_matrix) + + self.break_cause = " the maximum number of iterations was attained." + + for k in range(min(n - 1, + self.n_max_iterations - 1 if self.n_max_iterations is not None else np.inf)): + + # Print dynamically the step and the error of the current classifier + self.it = k + print( + "Resp. bound : {}/{}".format( + k + 2, + self.n_max_iterations), + end="\r") + + # Find the best (weight, voter) couple. + self.q, new_voter_index = self._find_new_voter(y_kernel_matrix, + formatted_y) + + if type(self.q) == str: + self.break_cause = new_voter_index # + break + + self.append_new_voter(new_voter_index) + self.weights_.append(self.q) + + voter_perf = self.compute_voter_perf(formatted_y) + + self.update_info_containers(formatted_y, voter_perf, k) + + self.estimators_generator.choose(self.chosen_columns_) + + self.nb_opposed_voters = self.check_opposed_voters() + if self.save_train_data: + self.X_train = self.classification_matrix[:, self.chosen_columns_] + self.raw_weights = self.weights_ + self.y_train = formatted_y + + self.weights_ = np.array(self.weights_)/np.sum(np.array(self.weights_)) + + formatted_y[formatted_y == -1] = 0 + formatted_y = formatted_y.reshape((m,)) + + end = time.time() + self.train_time = end - start + return self + + def predict(self, X): + start = time.time() + check_is_fitted(self, 'weights_') + if scipy.sparse.issparse(X): + logging.warning('Converting sparse matrix to dense matrix.') + X = np.array(X.todense()) + + classification_matrix = self._binary_classification_matrix(X) + margins = np.sum(classification_matrix * self.weights_, axis=1) + signs_array = np.array([int(x) for x in sign(margins)]) + signs_array[signs_array == -1] = 0 + + end = time.time() + self.predict_time = end - start + + # Predict for each step of the boosting process + self.step_predict(classification_matrix) + + return signs_array + + def step_predict(self, classification_matrix): + """Used to predict with each step of the greedy algorithm to analyze its performance increase""" + if classification_matrix.shape != self.train_shape: + self.step_decisions = np.zeros(classification_matrix.shape) + self.mincq_step_decisions = np.zeros(classification_matrix.shape) + self.step_prod = np.zeros(classification_matrix.shape) + for weight_index in range(self.weights_.shape[0] - 1): + margins = np.sum( + classification_matrix[:, :weight_index + 
1] * self.weights_[ + :weight_index + 1], + axis=1) + signs_array = np.array([int(x) for x in sign(margins)]) + signs_array[signs_array == -1] = 0 + self.step_decisions[:, weight_index] = signs_array + self.step_prod[:, weight_index] = np.sum( + classification_matrix[:, :weight_index + 1] * self.weights_[ + :weight_index + 1], + axis=1) + if self.mincq_tracking: + if weight_index == 0: + self.mincq_step_decisions[:, weight_index] = signs_array + else: + mincq_margins = np.sum(self.mincq_learners[ + weight_index - 1].majority_vote._weights * classification_matrix[ + :, + :weight_index + 1], + axis=1) + mincq_signs_array = np.array( + [int(x) for x in sign(mincq_margins)]) + mincq_signs_array[mincq_signs_array == -1] = 0 + self.mincq_step_decisions[:, + weight_index] = mincq_signs_array + # self.mincq_step_cbounds = self.mincq_learners[weight_index-1].majority_vote.cbound_value() + + def update_info_containers(self, y, voter_perf, k): + """Is used at each iteration to compute and store all the needed quantities for later analysis""" + self.tau.append( + np.sum(np.multiply(self.previous_vote, self.new_voter)) / float( + self.n_total_examples)) + # print(np.sum(np.multiply(self.previous_vote, self.new_voter))/float(self.n_total_examples)) + self.previous_vote += self.q * self.new_voter + self.norm.append(np.linalg.norm(self.previous_vote) ** 2) + self.previous_votes.append(self.previous_vote) + self.previous_margins.append( + np.sum(np.multiply(y, self.previous_vote)) / float( + self.n_total_examples)) + self.selected_margins.append( + np.sum(np.multiply(y, self.new_voter)) / float( + self.n_total_examples)) + train_metric = self.plotted_metric.score(y, np.sign(self.previous_vote)) + self.train_metrics.append(train_metric) + + # Used to compute the optimal c-bound distribution on the chose set + if self.mincq_tracking: + from ...monoview_classifiers.min_cq import MinCqLearner + mincq = MinCqLearner(10e-3, "stumps", n_stumps_per_attribute=1, + self_complemented=False) + training_set = self.classification_matrix[:, self.chosen_columns_] + mincq.fit(training_set, y) + mincq_pred = mincq.predict(training_set) + self.mincq_learners.append(mincq) + self.mincq_train_metrics.append( + self.plotted_metric.score(y, change_label_to_minus(mincq_pred))) + self.mincq_weights.append(mincq.majority_vote._weights) + self.mincq_c_bounds.append( + mincq.majority_vote.cbound_value(training_set, + y.reshape((y.shape[0],)))) + + def compute_voter_perf(self, formatted_y): + """Used to computer the performance (error or edge) of the selected voter""" + epsilon = self._compute_epsilon(formatted_y) + self.voter_perfs.append(epsilon) + return epsilon + + def _compute_epsilon(self, y): + """Updating the error variable, the old fashioned way uses the whole majority vote to update the error""" + ones_matrix = np.zeros(y.shape) + ones_matrix[np.multiply(y, self.new_voter.reshape( + y.shape)) < 0] = 1 # can np.divide if needed + epsilon = np.average(np.multiply(y, self.new_voter.reshape( + y.shape)), axis=0) + return epsilon + + def append_new_voter(self, new_voter_index): + """Used to append the voter to the majority vote""" + self.chosen_columns_.append(new_voter_index) + self.new_voter = self.classification_matrix[:, new_voter_index].reshape( + (self.n_total_examples, 1)) + + def init_boosting(self, m, y, y_kernel_matrix): + """THis initialization corressponds to the first round of boosting with equal weights for each examples and the voter chosen by it's margin.""" + + if self.random_start: + first_voter_index = 
self.random_state.choice( + np.where(np.sum(y_kernel_matrix, axis=0) > 0)[0]) + else: + first_voter_index, _ = self._find_best_weighted_margin( + y_kernel_matrix) + + self.chosen_columns_.append(first_voter_index) + self.new_voter = np.array(self.classification_matrix[:, + first_voter_index].reshape((m, 1)), copy=True) + + self.previous_vote = self.new_voter + self.norm.append(np.linalg.norm(self.previous_vote) ** 2) + + self.q = 1 + self.weights_.append(self.q) + + self.previous_margins.append( + np.sum(np.multiply(y, self.previous_vote)) / float( + self.n_total_examples)) + self.selected_margins.append(np.sum(np.multiply(y, self.previous_vote))) + self.tau.append( + np.sum(np.multiply(self.previous_vote, self.new_voter)) / float( + self.n_total_examples)) + + train_metric = self.plotted_metric.score(y, np.sign(self.previous_vote)) + self.train_metrics.append(train_metric) + + if self.mincq_tracking: + self.mincq_train_metrics.append(train_metric) + + def format_X_y(self, X, y): + """Formats the data : X -the examples- and y -the labels- to be used properly by the algorithm """ + if scipy.sparse.issparse(X): + logging.info('Converting to dense matrix.') + X = np.array(X.todense()) + # Initialization + y_neg = change_label_to_minus(y) + y_neg = y_neg.reshape((y.shape[0], 1)) + return X, y_neg + + def init_hypotheses(self, X, y): + """Inintialization for the hyptotheses used to build the boosted vote""" + if self.estimators_generator is "Stumps": + self.estimators_generator = StumpsClassifiersGenerator( + n_stumps_per_attribute=self.n_stumps, + self_complemented=self.self_complemented) + if self.estimators_generator is "Trees": + self.estimators_generator = TreeClassifiersGenerator( + n_trees=self.n_stumps, max_depth=self.max_depth, + self_complemented=self.self_complemented) + self.estimators_generator.fit(X, y) + self.classification_matrix = self._binary_classification_matrix(X) + self.train_shape = self.classification_matrix.shape + + m, n = self.classification_matrix.shape + y_kernel_matrix = np.multiply(y, self.classification_matrix) + + return m, n, y_kernel_matrix + + def init_info_containers(self): + """Initialize the containers that will be collected at each iteration for the analysis""" + self.weights_ = [] + self.chosen_columns_ = [] + self.fobidden_columns = [] + self.c_bounds = [] + self.voter_perfs = [] + self.example_weights_ = [] + self.train_metrics = [] + self.bounds = [] + self.disagreements = [] + self.margins = [] + self.previous_votes = [] + self.previous_margins = [] + self.respected_bound = True + self.selected_margins = [] + self.tau = [] + self.norm = [] + self.mincq_train_metrics = [] + self.mincq_c_bounds = [] + self.mincq_weights = [] + self.mincq_learners = [] + self.mincq_step_decisions = [] + + + def _find_best_weighted_margin(self, y_kernel_matrix, upper_bound=1.0, + lower_bound=0.0): + """Finds the new voter by choosing the one that has the best weighted margin between 0.5 and 0.55 + to avoid too god voters that will get all the votes weights""" + pseudo_h_values = ma.array(np.sum(y_kernel_matrix, axis=0), + fill_value=-np.inf) + pseudo_h_values[self.chosen_columns_] = ma.masked + return np.argmax(pseudo_h_values), [0] + + def _find_new_voter(self, y_kernel_matrix, y): + """Here, we solve the two_voters_mincq_problem for each potential new voter, + and select the one that has the smallest minimum""" + m = y_kernel_matrix.shape[0] + previous_sum = np.multiply(y, + self.previous_vote.reshape(m, 1)) + margin_old = np.sum(previous_sum) + + bad_margins = 
np.where(np.sum(y_kernel_matrix, axis=0) <= 0.0)[0] + + self.B2 = m + self.B1s = np.sum( + 2 * np.multiply(previous_sum, y_kernel_matrix), + axis=0) + self.B0 = np.sum(previous_sum ** 2) + + self.A2s = np.sum(y_kernel_matrix, axis=0) ** 2 + self.A1s = np.sum(y_kernel_matrix, axis=0) * margin_old * 2 + self.A0 = margin_old ** 2 + + C2s = (self.A1s * self.B2 - self.A2s * self.B1s) + C1s = 2 * (self.A0 * self.B2 - self.A2s * self.B0) + C0s = self.A0 * self.B1s - self.A1s * self.B0 + + sols = np.zeros(C0s.shape) - 3 + sols[np.where(C2s != 0)[0]] = (-C1s[np.where(C2s != 0)[0]] + np.sqrt( + C1s[np.where(C2s != 0)[0]] * C1s[np.where(C2s != 0)[0]] - 4 * C2s[ + np.where(C2s != 0)[0]] * C0s[np.where(C2s != 0)[0]])) / ( + 2 * C2s[ + np.where(C2s != 0)[0]]) + + masked_c_bounds = self.make_masked_c_bounds(sols, bad_margins) + if masked_c_bounds.mask.all(): + return "No more pertinent voters", 0 + else: + best_hyp_index = np.argmin(masked_c_bounds) + + self.c_bounds.append(masked_c_bounds[best_hyp_index]) + self.margins.append(math.sqrt(self.A2s[best_hyp_index] / m)) + self.disagreements.append(0.5 * self.B1s[best_hyp_index] / m) + return sols[best_hyp_index], best_hyp_index + + def make_masked_c_bounds(self, sols, bad_margins): + c_bounds = self.compute_c_bounds(sols) + trans_c_bounds = self.compute_c_bounds(sols + 1) + masked_c_bounds = ma.array(c_bounds, fill_value=np.inf) + # Masing Maximums + masked_c_bounds[c_bounds >= trans_c_bounds] = ma.masked + # Masking magrins <= 0 + masked_c_bounds[bad_margins] = ma.masked + # Masking weights < 0 (because self-complemented) + masked_c_bounds[sols < 0] = ma.masked + # Masking nan c_bounds + masked_c_bounds[np.isnan(c_bounds)] = ma.masked + if not self.twice_the_same: + masked_c_bounds[self.chosen_columns_] = ma.masked + return masked_c_bounds + + def compute_c_bounds(self, sols): + return 1 - (self.A2s * sols ** 2 + self.A1s * sols + self.A0) / (( + self.B2 * sols ** 2 + self.B1s * sols + self.B0) * self.n_total_examples) + + def _cbound(self, sol): + """Computing the objective function""" + return 1 - (self.A2 * sol ** 2 + self.A1 * sol + self.A0) / (( + self.B2 * sol ** 2 + self.B1 * sol + self.B0) * self.n_total_examples) + + def disagreement(self, sol): + return ( + self.B2 * sol ** 2 + self.B1 * sol + self.B0) / self.n_total_examples + + def margin(self, sol): + return ( + self.A2 * sol ** 2 + self.A1 * sol + self.A0) / self.n_total_examples + + def _best_sol(self, sols): + """Return the best min in the two possible sols""" + values = np.array([self._cbound(sol) for sol in sols]) + return sols[np.argmin(values)] + + def get_step_decision_test_graph(self, directory, y_test): + np.savetxt(directory + "y_test_step.csv", self.step_decisions, + delimiter=',') + step_metrics = [] + for step_index in range(self.step_decisions.shape[1] - 1): + step_metrics.append(self.plotted_metric.score(y_test, + self.step_decisions[:, + step_index])) + step_metrics = np.array(step_metrics) + np.savetxt(directory + "step_test_metrics.csv", step_metrics, + delimiter=',') + get_accuracy_graph(step_metrics, self.__class__.__name__, + directory + 'step_test_metrics.png', + self.plotted_metric, set="test") + + if self.mincq_tracking: + step_mincq_test_metrics = [] + for step_index in range(self.step_decisions.shape[1] - 1): + step_mincq_test_metrics.append(self.plotted_metric.score(y_test, + self.mincq_step_decisions[ + :, + step_index])) + np.savetxt(directory + "mincq_step_test_metrics.csv", + step_mincq_test_metrics, + delimiter=',') + get_accuracy_graph(step_metrics, 
self.__class__.__name__, + directory + 'step_test_metrics_comparaison.png', + self.plotted_metric, step_mincq_test_metrics, + "MinCQ metric", set="test") + + step_cbounds = [] + for step_index in range(self.step_prod.shape[1]): + num = np.sum(y_test * self.step_prod[:, step_index]) ** 2 + den = np.sum((self.step_prod[:, step_index]) ** 2) + step_cbounds.append(1 - num / (den * self.step_prod.shape[0])) + step_cbounds = np.array(step_cbounds) + np.savetxt(directory + "step_test_c_bounds.csv", step_cbounds, + delimiter=',') + get_accuracy_graph(step_cbounds, self.__class__.__name__, + directory + 'step_test_c_bounds.png', + "C_bound", set="test") + + def getInterpretCBBoost(self, directory, y_test=None): + self.directory = directory + """Used to interpret the functionning of the algorithm""" + if self.step_decisions is not None: + self.get_step_decision_test_graph(directory, y_test) + # get_accuracy_graph(self.voter_perfs[:20], self.__class__.__name__, + # directory + 'voter_perfs.png', "Rs") + get_accuracy_graph(self.weights_, self.__class__.__name__, + directory + 'vote_weights.png', "weights", + zero_to_one=False) + get_accuracy_graph(self.c_bounds, self.__class__.__name__, + directory + 'c_bounds.png', "C-Bounds") + if self.mincq_tracking: + get_accuracy_graph(self.c_bounds, self.__class__.__name__, + directory + 'c_bounds_comparaison.png', + "1-var mins", self.mincq_c_bounds, "MinCQ min", + zero_to_one=False) + get_accuracy_graph(self.train_metrics, self.__class__.__name__, + directory + 'train_metrics_comparaison.png', + self.plotted_metric, + self.mincq_train_metrics, "MinCQ metrics") + get_accuracy_graph(self.previous_margins, self.__class__.__name__, + directory + 'margins.png', "Margins", + zero_to_one=False) + get_accuracy_graph(self.selected_margins, self.__class__.__name__, + directory + 'selected_margins.png', + "Selected Margins") + self.tau[0] = 0 + get_accuracy_graph(self.tau, self.__class__.__name__, + directory + 'disagreements.png', "disagreements", + zero_to_one=False) + get_accuracy_graph(self.train_metrics[:-1], self.__class__.__name__, + directory + 'c_bounds_train_metrics.png', + self.plotted_metric, self.c_bounds, "C-Bound", + self.bounds[:-1]) + get_accuracy_graph(self.norm, self.__class__.__name__, + directory + 'norms.png', + "squared 2-norm", zero_to_one=False) + interpretString = getInterpretBase(self, directory, + self.__class__.__name__, + self.weights_, self.break_cause) + if self.save_train_data: + np.savetxt(directory + "x_train.csv", self.X_train, delimiter=',') + np.savetxt(directory + "y_train.csv", self.y_train, delimiter=',') + np.savetxt(directory + "raw_weights.csv", self.raw_weights, + delimiter=',') + np.savetxt(directory + "c_bounds.csv", self.c_bounds, delimiter=',') + np.savetxt(directory + "train_metrics.csv", self.train_metrics, + delimiter=',') + np.savetxt(directory + "margins.csv", self.previous_margins, + delimiter=',') + np.savetxt(directory + "disagreements.csv", self.tau, + delimiter=',') + np.savetxt(directory + "disagreements.csv", self.norm, + delimiter=',') + if self.mincq_tracking: + np.savetxt(directory + "mincq_cbounds.csv", self.mincq_c_bounds, + delimiter=',') + np.savetxt(directory + "mincq_train_metrics.csv", + self.mincq_train_metrics, + delimiter=',') + args_dict = dict( + (arg_name, str(self.__dict__[arg_name])) for arg_name in + self.printed_args_name_list) + interpretString += "\n \n With arguments : \n" + u'\u2022 ' + ( + "\n" + u'\u2022 ').join(['%s: \t%s' % (key, value) + for (key, value) in + args_dict.items()]) + 
if not self.respected_bound: + interpretString += "\n\n The bound was not respected" + + return interpretString diff --git a/summit/multiview_platform/monoview_classifiers/additions/CQBoostUtils.py b/summit/multiview_platform/monoview_classifiers/additions/CQBoostUtils.py new file mode 100644 index 00000000..40122b1d --- /dev/null +++ b/summit/multiview_platform/monoview_classifiers/additions/CQBoostUtils.py @@ -0,0 +1,335 @@ +import logging +import math +import time +from collections import defaultdict + +import numpy as np +import numpy.ma as ma +import scipy +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.metrics import accuracy_score +from sklearn.utils.validation import check_is_fitted + +from .BoostUtils import StumpsClassifiersGenerator, ConvexProgram, sign, \ + BaseBoost, TreeClassifiersGenerator +from ... import metrics + + +class ColumnGenerationClassifier(BaseEstimator, ClassifierMixin, BaseBoost): + def __init__(self, mu=0.01, epsilon=1e-06, n_max_iterations=100, + estimators_generator="Stumps", dual_constraint_rhs=0, max_depth=1, + save_iteration_as_hyperparameter_each=None, random_state=None): + super(ColumnGenerationClassifier, self).__init__() + self.epsilon = epsilon + self.n_max_iterations = n_max_iterations + self.estimators_generator = estimators_generator + self.dual_constraint_rhs = dual_constraint_rhs + self.mu = mu + self.max_depth=max_depth + self.train_time = 0 + self.plotted_metric = metrics.zero_one_loss + self.random_state = random_state + + def fit(self, X, y): + if scipy.sparse.issparse(X): + X = np.array(X.todense()) + + y[y == 0] = -1 + + if self.estimators_generator is "Stumps": + self.estimators_generator = StumpsClassifiersGenerator( + n_stumps_per_attribute=self.n_stumps, self_complemented=True) + elif self.estimators_generator is "Trees": + self.estimators_generator = TreeClassifiersGenerator( + max_depth=self.max_depth, n_trees=self.n_stumps, + self_complemented=True) + + self.estimators_generator.fit(X, y) + self.classification_matrix = self._binary_classification_matrix(X) + self.c_bounds = [] + + self.infos_per_iteration_ = defaultdict(list) + + m, n = self.classification_matrix.shape + self.chosen_columns_ = [] + self.n_total_hypotheses_ = n + self.n_total_examples = m + self.train_shape = self.classification_matrix.shape + + y_kernel_matrix = np.multiply(y.reshape((len(y), 1)), + self.classification_matrix) + + # Initialization + alpha = self._initialize_alphas(m) + self.initialize() + self.train_metrics = [] + self.gammas = [] + self.list_weights = [] + self.bounds = [] + self.previous_votes = [] + # w = [0.5,0.5] + w = None + self.collected_weight_vectors_ = {} + self.collected_dual_constraint_violations_ = {} + start = time.time() + + for k in range(min(n, + self.n_max_iterations if self.n_max_iterations is not None else np.inf)): + # Find worst weak hypothesis given alpha. + h_values = ma.array( + np.squeeze(np.array((alpha).T.dot(y_kernel_matrix).T)), + fill_value=-np.inf) + + if self.chosen_columns_: + h_values[self.chosen_columns_] = ma.masked + + worst_h_index = ma.argmax(h_values) + + # Check for optimal solution. We ensure at least one complete iteration is done as the initialization + # values might provide a degenerate initial solution. + if self.chosen_columns_: + if h_values[ + worst_h_index] <= self.dual_constraint_rhs + self.epsilon and len( + self.chosen_columns_) > 0: + break + + # Append the weak hypothesis. 
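+            # This is the column-generation step: the hypothesis whose dual
+            # constraint is most violated under the current alphas enters the
+            # restricted master problem solved just below.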
+ self.chosen_columns_.append(worst_h_index) + self.matrix_to_optimize = self.get_matrix_to_optimize( + y_kernel_matrix, w) + + # Solve restricted master for new costs. + w, alpha = self._restricted_master_problem(previous_w=w, + previous_alpha=alpha) + cbound = self.compute_empiric_cbound(w, y_kernel_matrix) + self.c_bounds.append(cbound) + self.list_weights.append(w) + + self.update_values(h_values, worst_h_index, alpha, w) + + margins = self.get_margins(w) + signs_array = np.array([int(x) for x in sign(margins)]) + self.train_metrics.append(self.plotted_metric.score(y, signs_array)) + self.gammas.append(accuracy_score(y, signs_array)) + self.bounds.append( + math.exp(-2 * np.sum(np.square(np.array(self.gammas))))) + + self.nb_opposed_voters = self.check_opposed_voters() + self.compute_weights_(w) + # self.weights_ = w + self.estimators_generator.choose(self.chosen_columns_) + end = time.time() + + self.train_time = end - start + y[y == -1] = 0 + return self + + def predict(self, X): + start = time.time() + check_is_fitted(self, 'weights_') + + if scipy.sparse.issparse(X): + logging.warning('Converting sparse matrix to dense matrix.') + X = np.array(X.todense()) + + classification_matrix = self._binary_classification_matrix(X) + margins = np.squeeze( + np.asarray(np.dot(classification_matrix, self.weights_))) + + signs_array = np.array([int(x) for x in sign(margins)]) + signs_array[signs_array == -1] = 0 + end = time.time() + self.predict_time = end - start + self.step_predict(classification_matrix) + return signs_array + + def compute_empiric_cbound(self, w, y_kernel_matrix): + cbound = 1 - (1.0 / self.n_total_examples) * (np.sum( + np.average(y_kernel_matrix[:, self.chosen_columns_], axis=1, + weights=w)) ** 2 / + np.sum(np.average( + y_kernel_matrix[:, + self.chosen_columns_], + axis=1, + weights=w) ** 2)) + return cbound + + def step_predict(self, classification_matrix): + if classification_matrix.shape != self.train_shape: + self.step_decisions = np.zeros(classification_matrix.shape) + self.step_prod = np.zeros(classification_matrix.shape) + for weight_index in range(self.weights_.shape[0] - 1): + margins = np.sum(classification_matrix[:, :weight_index + 1] * + self.list_weights[weight_index], axis=1) + signs_array = np.array([int(x) for x in sign(margins)]) + signs_array[signs_array == -1] = 0 + self.step_decisions[:, weight_index] = signs_array + self.step_prod[:, weight_index] = np.sum( + classification_matrix[:, :weight_index + 1] * self.weights_[ + :weight_index + 1], + axis=1) + + def initialize(self): + pass + + def update_values(self, h_values=None, worst_h_index=None, alpha=None, + w=None): + pass + + def get_margins(self, w): + margins = np.squeeze(np.asarray( + np.dot(self.classification_matrix[:, self.chosen_columns_], w))) + return margins + + def compute_weights_(self, w=None): + self.weights_ = w + + def get_matrix_to_optimize(self, y_kernel_matrix, w=None): + return y_kernel_matrix[:, self.chosen_columns_] + + # def _binary_classification_matrix(self, X): + # probas = self._collect_probas(X) + # predicted_labels = np.argmax(probas, axis=2) + # predicted_labels[predicted_labels == 0] = -1 + # values = np.max(probas, axis=2) + # return (predicted_labels * values).T + # + # def _collect_probas(self, X): + # return np.asarray([clf.predict_proba(X) for clf in self.estimators_generator.estimators_]) + + def _restricted_master_problem(self, previous_w=None, previous_alpha=None): + n_examples, n_hypotheses = self.matrix_to_optimize.shape + + m_eye = np.eye(n_examples) + 
m_ones = np.ones((n_examples, 1)) + + qp_a = np.vstack((np.hstack((-self.matrix_to_optimize, m_eye)), + np.hstack((np.ones((1, n_hypotheses)), + np.zeros((1, n_examples)))))) + + qp_b = np.vstack((np.zeros((n_examples, 1)), + np.array([1.0]).reshape((1, 1)))) + + qp_g = np.vstack((np.hstack( (-np.eye(n_hypotheses), np.zeros((n_hypotheses, n_examples)))), + np.hstack((np.zeros((1, n_hypotheses)), - 1.0 / n_examples * m_ones.T)))) + + qp_h = np.vstack((np.zeros((n_hypotheses, 1)), + np.array([-self.mu]).reshape((1, 1)))) + + qp = ConvexProgram() + qp.quadratic_func = 2.0 / n_examples * np.vstack((np.hstack((np.zeros( (n_hypotheses, n_hypotheses)), np.zeros( (n_hypotheses, n_examples)))), + np.hstack((np.zeros(( n_examples, n_hypotheses)), m_eye)))) + + qp.add_equality_constraints(qp_a, qp_b) + qp.add_inequality_constraints(qp_g, qp_h) + + if previous_w is not None: + qp.initial_values = np.append(previous_w, [0]) + + try: + solver_result = qp.solve(abstol=1e-10, reltol=1e-10, feastol=1e-10, + return_all_information=True) + w = np.asarray(np.array(solver_result['x']).T[0])[:n_hypotheses] + + # The alphas are the Lagrange multipliers associated with the equality constraints (returned as the y vector in CVXOPT). + dual_variables = np.asarray(np.array(solver_result['y']).T[0]) + alpha = dual_variables[:n_examples] + + # Set the dual constraint right-hand side to be equal to the last Lagrange multiplier (nu). + # Hack: do not change nu if the QP didn't fully solve... + if solver_result['dual slack'] <= 1e-8: + self.dual_constraint_rhs = dual_variables[-1] + # logging.info('Updating dual constraint rhs: {}'.format(self.dual_constraint_rhs)) + + except Exception: + logging.warning( 'QP solving failed with {} chosen hypotheses.'.format(n_hypotheses)) + if previous_w is not None: + w = np.append(previous_w, [0]) + else: + w = np.array([1.0 / n_hypotheses] * n_hypotheses) + + if previous_alpha is not None: + alpha = previous_alpha + else: + alpha = self._initialize_alphas(n_examples) + + return w, alpha + + def _initialize_alphas(self, n_examples): + return 1.0 / n_examples * np.ones((n_examples,)) + +# class CqBoostClassifier(ColumnGenerationClassifier): +# def __init__(self, mu=0.001, epsilon=1e-08, n_max_iterations=None, estimators_generator=None, save_iteration_as_hyperparameter_each=None): +# super(CqBoostClassifier, self).__init__(epsilon, n_max_iterations, estimators_generator, dual_constraint_rhs=0, +# save_iteration_as_hyperparameter_each=save_iteration_as_hyperparameter_each) +# # TODO: Check the value of nu (dual_constraint_rhs) at initialization; it is ignored anyway, since +# # the main loop cannot be exited with only one voter.
+# self.mu = mu +# self.train_time = 0 +# +# def _restricted_master_problem(self, y_kernel_matrix, previous_w=None, previous_alpha=None): +# n_examples, n_hypotheses = y_kernel_matrix.shape +# +# m_eye = np.eye(n_examples) +# m_ones = np.ones((n_examples, 1)) +# +# qp_a = np.vstack((np.hstack((-y_kernel_matrix, m_eye)), +# np.hstack((np.ones((1, n_hypotheses)), np.zeros((1, n_examples)))))) +# +# qp_b = np.vstack((np.zeros((n_examples, 1)), +# np.array([1.0]).reshape((1, 1)))) +# +# qp_g = np.vstack((np.hstack((-np.eye(n_hypotheses), np.zeros((n_hypotheses, n_examples)))), +# np.hstack((np.zeros((1, n_hypotheses)), - 1.0 / n_examples * m_ones.T)))) +# +# qp_h = np.vstack((np.zeros((n_hypotheses, 1)), +# np.array([-self.mu]).reshape((1, 1)))) +# +# qp = ConvexProgram() +# qp.quadratic_func = 2.0 / n_examples * np.vstack((np.hstack((np.zeros((n_hypotheses, n_hypotheses)), np.zeros((n_hypotheses, n_examples)))), +# np.hstack((np.zeros((n_examples, n_hypotheses)), m_eye)))) +# +# qp.add_equality_constraints(qp_a, qp_b) +# qp.add_inequality_constraints(qp_g, qp_h) +# +# if previous_w is not None: +# qp.initial_values = np.append(previous_w, [0]) +# +# try: +# solver_result = qp.solve(abstol=1e-10, reltol=1e-10, feastol=1e-10, return_all_information=True) +# w = np.asarray(np.array(solver_result['x']).T[0])[:n_hypotheses] +# +# # The alphas are the Lagrange multipliers associated with the equality constraints (returned as the y vector in CVXOPT). +# dual_variables = np.asarray(np.array(solver_result['y']).T[0]) +# alpha = dual_variables[:n_examples] +# +# # Set the dual constraint right-hand side to be equal to the last lagrange multiplier (nu). +# # Hack: do not change nu if the QP didn't fully solve... +# if solver_result['dual slack'] <= 1e-8: +# self.dual_constraint_rhs = dual_variables[-1] +# # logging.info('Updating dual constraint rhs: {}'.format(self.dual_constraint_rhs)) +# +# except: +# logging.warning('QP Solving failed at iteration {}.'.format(n_hypotheses)) +# if previous_w is not None: +# w = np.append(previous_w, [0]) +# else: +# w = np.array([1.0 / n_hypotheses] * n_hypotheses) +# +# if previous_alpha is not None: +# alpha = previous_alpha +# else: +# alpha = self._initialize_alphas(n_examples) +# +# return w, alpha +# +# def _initialize_alphas(self, n_examples): +# return 1.0 / n_examples * np.ones((n_examples,)) diff --git a/summit/multiview_platform/monoview_classifiers/additions/MinCQUtils.py b/summit/multiview_platform/monoview_classifiers/additions/MinCQUtils.py new file mode 100644 index 00000000..af31d52b --- /dev/null +++ b/summit/multiview_platform/monoview_classifiers/additions/MinCQUtils.py @@ -0,0 +1,321 @@ +# -*- coding: utf-8 -*- +"""MinCq algorithm. + +Related papers: +[1] From PAC-Bayes Bounds to Quadratic Programs for Majority Votes (Laviolette et al., 2011) +[2] Risk Bounds for the Majority Vote: From a PAC-Bayesian Analysis to a Learning Algorithm (Germain et al., 2015) + +""" +from __future__ import print_function, division, absolute_import +import time +from operator import xor + +import numpy as np +from sklearn.ensemble import VotingClassifier +from sklearn.manifold import SpectralEmbedding +from sklearn.preprocessing import LabelEncoder +from sklearn.utils.graph import graph_laplacian +from sklearn.utils.validation import check_X_y + +from .BoostUtils import ConvexProgram +from ...monoview.monoview_utils import change_label_to_zero, change_label_to_minus + + +class MinCqClassifier(VotingClassifier): + """ + Base MinCq algorithm learner. See [1, 2]. 
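+    MinCq learns a weight distribution over a set of voters by minimizing the second moment of the + margin of the majority vote (the denominator of the C-bound) while fixing its first moment to mu, + which reduces to the quadratic program built in _solve.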
+    This version is an attempt at creating a more general version of MinCq that handles multiclass classification. + For binary classification, use RegularizedBinaryMinCqClassifier. + + Parameters + ---------- + mu : float + The fixed value of the first moment of the margin. + + """ + + def __init__(self, estimators_generator=None, estimators=None, mu=0.001, + omega=0.5, use_binary=False, zeta=0, gamma=1, n_neighbors=5): + if estimators is None: + estimators = [] + + super().__init__(estimators=estimators, voting='soft', + flatten_transform=False) + self.estimators_generator = estimators_generator + self.mu = mu + self.omega = omega + self.use_binary = use_binary + self.zeta = zeta + self.gamma = gamma + self.n_neighbors = n_neighbors + + def fit(self, X, y): + """Fit the estimators and learn the weights. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Training vectors, where n_samples is the number of samples and + n_features is the number of features. + y : array-like, shape = [n_samples] + Target values. If y is a masked-array (numpy.ma), the masked values are unlabeled examples. + + Returns + ------- + self : object + + """ + # Validations + assert 0 < self.mu <= 1, "MinCqClassifier: mu parameter must be in (0, 1]" + assert xor(bool(self.estimators_generator), bool( self.estimators)), "MinCqClassifier: exactly one of estimators_generator or estimators must be used." + X, y = check_X_y(X, change_label_to_minus(y)) + + # Fit the estimators using VotingClassifier's fit method. This will also fit a LabelEncoder that can be + # used to "normalize" labels (0, 1, 2, ...). In the case of binary classification, the two classes will be 0 and 1. + # First, ensure that the weights are reset to None (as cloning a VotingClassifier keeps the weights) + self.weights = None + # clean_me is read after the QP is solved, so it must be initialized for both branches below. + self.clean_me = False + # TODO: Ensure estimators can deal with masked arrays + + # If we use an estimator generator, use the data-dependent estimator generator to generate them, and fit again.
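+        # Illustrative usage, assuming the stump generator from BoostUtils: either pass pre-built voters, + # e.g. MinCqClassifier(estimators=[('a', clf_a), ('b', clf_b)], mu=0.001).fit(X, y), or let a + # data-dependent generator build them, e.g. + # MinCqClassifier(estimators_generator=StumpsClassifiersGenerator( + # n_stumps_per_attribute=1, self_complemented=True), mu=0.001).fit(X, y).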
+ if self.estimators: + super().fit(X, y) + + else: + self.le_ = LabelEncoder() + self.le_.fit(y) + self.clean_me = True + + if isinstance(y, np.ma.MaskedArray): + transformed_y = np.ma.MaskedArray(self.le_.transform(y), y.mask) + else: + # transformed_y = self.le_.transform(y) + transformed_y = y + + self.estimators_generator.fit(X, transformed_y) + self.estimators = [('ds{}'.format(i), estimator) for i, estimator in + enumerate(self.estimators_generator.estimators_)] + super().fit(X, y) + + beg = time.time() + + # Preparation and resolution of the quadratic program + # logger.info("Preparing and solving QP...") + self.weights = self._solve(X, y) + if self.clean_me: + self.estimators = [] + # print(self.weights.shape) + # print(np.unique(self.weights)[0:10]) + # import pdb;pdb.set_trace() + self.train_cbound = 1 - (1.0 / X.shape[0]) * (np.sum( + np.multiply(change_label_to_minus(y), + np.average(self._binary_classification_matrix(X), + axis=1, weights=self.weights))) ** 2) / ( + np.sum(np.average( + self._binary_classification_matrix(X), + axis=1, weights=self.weights) ** 2)) + end = time.time() + self.train_time = end-beg + return self + + def _binary_classification_matrix(self, X): + probas = self.transform(X) + predicted_labels = np.argmax(probas, axis=2) + predicted_labels[predicted_labels == 0] = -1 + values = np.max(probas, axis=2) + return (predicted_labels * values).T + + def _multiclass_classification_matrix(self, X, y): + probas = self.transform(X).swapaxes(0, 1) + matrix = probas[np.arange(probas.shape[0]), :, y] + + return (matrix - self.omega) + + def predict(self, X): + if not self.estimators: + self.estimators = [('ds{}'.format(i), estimator) for i, estimator in + enumerate(self.estimators_generator.estimators_)] + self.clean_me = True + pred = super().predict(X) + if self.clean_me: + self.estimators = [] + return change_label_to_zero(pred) + + def _solve(self, X, y): + y = self.le_.transform(y) + + if self.use_binary: + assert len(self.le_.classes_) == 2 + + # TODO: Review the number of labeled examples when adding back the support for transductive learning. + classification_matrix = self._binary_classification_matrix(X) + + # We use {-1, 1} labels. + binary_labels = np.copy(y) + binary_labels[y == 0] = -1 + + multi_matrix = binary_labels.reshape( + (len(binary_labels), 1)) * classification_matrix + + else: + multi_matrix = self._multiclass_classification_matrix(X, y) + + n_examples, n_voters = np.shape(multi_matrix) + ftf = 1.0 / n_examples * multi_matrix.T.dot(multi_matrix) + yf = np.mean(multi_matrix, axis=0) + + # Objective function. + objective_matrix = 2 * ftf + objective_vector = None + + # Equality constraints (first moment of the margin equal to mu, Q sums to one) + equality_matrix = np.vstack( + (yf.reshape((1, n_voters)), np.ones((1, n_voters)))) + equality_vector = np.array([self.mu, 1.0]) + + # Lower and upper bounds, no quasi-uniformity. + lower_bound = 0.0 + # TODO: In the case of binary classification, no upper bound will give + # bad results. Using 1/n works, as it brings back the l_infinity + # regularization normally given by the quasi-uniformity constraint. + # upper_bound = 2.0/n_voters + upper_bound = None + + weights = self._solve_qp(objective_matrix, objective_vector, + equality_matrix, equality_vector, lower_bound, + upper_bound) + + # Keep learning information for further use. + self.learner_info_ = {} + + # We count the number of non-zero weights, including the implicit voters. 
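+        # Under the quasi-uniform, self-complemented reading of [1, 2], each explicit voter h_i is paired + # with an implicit complement of weight 1/n - q_i, so the second sum below counts the complements + # whose weight exceeds the tolerance (i.e. q_i < 1/n - 1e-12); see the commented conversion at the + # end of RegularizedBinaryMinCqClassifier._solve.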
# TODO: Verify how we define non-zero weights here, could be if the weight is near 1/2n. + n_nonzero_weights = np.sum(np.asarray(weights) > 1e-12) + n_nonzero_weights += np.sum( np.asarray(weights) < 1.0 / len(self.estimators_) - 1e-12) + self.learner_info_.update(n_nonzero_weights=n_nonzero_weights) + + return weights + + def _solve_qp(self, objective_matrix, objective_vector, equality_matrix, + equality_vector, lower_bound, upper_bound): + try: + qp = ConvexProgram() + qp.quadratic_func, qp.linear_func = objective_matrix, objective_vector + qp.add_equality_constraints(equality_matrix, equality_vector) + qp.add_lower_bound(lower_bound) + qp.add_upper_bound(upper_bound) + return qp.solve() + + except Exception: + # logger.warning("Error while solving the quadratic program.") + raise + + +class RegularizedBinaryMinCqClassifier(MinCqClassifier): + """MinCq, version published in [1] and [2], where the regularization comes from the enforced quasi-uniformity + of the posterior distribution on the symmetric hypothesis space. This version only works with {-1, 1} labels. + + [1] From PAC-Bayes Bounds to Quadratic Programs for Majority Votes (Laviolette et al., 2011) + [2] Risk Bounds for the Majority Vote: From a PAC-Bayesian Analysis to a Learning Algorithm (Germain et al., 2015) + + """ + + def fit(self, X, y): + beg = time.time() + # We first fit and learn the weights. + super().fit(X, y) + + # Validations + if isinstance(y, np.ma.MaskedArray): + assert len(self.classes_[np.where(np.logical_not( self.classes_.mask))]) == 2, "RegularizedBinaryMinCqClassifier: only supports binary classification." + else: + assert len( self.classes_) == 2, "RegularizedBinaryMinCqClassifier: only supports binary classification." + + # Then we "reverse" the negative weights and their associated voter's output. + for i, weight in enumerate(self.weights): + if weight < 0: + # logger.debug("Reversing decision of a binary voter") + self.weights[i] *= -1 + self.estimators_[i].reverse_decision() + end = time.time() + self.train_time = end - beg + return self + + def _solve(self, X, y): + if isinstance(y, np.ma.MaskedArray): + y = np.ma.MaskedArray(self.le_.transform(y), y.mask) + else: + y = self.le_.transform(y) + + classification_matrix = self._binary_classification_matrix(X) + n_examples, n_voters = np.shape(classification_matrix) + + if self.zeta == 0: + ftf = np.dot(np.transpose(classification_matrix), classification_matrix) + else: + I = np.eye(n_examples) + L = build_laplacian(X, n_neighbors=self.n_neighbors) + ftf = classification_matrix.T.dot( I + (self.zeta / n_examples) * L).dot(classification_matrix) + + # We use {-1, 1} labels. + binary_labels = np.ma.copy(y) + binary_labels[np.ma.where(y == 0)] = -1 + + # Objective function. + ftf_mean = np.mean(ftf, axis=1) + objective_matrix = 2.0 / n_examples * ftf + objective_vector = -1.0 / n_examples * ftf_mean.T + + # Equality constraint: first moment of the margin fixed to mu, only using labeled examples.
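+        # Restricted to the n explicit voters, fixing the first margin moment to mu over the 2n + # self-complemented voters yields (2 / m) * yf.dot(q) == mu + (1 / m) * mean(yf), with m the + # number of labeled examples (cf. Section 7.1 of [2]); this is exactly the equality assembled below.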
+        if isinstance(y, np.ma.MaskedArray): + labeled = np.where(np.logical_not(y.mask))[0] + binary_labels = binary_labels[labeled] + else: + labeled = range(len(y)) + + yf = binary_labels.T.dot(classification_matrix[labeled]) + yf_mean = np.mean(yf) + equality_matrix = 2.0 / len(labeled) * yf + equality_vector = self.mu + 1.0 / len(labeled) * yf_mean + + # Lower and upper bounds (quasi-uniformity constraints) + lower_bound = 0.0 + upper_bound = 1.0 / n_voters + + try: + weights = self._solve_qp(objective_matrix, objective_vector, + equality_matrix, equality_vector, + lower_bound, upper_bound) + except ValueError as e: + if "domain error" in e.args: + weights = np.ones(len(self.estimators_)) + else: + # Any other ValueError would otherwise leave weights unbound. + raise + + # Keep learning information for further use. + self.learner_info_ = {} + + # We count the number of non-zero weights, including the implicit voters. + # TODO: Verify how we define non-zero weights here, could be if the weight is near 1/2n. + n_nonzero_weights = np.sum(np.asarray(weights) > 1e-12) + n_nonzero_weights += np.sum( np.asarray(weights) < 1.0 / len(self.estimators_) - 1e-12) + self.learner_info_.update(n_nonzero_weights=n_nonzero_weights) + + # Conversion of the weights of the n first voters to weights on the implicit 2n voters. + # See Section 7.1 of [2] for an explanation. + # return np.array([2 * q - 1.0 / len(self.estimators_) for q in weights]) + return np.array(weights) + + +def build_laplacian(X, n_neighbors=None): + clf = SpectralEmbedding(n_neighbors=n_neighbors) + clf.fit(X) + w = clf.affinity_matrix_ + laplacian = graph_laplacian(w, normed=True) + return laplacian diff --git a/summit/multiview_platform/monoview_classifiers/cb_boost.py b/summit/multiview_platform/monoview_classifiers/cb_boost.py new file mode 100644 index 00000000..473fcc2c --- /dev/null +++ b/summit/multiview_platform/monoview_classifiers/cb_boost.py @@ -0,0 +1,102 @@ +from .additions.CBBoostUtils import CBBoostClassifier +from ..monoview.monoview_utils import BaseMonoviewClassifier, CustomRandint + + +classifier_class_name = "CBBoost" + +class CBBoost(CBBoostClassifier, BaseMonoviewClassifier): + """ + + Parameters + ---------- + random_state : int seed, RandomState instance, or None (default=None) + The seed of the pseudo random number generator to use when + shuffling the data.
+ + n_max_iterations : + + n_stumps : + + kwargs : others arguments + + Attributes + ---------- + param_names : names of parameter used for hyper parameter search + + distribs : + + classed_params : + + weird_strings : + + """ + def __init__(self, random_state=None, n_max_iterations=500, n_stumps=1, + **kwargs): + + super(CBBoost, self).__init__(n_max_iterations=n_max_iterations, + random_state=random_state, + self_complemented=True, + twice_the_same=False, + random_start=False, + n_stumps=n_stumps, + c_bound_sol=True, + estimators_generator="Stumps", + mincq_tracking=False + ) + self.param_names = ["n_max_iterations", "n_stumps", "random_state"] + self.distribs = [CustomRandint(low=2, high=500), [n_stumps], + [random_state]] + self.classed_params = [] + self.weird_strings = {} + + # def canProbas(self): + # """ + # Used to know if the classifier can return label probabilities + # + # Returns + # ------- + # True + # """ + # return True + + + def get_interpretation(self, directory, y_test, multi_class=False): + """ + return interpretation string + + Parameters + ---------- + + directory : + + y_test : + + Returns + ------- + + """ + return self.getInterpretCBBoost(directory, y_test) + + def get_name_for_fusion(self): + """ + + Returns + ------- + string name of fusion + """ + return "CBB" + + +# def formatCmdArgs(args): +# """Used to format kwargs for the parsed args""" +# kwargsDict = {"n_stumps": args.CBB_stumps, +# "n_max_iterations": args.CBB_n_iter} +# return kwargsDict + + +def paramsToSet(nIter, random_state): + """Used for weighted linear early fusion to generate random search sets""" + paramsSet = [] + for _ in range(nIter): + paramsSet.append({}) + return paramsSet diff --git a/summit/multiview_platform/monoview_classifiers/cq_boost.py b/summit/multiview_platform/monoview_classifiers/cq_boost.py new file mode 100644 index 00000000..7effe87e --- /dev/null +++ b/summit/multiview_platform/monoview_classifiers/cq_boost.py @@ -0,0 +1,76 @@ +import numpy as np + +from .additions.BoostUtils import getInterpretBase +from .additions.CQBoostUtils import ColumnGenerationClassifier +from ..monoview.monoview_utils import CustomUniform, CustomRandint, \ + BaseMonoviewClassifier + +classifier_class_name = "CQBoost" + +class CQBoost(ColumnGenerationClassifier, BaseMonoviewClassifier): + + def __init__(self, random_state=None, mu=0.01, epsilon=1e-06, n_stumps=1, + n_max_iterations=None, estimators_generator="Stumps", + max_depth=1, **kwargs): + super(CQBoost, self).__init__( + random_state=random_state, + mu=mu, + epsilon=epsilon, + estimators_generator=estimators_generator, + n_max_iterations=n_max_iterations, + max_depth=max_depth + ) + self.param_names = ["mu", "epsilon", "n_stumps", "random_state", + "n_max_iterations", "estimators_generator", + "max_depth"] + self.distribs = [CustomUniform(loc=0.5, state=1.0, multiplier="e-"), + CustomRandint(low=1, high=15, multiplier="e-"), + [n_stumps], [random_state], [n_max_iterations], + ["Stumps", "Trees"], CustomRandint(low=1, high=5)] + self.classed_params = [] + self.weird_strings = {} + self.n_stumps = n_stumps + if "nbCores" not in kwargs: + self.nbCores = 1 + else: + self.nbCores = kwargs["nbCores"] + + # def canProbas(self): + # """Used to know if the classifier can return label probabilities""" + # return False + + def get_interpretation(self, directory, y_test, multi_class=False): + np.savetxt(directory + "train_metrics.csv", self.train_metrics, + delimiter=',') + np.savetxt(directory + "c_bounds.csv", self.c_bounds, + delimiter=',') + 
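+        # c_bounds stores, for each boosting iteration, the empirical C-bound 1 - mu_1 ** 2 / mu_2 + # computed by compute_empiric_cbound, where mu_1 and mu_2 are the first and second moments of the + # margin under the current weights.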
np.savetxt(directory + "y_test_step.csv", self.step_decisions, + delimiter=',') + step_metrics = [] + for step_index in range(self.step_decisions.shape[1] - 1): + step_metrics.append(self.plotted_metric.score(y_test, + self.step_decisions[:, + step_index])) + step_metrics = np.array(step_metrics) + np.savetxt(directory + "step_test_metrics.csv", step_metrics, + delimiter=',') + return getInterpretBase(self, directory, "CQBoost", self.weights_, + y_test) + + +# def formatCmdArgs(args): +# """Used to format kwargs for the parsed args""" +# kwargsDict = {"mu": args.CQB_mu, +# "epsilon": args.CQB_epsilon, +# "n_stumps": args.CQB_stumps, +# "n_max_iterations": args.CQB_n_iter} +# return kwargsDict + + +def paramsToSet(nIter, randomState): + """Used for weighted linear early fusion to generate random search sets""" + paramsSet = [] + for _ in range(nIter): + paramsSet.append({"mu": 10 ** -randomState.uniform(0.5, 1.5), + "epsilon": 10 ** -randomState.randint(1, 15)}) + return paramsSet diff --git a/summit/multiview_platform/monoview_classifiers/imbalance_bagging.py b/summit/multiview_platform/monoview_classifiers/imbalance_bagging.py new file mode 100644 index 00000000..f6f901ac --- /dev/null +++ b/summit/multiview_platform/monoview_classifiers/imbalance_bagging.py @@ -0,0 +1,29 @@ +from imblearn.ensemble import BalancedBaggingClassifier +from sklearn.tree import DecisionTreeClassifier + +from ..monoview.monoview_utils import BaseMonoviewClassifier, CustomRandint, CustomUniform +from ..utils.base import base_boosting_estimators + +classifier_class_name = "ImbalanceBagging" + +class ImbalanceBagging(BaseMonoviewClassifier, BalancedBaggingClassifier): + + def __init__(self, random_state=None, base_estimator="DecisionTreeClassifier", + n_estimators=10, sampling_strategy="auto", + replacement=False, base_estimator_config=None): + base_estimator = self.get_base_estimator(base_estimator, + base_estimator_config) + super(ImbalanceBagging, self).__init__(random_state=random_state, base_estimator=base_estimator, + n_estimators=n_estimators, + sampling_strategy=sampling_strategy, + replacement=replacement) + + self.param_names = ["n_estimators", "base_estimator", "sampling_strategy",] + self.classed_params = ["base_estimator"] + self.distribs = [CustomRandint(low=1, high=50), + base_boosting_estimators, + ["auto"]] + self.weird_strings = {"base_estimator": "class_name"} + + + diff --git a/summit/multiview_platform/monoview_classifiers/scm.py b/summit/multiview_platform/monoview_classifiers/scm.py new file mode 100644 index 00000000..70a1c97d --- /dev/null +++ b/summit/multiview_platform/monoview_classifiers/scm.py @@ -0,0 +1,93 @@ +from pyscm.scm import SetCoveringMachineClassifier as scm + +from ..monoview.monoview_utils import CustomRandint, CustomUniform, \ + BaseMonoviewClassifier + +# Author-Info +__author__ = "Baptiste Bauvin" +__status__ = "Prototype" # Production, Development, Prototype + + +# class Decis +classifier_class_name = "SCM" + +class SCM(scm, BaseMonoviewClassifier): + """ + SCM Classifier + Parameters + ---------- + random_state (default : None) + model_type : string (default: "conjunction") + max_rules : int number maximum of rules (default : 10) + p : float value(default : 0.1 ) + + kwarg : others arguments + + Attributes + ---------- + param_names + + distribs + + classed_params + + weird_strings + + """ + + def __init__(self, random_state=None, model_type="conjunction", + max_rules=10, p=0.1, **kwargs): + """ + + Parameters + ---------- + random_state + model_type + max_rules + p + 
kwargs + """ + super(SCM, self).__init__( + random_state=random_state, + model_type=model_type, + max_rules=max_rules, + p=p + ) + self.param_names = ["model_type", "max_rules", "p", "random_state"] + self.distribs = [["conjunction", "disjunction"], + CustomRandint(low=1, high=15), + CustomUniform(loc=0, state=1), [random_state]] + self.classed_params = [] + self.weird_strings = {} + + # def canProbas(self): + # """ + # Used to know if the classifier can return label probabilities + # + # Returns + # ------- + # return False in any case + # """ + # return False + + def get_interpretation(self, directory, y_test, multi_class=False): + interpretString = "Model used : " + str(self.model_) + return interpretString + + +# def formatCmdArgs(args): +# """Used to format kwargs for the parsed args""" +# kwargsDict = {"model_type": args.SCM_model_type, +# "p": args.SCM_p, +# "max_rules": args.SCM_max_rules} +# return kwargsDict + + +def paramsToSet(nIter, random_state): + paramsSet = [] + for _ in range(nIter): + paramsSet.append( + {"model_type": random_state.choice(["conjunction", "disjunction"]), + "max_rules": random_state.randint(1, 15), + "p": random_state.random_sample()}) + return paramsSet diff --git a/summit/multiview_platform/multiview_classifiers/additions/kernel_learning.py b/summit/multiview_platform/multiview_classifiers/additions/kernel_learning.py new file mode 100644 index 00000000..842c0c7b --- /dev/null +++ b/summit/multiview_platform/multiview_classifiers/additions/kernel_learning.py @@ -0,0 +1,103 @@ +from sklearn.metrics import pairwise +import numpy as np + +from ...multiview.multiview_utils import BaseMultiviewClassifier +from ...utils.hyper_parameter_search import CustomUniform, CustomRandint +from ...utils.transformations import sign_labels, unsign_labels +from ...utils.dataset import get_examples_views_indices + +class KernelClassifier(BaseMultiviewClassifier): + + def __init__(self, random_state=None,): + super().__init__(random_state) + + # def _compute_kernels(self, X, example_indices, view_indices, ): + # new_X = {} + # for index, (kernel_function, kernel_config, view_index) in enumerate( + # zip(self.kernel_functions, self.kernel_configs, view_indices)): + # new_X[index] = kernel_function(X.get_v(view_index, + # example_indices), + # **kernel_config) + # return new_X + + def format_X(self, X, example_indices, view_indices): + example_indices, view_indices = get_examples_views_indices(X, + example_indices, + view_indices) + formatted_X = dict((index, X.get_v(view_index, example_indices=example_indices)) + for index, view_index in enumerate(view_indices)) + + return formatted_X, example_indices + + def extract_labels(self, predicted_labels): + signed_labels = np.sign(predicted_labels) + return unsign_labels(signed_labels) + + def init_kernels(self, nb_view=2, ): + if isinstance(self.kernel, KernelDistribution): + self.kernel = self.kernel.draw(nb_view) + elif isinstance(self.kernel, str): + self.kernel = [self.kernel + for _ in range(nb_view)] + elif isinstance(self.kernel, list): + pass + + if isinstance(self.kernel_params, KernelConfigDistribution): + self.kernel_params = self.kernel_params.draw(nb_view) + self.kernel_params = [kernel_config[kernel_name] + for kernel_config, kernel_name + in zip(self.kernel_params, + self.kernel)] + + elif isinstance(self.kernel_params, dict): + self.kernel_params = [self.kernel_params for _ in range(nb_view)] + else: + pass + + +class KernelConfigGenerator: + + def __init__(self): + pass + + def rvs(self, random_state=None): + 
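+        # rvs mimics the scipy.stats distribution API so that this generator can sit directly in a + # randomized hyper-parameter search space; the returned KernelConfigDistribution draws, per view, + # one parameter dict for each supported kernel. Note that random_state.randint(1) below can only + # return 0, so the drawn configuration is effectively seeded deterministically.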
return KernelConfigDistribution(seed=random_state.randint(1)) + + +class KernelConfigDistribution: + + def __init__(self, seed=42): + self.random_state=np.random.RandomState(seed) + self.possible_config = { + "additive_chi2": {"gamma": CustomUniform()}, + "rbf": {"gamma": CustomUniform()}, + "poly":{"degree": CustomRandint(1,4), "gamma":CustomUniform()} + } + + def draw(self, nb_view): + drawn_params = [{} for _ in range(nb_view)] + for view_index in range(nb_view): + for kernel_name, params_dict in self.possible_config.items(): + drawn_params[view_index][kernel_name] = {} + for param_name, distrib in params_dict.items(): + drawn_params[view_index][kernel_name][param_name] = distrib.rvs(self.random_state) + return drawn_params + + +class KernelGenerator: + + def __init__(self): + pass + + def rvs(self, random_state=None): + return KernelDistribution(seed=random_state.randint(1)) + + +class KernelDistribution: + + def __init__(self, seed=42): + self.random_state=np.random.RandomState(seed) + self.available_kernels = ["rbf"] + + def draw(self, nb_view): + return list(self.random_state.choice(self.available_kernels, nb_view)) diff --git a/summit/multiview_platform/multiview_classifiers/lp_norm_mkl.py b/summit/multiview_platform/multiview_classifiers/lp_norm_mkl.py new file mode 100644 index 00000000..94eae6f2 --- /dev/null +++ b/summit/multiview_platform/multiview_classifiers/lp_norm_mkl.py @@ -0,0 +1,40 @@ + +from multimodal.kernels.lpMKL import MKL + +from ..multiview.multiview_utils import BaseMultiviewClassifier, FakeEstimator +from .additions.kernel_learning import KernelClassifier, KernelConfigGenerator, KernelGenerator +from ..utils.hyper_parameter_search import CustomUniform, CustomRandint + + +classifier_class_name = "LPNormMKL" + +class LPNormMKL(KernelClassifier, MKL): + def __init__(self, random_state=None, lmbda=0.1, nystrom_param=1, n_loops=50, + precision=0.0001, use_approx=True, kernel="rbf", + kernel_params=None): + KernelClassifier.__init__(self, random_state) + MKL.__init__(self, lmbda, nystrom_param=nystrom_param, + kernel=kernel, + n_loops=n_loops, + precision=precision, + use_approx=use_approx, + kernel_params=kernel_params) + self.param_names = ["lmbda", "kernel", "kernel_params"] + self.distribs = [CustomUniform(), ['rbf', 'additive_chi2', 'poly' ], + KernelConfigGenerator()] + + def fit(self, X, y, train_indices=None, view_indices=None): + formatted_X, train_indices = self.format_X(X, train_indices, view_indices) + # try: + self.init_kernels(nb_view=len(formatted_X)) + # except: + # return FakeEstimator() + + return MKL.fit(self, formatted_X, y[train_indices]) + + def predict(self, X, example_indices=None, view_indices=None): + new_X, _ = self.format_X(X, example_indices, view_indices) + return self.extract_labels(MKL.predict(self, new_X)) + + + diff --git a/summit/multiview_platform/multiview_classifiers/mucombo.py b/summit/multiview_platform/multiview_classifiers/mucombo.py new file mode 100644 index 00000000..ec4973e3 --- /dev/null +++ b/summit/multiview_platform/multiview_classifiers/mucombo.py @@ -0,0 +1,48 @@ +from sklearn.tree import DecisionTreeClassifier + + +from multimodal.boosting.cumbo import MuCumboClassifier +from ..multiview.multiview_utils import BaseMultiviewClassifier +from ..utils.hyper_parameter_search import CustomRandint +from ..utils.dataset import get_examples_views_indices +from ..utils.base import base_boosting_estimators + +classifier_class_name = "MuCumbo" + + +class MuCumbo(BaseMultiviewClassifier, MuCumboClassifier): + + def 
__init__(self, base_estimator=None, + n_estimators=50, + random_state=None,**kwargs): + BaseMultiviewClassifier.__init__(self, random_state) + base_estimator = self.set_base_estim_from_dict(base_estimator, **kwargs) + MuCumboClassifier.__init__(self, base_estimator=base_estimator, + n_estimators=n_estimators, + random_state=random_state,) + self.param_names = ["base_estimator", "n_estimators", "random_state",] + self.distribs = [base_boosting_estimators, + CustomRandint(5,200), [random_state],] + + def fit(self, X, y, train_indices=None, view_indices=None): + train_indices, view_indices = get_examples_views_indices(X, + train_indices, + view_indices) + self.used_views = view_indices + numpy_X, view_limits = X.to_numpy_array(example_indices=train_indices, + view_indices=view_indices) + return MuCumboClassifier.fit(self, numpy_X, y[train_indices], + view_limits) + + def predict(self, X, example_indices=None, view_indices=None): + example_indices, view_indices = get_examples_views_indices(X, + example_indices, + view_indices) + self._check_views(view_indices) + numpy_X, view_limits = X.to_numpy_array(example_indices=example_indices, + view_indices=view_indices) + return MuCumboClassifier.predict(self, numpy_X) + + def get_interpretation(self, directory, base_file_name, labels, + multiclass=False): + return "" diff --git a/summit/multiview_platform/multiview_classifiers/mumbo.py b/summit/multiview_platform/multiview_classifiers/mumbo.py new file mode 100644 index 00000000..0fc63fb4 --- /dev/null +++ b/summit/multiview_platform/multiview_classifiers/mumbo.py @@ -0,0 +1,105 @@ +from sklearn.tree import DecisionTreeClassifier +import numpy as np +import os + +from multimodal.boosting.mumbo import MumboClassifier + +from ..multiview.multiview_utils import BaseMultiviewClassifier +from ..utils.hyper_parameter_search import CustomRandint +from ..utils.dataset import get_examples_views_indices +from ..utils.base import base_boosting_estimators +from ..utils.organization import secure_file_path +from .. import monoview_classifiers + +classifier_class_name = "Mumbo" + +class Mumbo(BaseMultiviewClassifier, MumboClassifier): + + def __init__(self, base_estimator=None, + n_estimators=50, + random_state=None, + best_view_mode="edge", **kwargs): + BaseMultiviewClassifier.__init__(self, random_state) + base_estimator = self.set_base_estim_from_dict(base_estimator, **kwargs) + MumboClassifier.__init__(self, base_estimator=base_estimator, + n_estimators=n_estimators, + random_state=random_state, + best_view_mode=best_view_mode) + self.param_names = ["base_estimator", "n_estimators", "random_state", "best_view_mode"] + self.distribs = [base_boosting_estimators, + CustomRandint(5,200), [random_state], ["edge", "error"]] + + def set_params(self, base_estimator=None, **params): + """ + Sets the base estimator from a dict. 
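+        Overriding set_params keeps hyper-parameter search working whether base_estimator is passed as + None (falling back to a DecisionTreeClassifier), as a dict specification, or as an estimator + instance.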
+ :param base_estimator: + :param params: + :return: + """ + if base_estimator is None: + self.base_estimator = DecisionTreeClassifier() + elif isinstance(base_estimator, dict): + self.base_estimator = self.set_base_estim_from_dict(base_estimator) + MumboClassifier.set_params(self, **params) + else: + MumboClassifier.set_params(self, base_estimator=base_estimator, **params) + + + def fit(self, X, y, train_indices=None, view_indices=None): + train_indices, view_indices = get_examples_views_indices(X, + train_indices, + view_indices) + self.used_views = view_indices + self.view_names = [X.get_view_name(view_index) + for view_index in view_indices] + numpy_X, view_limits = X.to_numpy_array(example_indices=train_indices, + view_indices=view_indices) + self.view_shapes = [view_limits[ind+1]-view_limits[ind] + for ind in range(len(self.used_views)) ] + + return MumboClassifier.fit(self, numpy_X, y[train_indices], + view_limits) + + def predict(self, X, example_indices=None, view_indices=None): + example_indices, view_indices = get_examples_views_indices(X, + example_indices, + view_indices) + self._check_views(view_indices) + numpy_X, view_limits = X.to_numpy_array(example_indices=example_indices, + view_indices=view_indices) + return MumboClassifier.predict(self, numpy_X) + + def get_interpretation(self, directory, base_file_name, labels, multiclass=False): + self.view_importances = np.zeros(len(self.used_views)) + self.feature_importances_ = [np.zeros(view_shape) + for view_shape in self.view_shapes] + for best_view, estimator_weight, estimator in zip(self.best_views_, self.estimator_weights_, self.estimators_): + self.view_importances[best_view] += estimator_weight + if hasattr(estimator, "feature_importances_"): + self.feature_importances_[best_view] += estimator.feature_importances_ + importances_sum = sum([np.sum(feature_importances) + for feature_importances + in self.feature_importances_]) + self.feature_importances_ = [feature_importances/importances_sum + for feature_importances + in self.feature_importances_] + for feature_importances, view_name in zip(self.feature_importances_, self.view_names): + secure_file_path(os.path.join(directory, "feature_importances", + base_file_name+view_name+"-feature_importances.csv")) + np.savetxt(os.path.join(directory, "feature_importances", + base_file_name+view_name+"-feature_importances.csv"), + feature_importances, delimiter=',') + self.view_importances /= np.sum(self.view_importances) + np.savetxt(os.path.join(directory, base_file_name+"view_importances.csv"), self.view_importances, + delimiter=',') + + sorted_view_indices = np.argsort(-self.view_importances) + interpret_string = "Mumbo used {} iterations to converge.".format(self.best_views_.shape[0]) + interpret_string+= "\n\nViews importance : \n" + for view_index in sorted_view_indices: + interpret_string+="- View {} ({}), importance {}\n".format(view_index, + self.view_names[view_index], + self.view_importances[view_index]) + interpret_string +="\n The boosting process selected views : \n" + ", ".join(map(str, self.best_views_)) + interpret_string+="\n\n With estimator weights : \n"+ "\n".join(map(str,self.estimator_weights_/np.sum(self.estimator_weights_))) + return interpret_string diff --git a/summit/multiview_platform/multiview_classifiers/mvml.py b/summit/multiview_platform/multiview_classifiers/mvml.py new file mode 100644 index 00000000..2e385761 --- /dev/null +++ b/summit/multiview_platform/multiview_classifiers/mvml.py @@ -0,0 +1,522 @@ + +from multimodal.kernels.mvml import MVML + 
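+# The wrapper below adapts the external multimodal MVML implementation to the platform's multiview API; +# the large commented-out block at the end of this file keeps an earlier self-contained MVML +# implementation for reference.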
+from ..multiview.multiview_utils import BaseMultiviewClassifier, FakeEstimator +from .additions.kernel_learning import KernelClassifier, KernelConfigGenerator, KernelGenerator +from ..utils.hyper_parameter_search import CustomUniform, CustomRandint + + +classifier_class_name = "MVMLClassifier" + + +class MVMLClassifier(KernelClassifier, MVML): + + def __init__(self, random_state=None, lmbda=0.1, eta=0.1, nystrom_param=1, + n_loops=50, + precision=0.0001, learn_A=0, kernel="rbf", learn_w=0, + kernel_params=None): + KernelClassifier.__init__(self, random_state) + MVML.__init__(self, lmbda=lmbda, eta=eta, + nystrom_param=nystrom_param, + kernel=kernel, + n_loops=n_loops, + precision=precision, + learn_A=learn_A, + learn_w=learn_w, + kernel_params=kernel_params) + self.param_names = ["lmbda", "eta", "nystrom_param", "learn_A", + "learn_w", "n_loops", "kernel_params", "kernel", + "precision"] + self.distribs = [CustomUniform(), + CustomUniform(), + CustomUniform(), + [1,3,4], + [0,1], + CustomRandint(low=5, high=25), + KernelConfigGenerator(), + ['rbf', 'additive_chi2', 'poly' ], + CustomRandint(low=3, high=6, multiplier="e-")] + + def fit(self, X, y, train_indices=None, view_indices=None): + formatted_X, train_indices = self.format_X(X, train_indices, view_indices) + try: + self.init_kernels(nb_view=len(formatted_X)) + except: + return FakeEstimator() + return MVML.fit(self, formatted_X, y[train_indices]) + + def predict(self, X, example_indices=None, view_indices=None): + new_X, _ = self.format_X(X, example_indices, view_indices) + return self.extract_labels(MVML.predict(self, new_X)) + + +# class MVML(BaseEstimator, ClassifierMixin): +# r""" +# The MVML Classifier +# +# Parameters +# ---------- +# regression_params: array/list of regression parameters, first for basic regularization, second for +# regularization of A (not necessary if A is not learned) +# +# nystrom_param: value between 0 and 1 indicating level of nyström approximation; 1 = no approximation +# +# learn_A : integer (default 1) choose if A is learned or not: 1 - yes (default); +# 2 - yes, sparse; 3 - no (MVML_Cov); 4 - no (MVML_I) +# +# learn_w : integer (default 0) where learn w is needed +# +# n_loops : (default 0) number of itterions +# +# +# Attributes +# ---------- +# reg_params : array/list of regression parameters +# +# learn_A : 1 where Learn matrix A is needded +# +# learn_w : integer where learn w is needed +# +# n_loops : number of itterions +# +# n_approx : number of samples in approximation, equals n if no approx. +# +# X_ : :class:`metriclearning.datasets.data_sample.Metriclearn_array` array of input sample +# +# y_ : array-like, shape = (n_samples,) +# Target values (class labels). +# +# """ +# +# def __init__(self, regression_params, nystrom_param, learn_A=1, learn_w=0, n_loops=6): +# +# # calculate nyström approximation (if used) +# self.nystrom_param = nystrom_param +# +# self.reg_params = regression_params +# self.learn_A = learn_A +# self.learn_w = learn_w +# self.n_loops = n_loops +# +# def fit(self, X, y= None, views_ind=None): +# """ +# Fit the MVML classifier +# Parameters +# ---------- +# +# X : Metriclearn_array {array-like, sparse matrix}, shape = (n_samples, n_features) +# Training multi-view input samples. +# +# +# y : array-like, shape = (n_samples,) +# Target values (class labels). 
+# array of length n_samples containing the classification/regression labels +# for training data +# +# views_ind : array-like (default=[0, n_features//2, n_features]) +# Paramater specifying how to extract the data views from X: +# +# - If views_ind is a 1-D array of sorted integers, the entries +# indicate the limits of the slices used to extract the views, +# where view ``n`` is given by +# ``X[:, views_ind[n]:views_ind[n+1]]``. +# +# With this convention each view is therefore a view (in the NumPy +# sense) of X and no copy of the data is done. +# +# +# Returns +# ------- +# +# self : object +# Returns self. +# """ +# # Check that X and y have correct shape +# +# # Store the classes seen during fit +# if isinstance(X, Metriclearn_array): +# self.X_ = X +# elif isinstance(X, np.ndarray) : +# self.X_= Metriclearn_array(X, views_ind) +# elif isinstance(X, dict): +# self.X_= Metriclearn_array(X) +# else: +# raise TypeError("Input format is not reconized") +# check_X_y(self.X_, y) +# self.classes_ = unique_labels(y) +# self.y_ = y +# +# # n = X[0].shape[0] +# n = self.X_.shape[0] +# self.n_approx = int(np.floor(self.nystrom_param * n)) # number of samples in approximation, equals n if no approx. +# +# if self.nystrom_param < 1: +# self._calc_nystrom(self.X_) +# else: +# self.U_dict = self.X_.to_dict() +# +# # Return the classifier +# self.learn_mvml(learn_A=self.learn_A, learn_w=self.learn_w, n_loops=self.n_loops) +# return self +# +# def learn_mvml(self, learn_A=1, learn_w=0, n_loops=6): +# """ +# +# Parameters +# ---------- +# learn_A: int choose if A is learned or not (default: 1): +# 1 - yes (default); +# 2 - yes, sparse; +# 3 - no (MVML_Cov); +# 4 - no (MVML_I) +# learn_w: int choose if w is learned or not (default: 0): +# 0 - no (uniform 1/views, default setting), +# 1 - yes +# n_loops: int maximum number of iterations in MVML, (default: 6) +# usually something like default 6 is already converged +# +# Returns +# ------- +# tuple (A, g, w) with A (metrcic matrix - either fixed or learned), +# g (solution to learning problem), +# w (weights - fixed or learned) +# """ +# views = len(self.U_dict) +# n = self.U_dict[0].shape[0] +# lmbda = self.reg_params[0] +# if learn_A < 3: +# eta = self.reg_params[1] +# +# # ========= initialize A ========= +# +# # positive definite initialization (with multiplication with the U matrices if using approximation) +# A = np.zeros((views * self.n_approx, views * self.n_approx)) +# if learn_A < 3: +# for v in range(views): +# if self.nystrom_param < 1: +# A[v * self.n_approx:(v + 1) * self.n_approx, v * self.n_approx:(v + 1) * self.n_approx] = \ +# np.dot(np.transpose(self.U_dict[v]), self.U_dict[v]) +# else: +# A[v * self.n_approx:(v + 1) * self.n_approx, v * self.n_approx:(v + 1) * self.n_approx] = np.eye(n) +# # otherwise initialize like this if using MVML_Cov +# elif learn_A == 3: +# for v in range(views): +# for vv in range(views): +# if self.nystrom_param < 1: +# A[v * self.n_approx:(v + 1) * self.n_approx, vv * self.n_approx:(vv + 1) * self.n_approx] = \ +# np.dot(np.transpose(self.U_dict[v]), self.U_dict[vv]) +# else: +# A[v * self.n_approx:(v + 1) * self.n_approx, vv * self.n_approx:(vv + 1) * self.n_approx] = \ +# np.eye(n) +# # or like this if using MVML_I +# elif learn_A == 4: +# for v in range(views): +# if self.nystrom_param < 1: +# A[v * self.n_approx:(v + 1) * self.n_approx, v * self.n_approx:(v + 1) * self.n_approx] = \ +# np.eye(self.n_approx) +# else: +# # it might be wise to make a dedicated function for MVML_I if using no approximation 
+# # - numerical errors are more probable this way using inverse +# A[v * self.n_approx:(v + 1) * self.n_approx, v * self.n_approx:(v + 1) * self.n_approx] = \ +# np.linalg.pinv(self.U_dict[v]) # U_dict holds whole kernels if no approx +# +# # ========= initialize w, allocate g ========= +# w = (1 / views) * np.ones((views, 1)) +# g = np.zeros((views * self.n_approx, 1)) +# +# # ========= learn ========= +# loop_counter = 0 +# while True: +# +# if loop_counter > 0: +# g_prev = np.copy(g) +# A_prev = np.copy(A) +# w_prev = np.copy(w) +# +# # ========= update g ========= +# +# # first invert A +# try: +# A_inv = np.linalg.pinv(A + 1e-09 * np.eye(views * self.n_approx)) +# except np.linalg.linalg.LinAlgError: +# try: +# A_inv = np.linalg.pinv(A + 1e-06 * np.eye(views * self.n_approx)) +# except ValueError: +# return A_prev, g_prev +# except ValueError: +# return A_prev, g_prev +# +# # then calculate g (block-sparse multiplications in loop) using A_inv +# for v in range(views): +# for vv in range(views): +# A_inv[v * self.n_approx:(v + 1) * self.n_approx, vv * self.n_approx:(vv + 1) * self.n_approx] = \ +# w[v] * w[vv] * np.dot(np.transpose(self.U_dict[v]), self.U_dict[vv]) + \ +# lmbda * A_inv[v * self.n_approx:(v + 1) * self.n_approx, +# vv * self.n_approx:(vv + 1) * self.n_approx] +# g[v * self.n_approx:(v + 1) * self.n_approx, 0] = np.dot(w[v] * np.transpose(self.U_dict[v]), self.y_) +# +# try: +# g = np.dot(np.linalg.pinv(A_inv), g) # here A_inv isn't actually inverse of A (changed in above loop) +# except np.linalg.linalg.LinAlgError: +# g = np.linalg.solve(A_inv, g) +# +# # ========= check convergence ========= +# +# if learn_A > 2 and learn_w != 1: # stop at once if only g is to be learned +# break +# +# if loop_counter > 0: +# +# # convergence criteria +# g_diff = np.linalg.norm(g - g_prev) / np.linalg.norm(g_prev) +# A_diff = np.linalg.norm(A - A_prev, ord='fro') / np.linalg.norm(A_prev, ord='fro') +# if g_diff < 1e-4 and A_diff < 1e-4: +# break +# +# if loop_counter >= n_loops: # failsafe +# break +# +# # ========= update A ========= +# if learn_A == 1: +# A = self._learn_A_func(A, g, lmbda, eta) +# elif learn_A == 2: +# A = self._learn_blocksparse_A(A, g, views, self.n_approx, lmbda, eta) +# +# # ========= update w ========= +# if learn_w == 1: +# Z = np.zeros((n, views)) +# for v in range(views): +# Z[:, v] = np.dot(self.U_dict[v], g[v * self.n_approx:(v + 1) * self.n_approx]).ravel() +# w = np.dot(np.linalg.pinv(np.dot(np.transpose(Z), Z)), np.dot(np.transpose(Z), self.y_)) +# +# loop_counter += 1 +# self.g = g +# self.w = w +# self.A = A +# return A, g, w +# +# +# def predict(self, X, views_ind=None): +# """ +# +# Parameters +# ---------- +# X +# +# Returns +# ------- +# +# """ +# +# """ +# +# +# :param X: +# :return: +# """ +# if isinstance(X, Metriclearn_array): +# self.X_ = X +# elif isinstance(X, np.ndarray) : +# self.X_= Metriclearn_array(X, views_ind) +# elif isinstance(X, dict): +# self.X_= Metriclearn_array(X) +# else: +# raise TypeError("Input format is not reconized") +# check_is_fitted(self, ['X_', 'y_']) +# check_array(self.X_) +# check_is_fitted(self, ['X_', 'y_']) +# return self.predict_mvml(self.X_, self.g, self.w) +# +# def predict_mvml(self, test_kernels, g, w): +# +# """ +# :param test_kernels: dictionary of test kernels (as the dictionary of kernels in __init__) +# :param g: g, learning solution that is learned in learn_mvml +# :param w: w, weights for combining the solutions of views, learned in learn_mvml +# :return: (regression) predictions, array of size 
test_samples*1 +# """ +# +# views = len(self.U_dict) +# # t = test_kernels[0].shape[0] +# t = test_kernels.shape[0] +# X = np.zeros((t, views * self.n_approx)) +# for v in range(views): +# if self.nystrom_param < 1: +# X[:, v * self.n_approx:(v + 1) * self.n_approx] = w[v] * \ +# np.dot(test_kernels.get_view(v)[:, 0:self.n_approx], +# self.W_sqrootinv_dict[v]) +# else: +# X[:, v * self.n_approx:(v + 1) * self.n_approx] = w[v] * test_kernels[v] +# +# return np.dot(X, g) +# +# def _calc_nystrom(self, kernels): +# # calculates the nyström approximation for all the kernels in the given dictionary +# self.W_sqrootinv_dict = {} +# self.U_dict = {} +# for v in range(len(kernels.shapes_int)): +# kernel = kernels.get_view(v) +# E = kernel[:, 0:self.n_approx] +# W = E[0:self.n_approx, :] +# Ue, Va, _ = np.linalg.svd(W) +# vak = Va[0:self.n_approx] +# inVa = np.diag(vak ** (-0.5)) +# U_v = np.dot(E, np.dot(Ue[:, 0:self.n_approx], inVa)) +# self.U_dict[v] = U_v +# self.W_sqrootinv_dict[v] = np.dot(Ue[:, 0:self.n_approx], inVa) +# +# def _learn_A_func(self, A, g, lmbda, eta): +# +# # basic gradient descent +# +# stepsize = 0.5 +# if stepsize*eta >= 0.5: +# stepsize = 0.9*(1/(2*eta)) # make stepsize*eta < 0.5 +# +# loops = 0 +# not_converged = True +# while not_converged: +# +# A_prev = np.copy(A) +# +# A_pinv = np.linalg.pinv(A) +# A = (1-2*stepsize*eta)*A + stepsize*lmbda*np.dot(np.dot(A_pinv, g), np.dot(np.transpose(g), A_pinv)) +# +# if loops > 0: +# prev_diff = diff +# diff = np.linalg.norm(A - A_prev) / np.linalg.norm(A_prev) +# +# if loops > 0 and prev_diff > diff: +# A = A_prev +# stepsize = stepsize*0.1 +# +# if diff < 1e-5: +# not_converged = False +# +# if loops > 10: +# not_converged = False +# +# loops += 1 +# +# return A +# +# def _learn_blocksparse_A(self, A, g, views, m, lmbda, eta): +# +# # proximal gradient update method +# +# converged = False +# rounds = 0 +# +# L = lmbda * np.linalg.norm(np.dot(g, g.T)) +# # print("L ", L) +# +# while not converged and rounds < 100: +# +# # no line search - this has worked well enough experimentally +# A = self._proximal_update(A, views, m, L, g, lmbda, eta) +# +# # convergence +# if rounds > 0: +# A_diff = np.linalg.norm(A - A_prev) / np.linalg.norm(A_prev) +# +# if A_diff < 1e-3: +# converged = True +# +# A_prev = np.copy(A) +# +# rounds += 1 +# +# return A +# +# def _proximal_update(self, A_prev, views, m, L, D, lmbda, gamma): +# +# # proximal update +# +# # the inverse is not always good to compute - in that case just return the previous one and end the search +# try: +# A_prev_inv = np.linalg.pinv(A_prev) +# except np.linalg.linalg.LinAlgError: +# try: +# A_prev_inv = np.linalg.pinv(A_prev + 1e-6 * np.eye(views * m)) +# except np.linalg.linalg.LinAlgError: +# return A_prev +# except ValueError: +# return A_prev +# except ValueError: +# return A_prev +# +# if np.any(np.isnan(A_prev_inv)): +# # just in case the inverse didn't return a proper solution (happened once or twice) +# return A_prev +# +# A_tmp = A_prev + (lmbda / L) * np.dot(np.dot(A_prev_inv.T, D), np.dot(np.transpose(D), A_prev_inv.T)) +# +# # if there is one small negative eigenvalue this gets rid of it +# try: +# val, vec = np.linalg.eigh(A_tmp) +# except np.linalg.linalg.LinAlgError: +# return A_prev +# except ValueError: +# return A_prev +# val[val < 0] = 0 +# +# A_tmp = np.dot(vec, np.dot(np.diag(val), np.transpose(vec))) +# A_new = np.zeros((views*m, views*m)) +# +# # proximal update, group by group (symmetric!) 
+# for v in range(views): +# for vv in range(v + 1): +# if v != vv: +# if np.linalg.norm(A_tmp[v * m:(v + 1) * m, vv * m:(vv + 1) * m]) != 0: +# multiplier = 1 - gamma / (2 * np.linalg.norm(A_tmp[v * m:(v + 1) * m, vv * m:(vv + 1) * m])) +# if multiplier > 0: +# A_new[v * m:(v + 1) * m, vv * m:(vv + 1) * m] = multiplier * A_tmp[v * m:(v + 1) * m, +# vv * m:(vv + 1) * m] +# A_new[vv * m:(vv + 1) * m, v * m:(v + 1) * m] = multiplier * A_tmp[vv * m:(vv + 1) * m, +# v * m:(v + 1) * m] +# else: +# if (np.linalg.norm(A_tmp[v * m:(v + 1) * m, v * m:(v + 1) * m])) != 0: +# multiplier = 1 - gamma / (np.linalg.norm(A_tmp[v * m:(v + 1) * m, v * m:(v + 1) * m])) +# if multiplier > 0: +# A_new[v * m:(v + 1) * m, v * m:(v + 1) * m] = multiplier * A_tmp[v * m:(v + 1) * m, +# v * m:(v + 1) * m] +# +# return A_new +# +# +# from ..multiview.multiview_utils import BaseMultiviewClassifier, get_examples_views_indices +# from .additions.kernel_learning import KernelClassifier, KernelConfigGenerator, KernelGenerator +# from ..utils.hyper_parameter_search import CustomUniform, CustomRandint +# +# classifier_class_name = "MVMLClassifier" +# +# class MVMLClassifier(KernelClassifier, MVML): +# +# def __init__(self, random_state=None, reg_params=None, +# nystrom_param=1, learn_A=1, learn_w=0, n_loops=6, kernel_types="rbf_kernel", +# kernel_configs=None): +# super().__init__(random_state, kernel_types=kernel_types, +# kernel_configs=kernel_configs) +# super(BaseMultiviewClassifier, self).__init__(reg_params, +# nystrom_param, +# learn_A=learn_A, +# learn_w=learn_w, +# n_loops=n_loops) +# self.param_names = ["nystrom_param", "kernel_types", "kernel_configs", +# "learn_A", "learn_w", "n_loops", "reg_params"] +# self.distribs = [CustomUniform(), KernelGenerator(), +# KernelConfigGenerator(), CustomRandint(low=1, high=5), +# [0,1], CustomRandint(low=1, high=100), [[0.1,0.9]]] +# +# def fit(self, X, y, train_indices=None, view_indices=None): +# new_X, new_y = self._init_fit(X, y, train_indices, view_indices) +# return super(MVMLClassifier, self).fit(new_X, new_y) +# +# def predict(self, X, example_indices=None, view_indices=None): +# example_indices, view_indices = get_examples_views_indices(X, +# example_indices, +# view_indices) +# new_X = self._compute_kernels(X, example_indices, view_indices) +# print(self.extract_labels(super(MVMLClassifier, self).predict(new_X))) +# return self.extract_labels(super(MVMLClassifier, self).predict(new_X)) +# -- GitLab