diff --git a/docs/source/monomulti/monoview_classifier.ipynb b/docs/source/monomulti/monoview_classifier.ipynb index 99e590d32dabaa22cdf05f1713e27238209296f3..c388766ff2670b5883b6c7cc028990ace5640ead 100644 --- a/docs/source/monomulti/monoview_classifier.ipynb +++ b/docs/source/monomulti/monoview_classifier.ipynb @@ -85,7 +85,7 @@ "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2.0 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", @@ -97,4 +97,4 @@ }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +} diff --git a/multiview_platform/MonoMultiViewClassifiers/ExecClassif.py b/multiview_platform/MonoMultiViewClassifiers/ExecClassif.py index 6b99f6e458f25a40231c28930b4aad50c6426d18..c36e80a623c0a73e987b456691af40724acd5fff 100644 --- a/multiview_platform/MonoMultiViewClassifiers/ExecClassif.py +++ b/multiview_platform/MonoMultiViewClassifiers/ExecClassif.py @@ -27,7 +27,7 @@ __author__ = "Baptiste Bauvin" __status__ = "Prototype" # Production, Development, Prototype -def initBenchmark(CL_type, multiviewAlgos, monoviewAlgos, args): +def initBenchmark(CL_type, monoviewAlgos, multiviewAlgos, args): r"""Used to create a list of all the algorithm packages names used for the benchmark. First this function will check if the benchmark need mono- or/and multiview algorithms and adds to the right @@ -54,6 +54,7 @@ def initBenchmark(CL_type, multiviewAlgos, monoviewAlgos, args): benchmark = {"Monoview": {}, "Multiview": {}} allMultiviewPackages = [name for _, name, isPackage in pkgutil.iter_modules(['./MonoMultiViewClassifiers/MultiviewClassifiers/']) if isPackage] + if "Monoview" in CL_type: if monoviewAlgos == ['']: benchmark["Monoview"] = [name for _, name, isPackage in pkgutil.iter_modules(["./MonoMultiViewClassifiers/MonoviewClassifiers"]) diff --git a/multiview_platform/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py b/multiview_platform/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py index a67b45a2be4b4b51745d2bec353bd77c51bdd3f9..143988785ae0ba35d42763d7c43d338c12d472e5 100644 --- a/multiview_platform/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py +++ b/multiview_platform/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py @@ -59,6 +59,7 @@ def ExecMonoview(directory, X, Y, name, labelsNames, classificationIndices, KFol logging.debug("Start:\t Determine Train/Test split") X_train, y_train, X_test, y_test, X_test_multiclass = initTrainTest(X, Y, classificationIndices) + logging.debug("Info:\t Shape X_train:" + str(X_train.shape) + ", Length of y_train:" + str(len(y_train))) logging.debug("Info:\t Shape X_test:" + str(X_test.shape) + ", Length of y_test:" + str(len(y_test))) logging.debug("Done:\t Determine Train/Test split") @@ -168,7 +169,7 @@ def getHPs(classifierModule, hyperParamSearch, nIter, CL_type, X_train, y_train, def saveResults(stringAnalysis, outputFileName, full_labels_pred, y_train_pred, y_train, imagesAnalysis): logging.info(stringAnalysis) - outputTextFile = open(outputFileName + '.txt', 'w') + outputTextFile = open(outputFileName + 'summary.txt', 'w') outputTextFile.write(stringAnalysis) outputTextFile.close() np.savetxt(outputFileName + "full_pred.csv", full_labels_pred.astype(np.int16), delimiter=",") diff --git a/multiview_platform/MonoMultiViewClassifiers/Monoview/analyzeResult.py b/multiview_platform/MonoMultiViewClassifiers/Monoview/analyzeResult.py index b88fe79b1c955067cb37d4f0a5159de90a4b05cd..927301aa6d9625f841bca5df73ec1fb3c8482e13 100644 --- 
a/multiview_platform/MonoMultiViewClassifiers/Monoview/analyzeResult.py
+++ b/multiview_platform/MonoMultiViewClassifiers/Monoview/analyzeResult.py
@@ -6,7 +6,7 @@ from .. import Metrics
 
 
 def getDBConfigString(name, feat, classificationIndices, shape, classLabelsNames, KFolds):
-    learningRate = float(len(classificationIndices[0])) / len(classificationIndices[0]) + len(classificationIndices[1])
+    learningRate = float(len(classificationIndices[0])) / (len(classificationIndices[0]) + len(classificationIndices[1]))
     dbConfigString = "Database configuration : \n"
     dbConfigString += "\t- Database name : " + name + "\n"
     dbConfigString += "\t- View name : " + feat + "\t View shape : " + str(shape) + "\n"
diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CQBoost.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CQBoost.py
new file mode 100644
index 0000000000000000000000000000000000000000..b683dd013850911f296f06fe13924f322b929dd4
--- /dev/null
+++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CQBoost.py
@@ -0,0 +1,1085 @@
+import logging
+import sys
+from collections import defaultdict, OrderedDict
+from copy import deepcopy
+from functools import partial
+
+import numpy as np
+import numpy.ma as ma
+import pandas as pd
+import scipy
+from future.utils import iteritems
+from scipy.spatial import distance
+from scipy.stats import randint
+from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
+from sklearn.metrics.pairwise import rbf_kernel, linear_kernel
+from sklearn.model_selection import RandomizedSearchCV
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import LabelEncoder
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.utils.validation import check_is_fitted
+
+
+class ColumnGenerationClassifier(BaseEstimator, ClassifierMixin):
+    def __init__(self, epsilon=1e-06, n_max_iterations=None, estimators_generator=None, dual_constraint_rhs=0, save_iteration_as_hyperparameter_each=None):
+        self.epsilon = epsilon
+        self.n_max_iterations = n_max_iterations
+        self.estimators_generator = estimators_generator
+        self.dual_constraint_rhs = dual_constraint_rhs
+        self.save_iteration_as_hyperparameter_each = save_iteration_as_hyperparameter_each
+
+    def fit(self, X, y):
+        if scipy.sparse.issparse(X):
+            # logging.info('Converting to dense matrix.')
+            X = np.array(X.todense())
+
+        y[y == 0] = -1
+
+        if self.estimators_generator is None:
+            self.estimators_generator = StumpsClassifiersGenerator(n_stumps_per_attribute=10, self_complemented=True)
+
+        self.estimators_generator.fit(X, y)
+        self.classification_matrix = self._binary_classification_matrix(X)
+
+        self.infos_per_iteration_ = defaultdict(list)
+
+        m, n = self.classification_matrix.shape
+        # self.chosen_columns_ = [np.random.choice(np.arange(n)), np.random.choice(np.arange(n))]
+        self.chosen_columns_ = []
+        self.n_total_hypotheses_ = n
+
+        y_kernel_matrix = np.multiply(y.reshape((len(y), 1)), self.classification_matrix)
+
+        # Initialization
+        alpha = self._initialize_alphas(m)
+        # w = [0.5,0.5]
+        w = None
+        self.collected_weight_vectors_ = {}
+        self.collected_dual_constraint_violations_ = {}
+
+        for k in range(min(n, self.n_max_iterations if self.n_max_iterations is not None else np.inf)):
+            # Find worst weak hypothesis given alpha.
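+            # "Worst" here means the hypothesis maximizing the edge alpha^T (y * H) over the columns not yet
+            # chosen, i.e. the most violated constraint of the dual; selected columns are masked out just below.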
+            h_values = ma.array(np.squeeze(np.array(alpha.T.dot(y_kernel_matrix).T)), fill_value=-np.inf)
+            h_values[self.chosen_columns_] = ma.masked
+            worst_h_index = ma.argmax(h_values)
+            # logging.info("Adding voter {} to the columns, value = {}".format(worst_h_index, h_values[worst_h_index]))
+
+            # Check for optimal solution. We ensure at least one complete iteration is done as the initialization
+            # values might provide a degenerate initial solution.
+            if h_values[worst_h_index] <= self.dual_constraint_rhs + self.epsilon and len(self.chosen_columns_) > 0:
+                break
+
+            # Append the weak hypothesis.
+            self.chosen_columns_.append(worst_h_index)
+
+            # Solve restricted master for new costs.
+            w, alpha = self._restricted_master_problem(y_kernel_matrix[:, self.chosen_columns_], previous_w=w, previous_alpha=alpha)
+
+            # We collect iteration information for later evaluation.
+            if self.save_iteration_as_hyperparameter_each is not None:
+                if (k + 1) % self.save_iteration_as_hyperparameter_each == 0:
+                    self.collected_weight_vectors_[k] = deepcopy(w)
+                    self.collected_dual_constraint_violations_[k] = h_values[worst_h_index] - self.dual_constraint_rhs
+
+        self.weights_ = w
+        self.estimators_generator.estimators_ = self.estimators_generator.estimators_[self.chosen_columns_]
+
+        self.learner_info_ = {}
+        self.learner_info_.update(n_nonzero_weights=np.sum(np.asarray(self.weights_) > 1e-12))
+        self.learner_info_.update(n_generated_columns=len(self.chosen_columns_))
+        y[y == -1] = 0
+        return self
+
+    def predict(self, X):
+        check_is_fitted(self, 'weights_')
+
+        if scipy.sparse.issparse(X):
+            logging.warning('Converting sparse matrix to dense matrix.')
+            X = np.array(X.todense())
+
+        classification_matrix = self._binary_classification_matrix(X)
+
+        margins = np.squeeze(np.asarray(np.dot(classification_matrix, self.weights_)))
+        signs_array = np.array([int(x) for x in sign(margins)])
+        signs_array[signs_array == -1] = 0
+        return signs_array
+
+    def _binary_classification_matrix(self, X):
+        probas = self._collect_probas(X)
+        predicted_labels = np.argmax(probas, axis=2)
+        predicted_labels[predicted_labels == 0] = -1
+        values = np.max(probas, axis=2)
+        return (predicted_labels * values).T
+
+    def _collect_probas(self, X):
+        return np.asarray([clf.predict_proba(X) for clf in self.estimators_generator.estimators_])
+
+    def _restricted_master_problem(self, y_kernel_matrix):
+        raise NotImplementedError("Restricted master problem not implemented.")
+
+    def _initialize_alphas(self, n_examples):
+        raise NotImplementedError("Alpha weights initialization function is not implemented.")
+
+    def evaluate_metrics(self, X, y, metrics_list=None, functions_list=None):
+        if metrics_list is None:
+            metrics_list = [zero_one_loss, zero_one_loss_per_example]
+
+        if functions_list is None:
+            functions_list = []
+
+        # Predict, evaluate metrics.
+        classification_matrix = self._binary_classification_matrix(X)
+        predictions = sign(classification_matrix.dot(self.weights_))
+
+        if self.save_iteration_as_hyperparameter_each is None:
+            metrics_results = {}
+            for metric in metrics_list:
+                metrics_results[metric.__name__] = metric(y, predictions)
+
+            metrics_dataframe = ResultsDataFrame([metrics_results])
+            return metrics_dataframe
+
+        # If we collected iteration information to add a hyperparameter, we add an index with the hyperparameter name
+        # and return a ResultsDataFrame containing one row per hyperparameter value.
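+        # (Each weight vector snapshot collected at iteration t is re-scored on the first t + 1 columns of the
+        # classification matrix, so every row of the resulting frame corresponds to one "n_iterations" value.)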
+        metrics_dataframe = ResultsDataFrame()
+        for t, weights in iteritems(self.collected_weight_vectors_):
+            predictions = sign(classification_matrix[:, :t + 1].dot(weights))
+            metrics_results = {metric.__name__: metric(y, predictions) for metric in metrics_list}
+            for function in functions_list:
+                metrics_results[function.__name__] = function(classification_matrix[:, :t + 1], y, weights)
+
+            # We add other collected information.
+            metrics_results['chosen_columns'] = self.chosen_columns_[t]
+            metrics_results['dual_constraint_violation'] = self.collected_dual_constraint_violations_[t]
+
+            metrics_dataframe = metrics_dataframe.append(ResultsDataFrame([metrics_results], index=[t]))
+
+        metrics_dataframe.index.name = 'hp__n_iterations'
+        return metrics_dataframe
+
+
+class CqBoostClassifier(ColumnGenerationClassifier):
+    def __init__(self, mu=0.001, epsilon=1e-08, n_max_iterations=None, estimators_generator=None, save_iteration_as_hyperparameter_each=None):
+        super(CqBoostClassifier, self).__init__(epsilon, n_max_iterations, estimators_generator, dual_constraint_rhs=0,
+                                                save_iteration_as_hyperparameter_each=save_iteration_as_hyperparameter_each)
+        # TODO: Check the value of nu (dual_constraint_rhs) at initialization; it is ignored anyway, since the
+        # main loop cannot be exited with only one voter.
+        self.mu = mu
+
+    def _restricted_master_problem(self, y_kernel_matrix, previous_w=None, previous_alpha=None):
+        n_examples, n_hypotheses = y_kernel_matrix.shape
+
+        m_eye = np.eye(n_examples)
+        m_ones = np.ones((n_examples, 1))
+
+        qp_a = np.vstack((np.hstack((-y_kernel_matrix, m_eye)),
+                          np.hstack((np.ones((1, n_hypotheses)), np.zeros((1, n_examples))))))
+
+        qp_b = np.vstack((np.zeros((n_examples, 1)),
+                          np.array([1.0]).reshape((1, 1))))
+
+        qp_g = np.vstack((np.hstack((-np.eye(n_hypotheses), np.zeros((n_hypotheses, n_examples)))),
+                          np.hstack((np.zeros((1, n_hypotheses)), -1.0 / n_examples * m_ones.T))))
+
+        qp_h = np.vstack((np.zeros((n_hypotheses, 1)),
+                          np.array([-self.mu]).reshape((1, 1))))
+
+        qp = ConvexProgram()
+        qp.quadratic_func = 2.0 / n_examples * np.vstack((np.hstack((np.zeros((n_hypotheses, n_hypotheses)), np.zeros((n_hypotheses, n_examples)))),
+                                                          np.hstack((np.zeros((n_examples, n_hypotheses)), m_eye))))
+
+        qp.add_equality_constraints(qp_a, qp_b)
+        qp.add_inequality_constraints(qp_g, qp_h)
+
+        if previous_w is not None:
+            qp.initial_values = np.append(previous_w, [0])
+
+        try:
+            solver_result = qp.solve(abstol=1e-10, reltol=1e-10, feastol=1e-10, return_all_information=True)
+            w = np.asarray(np.array(solver_result['x']).T[0])[:n_hypotheses]
+
+            # The alphas are the Lagrange multipliers associated with the equality constraints (returned as the y vector in CVXOPT).
+            dual_variables = np.asarray(np.array(solver_result['y']).T[0])
+            alpha = dual_variables[:n_examples]
+
+            # Set the dual constraint right-hand side to be equal to the last Lagrange multiplier (nu).
+            # Hack: do not change nu if the QP didn't fully solve...
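+            # ('dual slack' is one of the convergence fields returned by CVXOPT's qp(); a value near zero means
+            # the solver reached (near-)optimality, so nu can safely be refreshed from the multipliers.)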
+            if solver_result['dual slack'] <= 1e-8:
+                self.dual_constraint_rhs = dual_variables[-1]
+                # logging.info('Updating dual constraint rhs: {}'.format(self.dual_constraint_rhs))
+
+        except Exception:
+            logging.warning('QP Solving failed at iteration {}.'.format(n_hypotheses))
+            if previous_w is not None:
+                w = np.append(previous_w, [0])
+            else:
+                w = np.array([1.0 / n_hypotheses] * n_hypotheses)
+
+            if previous_alpha is not None:
+                alpha = previous_alpha
+            else:
+                alpha = self._initialize_alphas(n_examples)
+
+        return w, alpha
+
+    def _initialize_alphas(self, n_examples):
+        return 1.0 / n_examples * np.ones((n_examples,))
+
+
+class CQBoost(CqBoostClassifier):
+
+    def __init__(self, random_state, **kwargs):
+        super(CQBoost, self).__init__(
+            mu=kwargs['mu'],
+            epsilon=kwargs['epsilon'],
+            n_max_iterations=kwargs['n_max_iterations'],
+        )
+
+    def canProbas(self):
+        """Used to know if the classifier can return label probabilities"""
+        return False
+
+    def paramsToSet(self, nIter=1):
+        """Used for weighted linear early fusion to generate random search sets"""
+        paramsSet = []
+        for _ in range(nIter):
+            paramsSet.append({"mu": 0.001,
+                              "epsilon": 1e-08,
+                              "n_max_iterations": None})
+        return paramsSet
+
+    def getKWARGS(self, args):
+        """Used to format kwargs for the parsed args"""
+        kwargsDict = {}
+        kwargsDict['mu'] = 0.001
+        kwargsDict['epsilon'] = 1e-08
+        kwargsDict['n_max_iterations'] = None
+        return kwargsDict
+
+    def genPipeline(self):
+        return Pipeline([('classifier', CqBoostClassifier())])
+
+    def genParamsDict(self, randomState):
+        return {"classifier__mu": [0.001],
+                "classifier__epsilon": [1e-08],
+                "classifier__n_max_iterations": [None]}
+
+    def genBestParams(self, detector):
+        return {"mu": detector.best_params_["classifier__mu"],
+                "epsilon": detector.best_params_["classifier__epsilon"],
+                "n_max_iterations": detector.best_params_["classifier__n_max_iterations"]}
+
+    def genParamsFromDetector(self, detector):
+        nIter = len(detector.cv_results_['param_classifier__mu'])
+        return [("mu", np.array([0.001 for _ in range(nIter)])),
+                ("epsilon", np.array(detector.cv_results_['param_classifier__epsilon'])),
+                ("n_max_iterations", np.array(detector.cv_results_['param_classifier__n_max_iterations']))]
+
+    def getConfig(self, config):
+        if type(config) is not dict:  # Used in late fusion when config is a classifier
+            return "\n\t\t- CQBoost with mu : " + str(config.mu) + ", epsilon : " + str(
+                config.epsilon) + ", n_max_iterations : " + str(config.n_max_iterations)
+        else:
+            return "\n\t\t- CQBoost with mu : " + str(config["mu"]) + ", epsilon : " + str(
+                config["epsilon"]) + ", n_max_iterations : " + str(config["n_max_iterations"])
+
+    def getInterpret(self, classifier, directory):
+        interpretString = ""
+        return interpretString
+
+
+def canProbas():
+    return False
+
+
+def fit(DATASET, CLASS_LABELS, randomState, NB_CORES=1, **kwargs):
+    """Used to fit the monoview classifier with the args stored in kwargs"""
+    classifier = CqBoostClassifier(mu=kwargs['mu'],
+                                   epsilon=kwargs['epsilon'],
+                                   n_max_iterations=kwargs["n_max_iterations"],)
+                                   # random_state=randomState)
+    classifier.fit(DATASET, CLASS_LABELS)
+    return classifier
+
+
+def paramsToSet(nIter, randomState):
+    """Used for weighted linear early fusion to generate random search sets"""
+    paramsSet = []
+    for _ in range(nIter):
+        paramsSet.append({"mu": randomState.choice([0.001, 0.002]),
+                          "epsilon": randomState.choice([1e-08, 2e-08]),
+                          "n_max_iterations": None})
+    return paramsSet
+
+
+def getKWARGS(args):
+    """Used to format kwargs for the parsed args"""
parsed args""" + kwargsDict = {} + kwargsDict['mu'] = args.CQB_mu + kwargsDict['epsilon'] = args.CQB_epsilon + kwargsDict['n_max_iterations'] = None + return kwargsDict + + +def genPipeline(): + return Pipeline([('classifier', CqBoostClassifier())]) + + +def genParamsDict(randomState): + return {"classifier__mu": [0.001, 0.002], + "classifier__epsilon": [1e-08, 2e-08], + "classifier__n_max_iterations": [None]} + + +def genBestParams(detector): + return {"mu": detector.best_params_["classifier__mu"], + "epsilon": detector.best_params_["classifier__epsilon"], + "n_max_iterations": detector.best_params_["classifier__n_max_iterations"]} + + +def genParamsFromDetector(detector): + nIter = len(detector.cv_results_['param_classifier__mu']) + return [("mu", np.array([0.001 for _ in range(nIter)])), + ("epsilon", np.array(detector.cv_results_['param_classifier__epsilon'])), + ("n_max_iterations", np.array(detector.cv_results_['param_classifier__n_max_iterations']))] + + +def getConfig(config): + if type(config) is not dict: # Used in late fusion when config is a classifier + return "\n\t\t- CQBoost with mu : " + str(config.mu) + ", epsilon : " + str( + config.epsilon) + ", n_max_iterations : " + str(config.n_max_iterations) + else: + return "\n\t\t- CQBoost with mu : " + str(config["mu"]) + ", epsilon : " + str( + config["epsilon"]) + ", n_max_iterations : " + str(config["n_max_iterations"]) + + +def getInterpret(classifier, directory): + dotted = False + interpretString = "\t CQBoost permformed classification with weights : \n" + interpretString += np.array2string(classifier.weights_, precision=4, separator=',', suppress_small=True) + interpretString += "\n \t It used {} iterations to converge".format(len(classifier.weights_)) + if len(classifier.weights_) == classifier.n_max_iterations: + interpretString += ", and used all available iterations, " + else: + dotted = True + interpretString += "." + if len(classifier.weights_) == classifier.n_total_hypotheses_: + interpretString += ", and all the voters have been used." + elif not dotted: + interpretString += "." + interpretString += "\n\t Selected voters : \n" + interpretString += str(classifier.chosen_columns_) + interpretString += "\n\t and they voted : \n" + interpretString += np.array2string(classifier.classification_matrix[:, classifier.chosen_columns_], precision=4, separator=',', suppress_small=True) + np.savetxt(directory+"voters.csv", classifier.classification_matrix[:, classifier.chosen_columns_], delimiter=',') + np.savetxt(directory + "weights.csv", classifier.weights_, delimiter=',') + return interpretString + + + + + +def _as_matrix(element): + """ Utility function to convert "anything" to a Numpy matrix. + """ + # If a scalar, return a 1x1 matrix. + if len(np.shape(element)) == 0: + return np.matrix([[element]], dtype=float) + + # If a nd-array vector, return a column matrix. + elif len(np.shape(element)) == 1: + matrix = np.matrix(element, dtype=float) + if np.shape(matrix)[1] != 1: + matrix = matrix.T + return matrix + + return np.matrix(element, dtype=float) + + +def _as_column_matrix(array_like): + """ Utility function to convert any array to a column Numpy matrix. + """ + matrix = _as_matrix(array_like) + if 1 not in np.shape(matrix): + raise ValueError("_as_column_vector: input must be a vector") + + if np.shape(matrix)[0] == 1: + matrix = matrix.T + + return matrix + + +def _as_line_matrix(array_like): + """ Utility function to convert any array to a line Numpy matrix. 
+ """ + matrix = _as_matrix(array_like) + if 1 not in np.shape(matrix): + raise ValueError("_as_column_vector: input must be a vector") + + if np.shape(matrix)[1] == 1: + matrix = matrix.T + + return matrix + + +class ConvexProgram(object): + """ + Encapsulates a quadratic program of the following form: + + minimize (1/2)*x'*P*x + q'*x + subject to G*x <= h + A*x = b. + + + or a linear program of the following form: + + minimize c'*x + subject to G*x <= h + A*x = b + """ + def __init__(self): + self._quadratic_func = None + self._linear_func = None + self._inequality_constraints_matrix = None + self._inequality_constraints_values = None + self._equality_constraints_matrix = None + self._equality_constraints_values = None + self._lower_bound_values = None + self._upper_bound_values = None + self._n_variables = None + + @property + def n_variables(self): + return self._n_variables + + @property + def quadratic_func(self): + return self._quadratic_func + + @quadratic_func.setter + def quadratic_func(self, quad_matrix): + quad_matrix = _as_matrix(quad_matrix) + n_lines, n_columns = np.shape(quad_matrix) + assert(n_lines == n_columns) + + if self._linear_func is not None: + assert(np.shape(quad_matrix)[0] == self._n_variables) + else: + self._n_variables = n_lines + + self._quadratic_func = quad_matrix + + @property + def linear_func(self): + return self._linear_func + + @linear_func.setter + def linear_func(self, lin_vector): + if lin_vector is not None: + lin_vector = _as_column_matrix(lin_vector) + + if self._quadratic_func is not None: + assert(np.shape(lin_vector)[0] == self._n_variables) + + else: + self._n_variables = np.shape(lin_vector)[0] + + self._linear_func = lin_vector + + def add_inequality_constraints(self, inequality_matrix, inequality_values): + if inequality_matrix is None: + logging.info("Empty inequality constraint: ignoring!") + return + + self._assert_objective_function_is_set() + + if 1 in np.shape(inequality_matrix) or len(np.shape(inequality_matrix)) == 1: + inequality_matrix = _as_line_matrix(inequality_matrix) + else: + inequality_matrix = _as_matrix(inequality_matrix) + + inequality_values = _as_column_matrix(inequality_values) + assert np.shape(inequality_matrix)[1] == self._n_variables + assert np.shape(inequality_values)[1] == 1 + + if self._inequality_constraints_matrix is None: + self._inequality_constraints_matrix = inequality_matrix + else: + self._inequality_constraints_matrix = np.append(self._inequality_constraints_matrix, + inequality_matrix, axis=0) + + if self._inequality_constraints_values is None: + self._inequality_constraints_values = inequality_values + else: + self._inequality_constraints_values = np.append(self._inequality_constraints_values, + inequality_values, axis=0) + + def add_equality_constraints(self, equality_matrix, equality_values): + if equality_matrix is None: + logging.info("Empty equality constraint: ignoring!") + return + + self._assert_objective_function_is_set() + + if 1 in np.shape(equality_matrix) or len(np.shape(equality_matrix)) == 1: + equality_matrix = _as_line_matrix(equality_matrix) + else: + equality_matrix = _as_matrix(equality_matrix) + + equality_values = _as_matrix(equality_values) + assert np.shape(equality_matrix)[1] == self._n_variables + assert np.shape(equality_values)[1] == 1 + + if self._equality_constraints_matrix is None: + self._equality_constraints_matrix = equality_matrix + else: + self._equality_constraints_matrix = np.append(self._equality_constraints_matrix, + equality_matrix, axis=0) + + if 
self._equality_constraints_values is None: + self._equality_constraints_values = equality_values + else: + self._equality_constraints_values = np.append(self._equality_constraints_values, + equality_values, axis=0) + + def add_lower_bound(self, lower_bound): + if lower_bound is not None: + self._assert_objective_function_is_set() + self._lower_bound_values = np.array([lower_bound] * self._n_variables) + + def add_upper_bound(self, upper_bound): + if upper_bound is not None: + self._assert_objective_function_is_set() + self._upper_bound_values = np.array([upper_bound] * self._n_variables) + + def _convert_bounds_to_inequality_constraints(self): + self._assert_objective_function_is_set() + + if self._lower_bound_values is not None: + c_matrix = [] + for i in range(self._n_variables): + c_line = [0] * self._n_variables + c_line[i] = -1.0 + c_matrix.append(c_line) + + c_vector = _as_column_matrix(self._lower_bound_values) + self._lower_bound_values = None + self.add_inequality_constraints(np.matrix(c_matrix).T, c_vector) + + if self._upper_bound_values is not None: + c_matrix = [] + for i in range(self._n_variables): + c_line = [0] * self._n_variables + c_line[i] = 1.0 + c_matrix.append(c_line) + + c_vector = _as_column_matrix(self._upper_bound_values) + self._upper_bound_values = None + self.add_inequality_constraints(np.matrix(c_matrix).T, c_vector) + + def _convert_to_cvxopt_matrices(self): + from cvxopt import matrix as cvxopt_matrix + + if self._quadratic_func is not None: + self._quadratic_func = cvxopt_matrix(self._quadratic_func) + + if self._linear_func is not None: + self._linear_func = cvxopt_matrix(self._linear_func) + else: + # CVXOPT needs this vector to be set even if it is not used, so we put zeros in it! + self._linear_func = cvxopt_matrix(np.zeros((self._n_variables, 1))) + + if self._inequality_constraints_matrix is not None: + self._inequality_constraints_matrix = cvxopt_matrix(self._inequality_constraints_matrix) + + if self._inequality_constraints_values is not None: + self._inequality_constraints_values = cvxopt_matrix(self._inequality_constraints_values) + + if self._equality_constraints_matrix is not None: + self._equality_constraints_matrix = cvxopt_matrix(self._equality_constraints_matrix) + + if self._equality_constraints_values is not None: + self._equality_constraints_values = cvxopt_matrix(self._equality_constraints_values) + + def _assert_objective_function_is_set(self): + assert self._n_variables is not None + + def solve(self, solver="cvxopt", feastol=1e-7, abstol=1e-7, reltol=1e-6, return_all_information=False): + + # Some solvers are very verbose, and we don't want them to pollute STDOUT or STDERR. 
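+        # (The streams are saved here and restored in the `finally` clause below, so a solver failure cannot leave
+        # the process with redirected output; the actual redirection to the logging module is commented out
+        # pending the "TODO: Repair" block.)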
+ original_stdout = sys.stdout + original_stderr = sys.stderr + + ret = None + + # TODO: Repair + # if solver == "cvxopt": + # stdout_logger = logging.getLogger('CVXOPT') + # sl = StreamToLogger(stdout_logger, logging.DEBUG) + # sys.stdout = sl + + # stderr_logger = logging.getLogger('CVXOPT') + # sl = StreamToLogger(stderr_logger, logging.WARNING) + # sys.stderr = sl + + try: + if solver == "cvxopt": + from cvxopt.solvers import qp, lp, options + options['feastol'] = feastol + options['abstol'] = abstol + options['reltol'] = reltol + options['show_progress'] = False + + self._convert_bounds_to_inequality_constraints() + self._convert_to_cvxopt_matrices() + + if self._quadratic_func is not None: + ret = qp(self.quadratic_func, self.linear_func, self._inequality_constraints_matrix, + self._inequality_constraints_values, self._equality_constraints_matrix, + self._equality_constraints_values) + + else: + ret = lp(self.linear_func, + G=self._inequality_constraints_matrix, + h=self._inequality_constraints_values, + A=self._equality_constraints_matrix, + b=self._equality_constraints_values) + + # logging.info("Primal objective value = {}".format(ret['primal objective'])) + # logging.info("Dual objective value = {}".format(ret['dual objective'])) + + if not return_all_information: + ret = np.asarray(np.array(ret['x']).T[0]) + + elif solver == "cplex": + import cplex + p = cplex.Cplex() + p.objective.set_sense(p.objective.sense.minimize) + + # This is ugly. CPLEX wants a list of lists of lists. First dimension represents the lines of the QP + # matrix. Second dimension contains a pair of two elements: the indices of the variables in play (all of + # them...), and the values (columns of the QP matrix). + names = [str(x) for x in range(self._n_variables)] + p.variables.add(names=names) + + if self.quadratic_func is not None: + p_matrix = [] + for line in self._quadratic_func: + p_matrix.append([names, line.tolist()[0]]) + + p.objective.set_quadratic(p_matrix) + + if self.linear_func is not None: + p.objective.set_linear(zip(names, + np.asarray(self.linear_func.T).reshape(self.n_variables,).tolist())) + + if self._inequality_constraints_matrix is not None: + inequality_linear = [] + for line in self._inequality_constraints_matrix: + inequality_linear.append([names, line.tolist()[0]]) + p.linear_constraints.add(lin_expr=inequality_linear, + rhs=np.asarray(self._inequality_constraints_values.T).tolist()[0], + senses="L"*len(self._inequality_constraints_values)) + + if self._equality_constraints_matrix is not None: + equality_linear = [] + for line in self._equality_constraints_matrix: + equality_linear.append([names, line.tolist()[0]]) + p.linear_constraints.add(lin_expr=equality_linear, + rhs=np.asarray(self._equality_constraints_values.T).tolist()[0], + senses="E"*len(self._equality_constraints_values)) + + if self._lower_bound_values is not None: + p.variables.set_lower_bounds(zip(names, self._lower_bound_values)) + + if self._upper_bound_values is not None: + p.variables.set_upper_bounds(zip(names, self._upper_bound_values)) + + p.solve() + + logging.info("Solution status = {} : {}".format(p.solution.get_status(), + p.solution.status[p.solution.get_status()])) + logging.info("Solution value = {}".format(p.solution.get_objective_value())) + + if not return_all_information: + ret = np.array(p.solution.get_values()) + else: + ret = {'primal': np.array(p.solution.get_values()), + 'dual': np.array(p.solution.get_dual_values())} + + elif solver == "pycpx": + # This shows how easy it is to use pycpx. 
However, it is much slower (as it is more versatile!). + + import pycpx + model = pycpx.CPlexModel(verbosity=2) + q = model.new(self.n_variables) + + if self._inequality_constraints_matrix is not None: + model.constrain(self._inequality_constraints_matrix * q <= self._inequality_constraints_values) + if self._equality_constraints_matrix is not None: + model.constrain(self._equality_constraints_matrix * q == self._equality_constraints_values) + if self._lower_bound_values is not None: + model.constrain(q >= self._lower_bound_values) + if self._upper_bound_values is not None: + model.constrain(q <= self._upper_bound_values) + + value = model.minimize(0.5 * q.T * self._quadratic_func * q + self.linear_func.T * q) + + logging.info("Solution value = {}".format(value)) + + if not return_all_information: + ret = np.array(model[q]) + else: + ret = model + + except: + raise + + finally: + sys.stdout = original_stdout + sys.stderr = original_stderr + + return ret + + + + + + +class DecisionStumpClassifier(BaseEstimator, ClassifierMixin): + """Generic Attribute Threshold Binary Classifier + + Attributes + ---------- + attribute_index : int + The attribute to consider for the classification. + threshold : float + The threshold value for classification rule. + direction : int, optional + A multiplicative constant (1 or -1) to choose the "direction" of the stump. Defaults to 1. If -1, the stump + will predict the "negative" class (generally -1 or 0), and if 1, the stump will predict the second class (generally 1). + + """ + def __init__(self, attribute_index, threshold, direction=1): + super(DecisionStumpClassifier, self).__init__() + self.attribute_index = attribute_index + self.threshold = threshold + self.direction = direction + + def fit(self, X, y): + # Only verify that we are in the binary classification setting, with support for transductive learning. + if isinstance(y, np.ma.MaskedArray): + self.classes_ = np.unique(y[np.logical_not(y.mask)]) + else: + self.classes_ = np.unique(y) + + # This label encoder is there for the predict function to be able to return any two classes that were used + # when fitting, for example {-1, 1} or {0, 1}. + self.le_ = LabelEncoder() + self.le_.fit(self.classes_) + self.classes_ = self.le_.classes_ + + assert len(self.classes_) == 2, "DecisionStumpsVoter only supports binary classification" + return self + + def predict(self, X): + """Returns the output of the classifier, on a sample X. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Training vectors, where n_samples is the number of samples and + n_features is the number of features. + + Returns + ------- + predictions : array-like, shape = [n_samples] + Predicted class labels. + + """ + check_is_fitted(self, 'classes_') + return self.le_.inverse_transform(np.argmax(self.predict_proba(X), axis=1)) + + def predict_proba(self, X): + """Compute probabilities of possible outcomes for samples in X. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Training vectors, where n_samples is the number of samples and + n_features is the number of features. + + Returns + ------- + avg : array-like, shape = [n_samples, n_classes] + Weighted average probability for each class per sample. 
+ + """ + check_is_fitted(self, 'classes_') + X = np.asarray(X) + probas = np.zeros((X.shape[0], 2)) + positive_class = np.argwhere(X[:, self.attribute_index] > self.threshold) + negative_class = np.setdiff1d(range(X.shape[0]), positive_class) + probas[positive_class, 1] = 1.0 + probas[negative_class, 0] = 1.0 + + if self.direction == -1: + probas = 1 - probas + + return probas + + def reverse_decision(self): + self.direction *= -1 + + +class ClassifiersGenerator(BaseEstimator, TransformerMixin): + """Base class to create a set of voters using training samples, and then transform a set of examples in + the voters' output space. + + Attributes + ---------- + self_complemented : bool, optional + Whether or not a binary complement voter must be generated for each voter. Defaults to False. + voters : ndarray of voter functions + Once fit, contains the voter functions. + + """ + def __init__(self, self_complemented=False): + super(ClassifiersGenerator, self).__init__() + self.self_complemented = self_complemented + + def fit(self, X, y=None): + """Generates the voters using training samples. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Input data on which to base the voters. + y : ndarray of shape (n_labeled_samples,), optional + Input labels, usually determines the decision polarity of each voter. + + Returns + ------- + self + + """ + raise NotImplementedError + + def transform(self, X): + """Transforms the input points in a matrix of classification, using previously learned voters. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Input data to classify. + + Returns + ------- + ndarray of shape (n_samples, n_voters) + The voters' decision on each example. + + """ + check_is_fitted(self, 'estimators_') + return np.array([voter.predict(X) for voter in self.estimators_]).T + +class StumpsClassifiersGenerator(ClassifiersGenerator): + """Decision Stump Voters transformer. + + Parameters + ---------- + n_stumps_per_attribute : int, optional + Determines how many decision stumps will be created for each attribute. Defaults to 10. + No stumps will be created for attributes with only one possible value. + self_complemented : bool, optional + Whether or not a binary complement voter must be generated for each voter. Defaults to False. + + """ + def __init__(self, n_stumps_per_attribute=10, self_complemented=False): + super(StumpsClassifiersGenerator, self).__init__(self_complemented) + self.n_stumps_per_attribute = n_stumps_per_attribute + + def fit(self, X, y): + """Fits Decision Stump voters on a training set. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Input data on which to base the voters. + y : ndarray of shape (n_labeled_samples,), optional + Only used to ensure that we are in the binary classification setting. 
+
+        Returns
+        -------
+        self
+
+        """
+        minimums = np.min(X, axis=0)
+        maximums = np.max(X, axis=0)
+        ranges = (maximums - minimums) / (self.n_stumps_per_attribute + 1)
+
+        self.estimators_ = [DecisionStumpClassifier(i, minimums[i] + ranges[i] * stump_number, 1).fit(X, y)
+                            for i in range(X.shape[1]) for stump_number in range(1, self.n_stumps_per_attribute + 1)
+                            if ranges[i] != 0]
+
+        if self.self_complemented:
+            self.estimators_ += [DecisionStumpClassifier(i, minimums[i] + ranges[i] * stump_number, -1).fit(X, y)
+                                 for i in range(X.shape[1]) for stump_number in range(1, self.n_stumps_per_attribute + 1)
+                                 if ranges[i] != 0]
+
+        self.estimators_ = np.asarray(self.estimators_)
+        return self
+
+def sign(array):
+    """Computes the elementwise sign of all elements of an array. The sign function returns -1 if x <= 0 and 1 if x > 0.
+    Note that numpy's sign function can return 0, which is not desirable in most cases in Machine Learning algorithms.
+
+    Parameters
+    ----------
+    array : array-like
+        Input values.
+
+    Returns
+    -------
+    ndarray
+        An array with the signs of input elements.
+
+    """
+    signs = np.sign(array)
+
+    signs[array == 0] = -1
+    return signs
+
+
+def zero_one_loss(y_target, y_estimate, confidences=1):
+    if len(y_target) == 0:
+        return 0.0
+    return np.mean(y_target != y_estimate)
+
+
+def zero_one_loss_per_example(y_target, y_estimate, confidences=1):
+    if len(y_target) == 0:
+        return 0.0
+    return (y_target != y_estimate).astype(int)
+
+
+class ResultsDataFrame(pd.DataFrame):
+    """A ResultsDataFrame is a DataFrame with the following information:
+
+    - A 'dataset' column that contains the dataset name
+    - Hyperparameter columns, named 'hp__HPNAME', where HPNAME is the name of the hyperparameter
+    - Columns containing information that depends on the dataset and hyperparameters, for example the risk.
+
+    """
+    @property
+    def datasets_list(self):
+        """Returns the sorted list of datasets.
+
+        """
+        return sorted(set(self['dataset']))
+
+    @property
+    def hyperparameters_list(self):
+        """Returns a sorted list of hyperparameter names, without the 'hp__' prefix.
+
+        """
+        return sorted(column.split('hp__')[1] for column in self.columns if column.startswith('hp__'))
+
+    @property
+    def hyperparameters_list_with_prefix(self):
+        return sorted(column for column in self.columns if column.startswith('hp__'))
+
+    @property
+    def metrics_list(self):
+        return sorted(column for column in self.columns if not column.startswith('hp__') and column != 'dataset')
+
+    @property
+    def hyperparameters_with_values(self):
+        """Returns a dictionary that contains the hyperparameter names (without the 'hp__' prefix), and
+        associated values that are present in the DataFrame.
+
+        """
+        hyperparameters = [column for column in self.columns if column.startswith('hp__')]
+
+        hyperparameters_dict = {}
+        tmp_dict = self[hyperparameters].to_dict()
+
+        for key, value in iteritems(tmp_dict):
+            hyperparameters_dict[key.split('hp__')[1]] = list(value.values())[0] if len(value) == 1 else sorted(set(value.values()))
+
+        return hyperparameters_dict
+
+    @property
+    def hyperparameters_with_values_per_dataset(self):
+        """Returns a dictionary of dictionaries that contains for each dataset, the hyperparameter names (without the
+        'hp__' prefix), and associated values that are present in the DataFrame.
+ + """ + hyperparameters = [column for column in self.columns if column.startswith('hp__')] + + hyperparameters_dict = {} + for dataset in self.datasets_list: + tmp_dict = self[self.dataset == dataset][hyperparameters].to_dict() + hyperparameters_dict[dataset] = {} + + for key, value in iteritems(tmp_dict): + hyperparameters_dict[dataset][key.split('hp__')[1]] = list(value.values())[0] if len(value) == 1 else sorted(value.values()) + + return hyperparameters_dict + + def results_optimizing_metric(self, metric_to_optimize='cv_mean__valid__zero_one_loss', minimize=True, tie_breaking_functions_ordered_dict=None): + function = min if minimize else max + + # We extract all the rows that have the best value for the metric to optimize. + optimal_results = self[self.groupby('dataset', sort=False)[metric_to_optimize].transform(function) == self[metric_to_optimize]] + + # We tie the breaks by applying the tie breaking functions (in the order of the dictionary). If hyperparameters are missing, we simply + # use the median for each hyperparameter, in a fixed (reproduceable) order. + if tie_breaking_functions_ordered_dict is None: + tie_breaking_functions_ordered_dict = OrderedDict() + else: + # Avoid side effects and ensures that the dictionary is an OrderedDict before we add missing hyperparameters. + tie_breaking_functions_ordered_dict = OrderedDict(tie_breaking_functions_ordered_dict.copy()) + + for hyperparameter in sorted(self.hyperparameters_list): + if hyperparameter not in tie_breaking_functions_ordered_dict.keys(): + tie_breaking_functions_ordered_dict[hyperparameter] = np.median + + for hyperparameter, tie_breaking_function in iteritems(tie_breaking_functions_ordered_dict): + optimal_results = optimal_results[optimal_results.groupby('dataset')['hp__' + hyperparameter].transform(partial(get_optimal_value_in_list, tie_breaking_function)) == optimal_results['hp__' + hyperparameter]] + + return ResultsDataFrame(optimal_results) + + def get_dataframe_with_metrics_as_one_column(self, metrics_to_keep=None): + new_dataframe = ResultsDataFrame() + + if metrics_to_keep is None: + metrics_to_keep = self.metrics_list + + for metric in metrics_to_keep: + columns = self.hyperparameters_list_with_prefix + [metric] + if 'dataset' in self: + columns.append('dataset') + + tmp = self.loc[:, columns] + tmp.columns = [c if c != metric else 'value' for c in tmp.columns] + tmp.loc[:, 'metric'] = metric + new_dataframe = new_dataframe.append(tmp, ignore_index=True) + + return new_dataframe + + +def get_optimal_value_in_list(optimum_function, values_list): + """Given a list of values and an optimal value, returns the value from the list that is the closest to the optimum, + given by optimum_function applied to the same list. 
+
+    >>> get_optimal_value_in_list(np.median, [2, 4, 5, 6])
+    4
+
+    """
+    values_list = sorted(list(values_list))
+    return values_list[np.argmin(np.array([scipy.spatial.distance.euclidean(value, optimum_function(values_list)) for value in values_list]))]
diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CQBoostv2.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CQBoostv2.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2b2b0d8cb1be0dc3e4f86c2a36a87451d452181
--- /dev/null
+++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CQBoostv2.py
@@ -0,0 +1,1103 @@
+import logging
+import sys
+from collections import defaultdict, OrderedDict
+from copy import deepcopy
+from functools import partial
+
+import numpy as np
+import numpy.ma as ma
+import pandas as pd
+import scipy
+from future.utils import iteritems
+from scipy.spatial import distance
+from scipy.stats import randint
+from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin
+from sklearn.metrics.pairwise import rbf_kernel, linear_kernel
+from sklearn.model_selection import RandomizedSearchCV
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import LabelEncoder
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.utils.validation import check_is_fitted
+
+
+class ColumnGenerationClassifierv2(BaseEstimator, ClassifierMixin):
+    def __init__(self, epsilon=1e-06, n_max_iterations=None, estimators_generator=None, dual_constraint_rhs=0, save_iteration_as_hyperparameter_each=None):
+        self.epsilon = epsilon
+        self.n_max_iterations = n_max_iterations
+        self.estimators_generator = estimators_generator
+        self.dual_constraint_rhs = dual_constraint_rhs
+        self.save_iteration_as_hyperparameter_each = save_iteration_as_hyperparameter_each
+
+    def fit(self, X, y):
+        if scipy.sparse.issparse(X):
+            logging.info('Converting to dense matrix.')
+            X = np.array(X.todense())
+
+        if self.estimators_generator is None:
+            self.estimators_generator = StumpsClassifiersGenerator(n_stumps_per_attribute=10, self_complemented=True)
+
+        y[y == 0] = -1
+
+        self.estimators_generator.fit(X, y)
+        self.classification_matrix = self._binary_classification_matrix(X)
+
+        self.weights_ = []
+        self.infos_per_iteration_ = defaultdict(list)
+
+        m, n = self.classification_matrix.shape
+        y_kernel_matrix = np.multiply(y.reshape((len(y), 1)), self.classification_matrix)
+
+        # Initialization
+        w = None
+        self.collected_weight_vectors_ = {}
+        self.collected_dual_constraint_violations_ = {}
+
+        alpha = self._initialize_alphas(m)
+
+        self.chosen_columns_ = []
+        self.edge_scores = []
+        self.alphas = []
+
+        self.n_total_hypotheses_ = n
+
+        for k in range(min(n, self.n_max_iterations if self.n_max_iterations is not None else np.inf)):
+            # Find worst weak hypothesis given alpha.
+            h_values = ma.array(np.squeeze(np.array(alpha.T.dot(y_kernel_matrix).T)), fill_value=-np.inf)
+            h_values[self.chosen_columns_] = ma.masked
+            worst_h_index = ma.argmax(h_values)
+            # logging.info("Adding voter {} to the columns, value = {}".format(worst_h_index, h_values[worst_h_index]))
+
+            # Check for optimal solution. We ensure at least one complete iteration is done as the initialization
+            # values might provide a degenerate initial solution.
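+            # (Stopping rule: once the best remaining edge no longer exceeds dual_constraint_rhs + epsilon, no
+            # column can further improve the dual objective, and the main loop ends.)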
+ # print(h_values[worst_h_index] - self.dual_constraint_rhs) + if h_values[worst_h_index] <= self.dual_constraint_rhs + self.epsilon and len(self.chosen_columns_) > 0: + break + + # Append the weak hypothesis. + self.chosen_columns_.append(worst_h_index) + self.edge_scores.append(h_values[worst_h_index]) + + if w is not None: + self.matrix_to_optimize = np.concatenate((np.matmul(self.matrix_to_optimize, w).reshape((m, 1)), + y_kernel_matrix[:, self.chosen_columns_[-1]].reshape((m, 1))), + axis=1) + else: + self.matrix_to_optimize = y_kernel_matrix[:, self.chosen_columns_[-1]].reshape((m, 1)) + + # Solve restricted master for new costs. + w, alpha = self._restricted_master_problem(self.matrix_to_optimize, previous_w=w, previous_alpha=alpha) + + self.alphas.append(alpha) + + + # We collect iteration information for later evaluation. + self.weights_.append(w[-1]) + + if self.save_iteration_as_hyperparameter_each is not None: + if (k + 1) % self.save_iteration_as_hyperparameter_each == 0: + self.collected_weight_vectors_[k] = deepcopy(w) + self.collected_dual_constraint_violations_[k] = h_values[worst_h_index] - self.dual_constraint_rhs + + self.estimators_generator.estimators_ = self.estimators_generator.estimators_[self.chosen_columns_] + + self.learner_info_ = {} + self.learner_info_.update(n_nonzero_weights=np.sum(np.asarray(self.weights_) > 1e-12)) + self.learner_info_.update(n_generated_columns=len(self.chosen_columns_)) + y[y == -1] = 0 + + return self + + def predict(self, X): + check_is_fitted(self, 'weights_') + + if scipy.sparse.issparse(X): + logging.warning('Converting sparse matrix to dense matrix.') + X = np.array(X.todense()) + + classification_matrix = self._binary_classification_matrix(X) + self.weights_ = np.array(self.weights_) + # final_vote_weights = np.array(self.weights_) / np.sum(np.array(self.weights_)) + self.final_vote_weights = np.array([np.prod(1-self.weights_[t+1:])*self.weights_[t] if t < self.weights_.shape[0]-1 else self.weights_[t] for t in range(self.weights_.shape[0]) ]) + margins = np.squeeze(np.asarray(np.matmul(classification_matrix, self.final_vote_weights))) + signs_array = np.array([int(x) for x in sign(margins)]) + signs_array[signs_array == -1] = 0 + return signs_array + + def _binary_classification_matrix(self, X): + probas = self._collect_probas(X) + predicted_labels = np.argmax(probas, axis=2) + predicted_labels[predicted_labels == 0] = -1 + values = np.max(probas, axis=2) + return (predicted_labels * values).T + + def _collect_probas(self, X): + return np.asarray([clf.predict_proba(X) for clf in self.estimators_generator.estimators_]) + + def _restricted_master_problem(self, y_kernel_matrix): + raise NotImplementedError("Restricted master problem not implemented.") + + def _initialize_alphas(self, n_examples): + raise NotImplementedError("Alpha weights initialization function is not implemented.") + + def evaluate_metrics(self, X, y, metrics_list=None, functions_list=None): + if metrics_list is None: + metrics_list = [zero_one_loss, zero_one_loss_per_example] + + if functions_list is None: + functions_list = [] + + # Predict, evaluate metrics. 
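+        # (Each column of the classification matrix is one kept voter's signed, confidence-weighted prediction;
+        # metrics are computed on the sign of their weighted combination.)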
+        classification_matrix = self._binary_classification_matrix(X)
+        predictions = sign(classification_matrix.dot(self.weights_))
+
+        if self.save_iteration_as_hyperparameter_each is None:
+            metrics_results = {}
+            for metric in metrics_list:
+                metrics_results[metric.__name__] = metric(y, predictions)
+
+            metrics_dataframe = ResultsDataFrame([metrics_results])
+            return metrics_dataframe
+
+        # If we collected iteration information to add a hyperparameter, we add an index with the hyperparameter name
+        # and return a ResultsDataFrame containing one row per hyperparameter value.
+        metrics_dataframe = ResultsDataFrame()
+        for t, weights in iteritems(self.collected_weight_vectors_):
+            predictions = sign(classification_matrix[:, :t + 1].dot(weights))
+            metrics_results = {metric.__name__: metric(y, predictions) for metric in metrics_list}
+            for function in functions_list:
+                metrics_results[function.__name__] = function(classification_matrix[:, :t + 1], y, weights)
+
+            # We add other collected information.
+            metrics_results['chosen_columns'] = self.chosen_columns_[t]
+            metrics_results['dual_constraint_violation'] = self.collected_dual_constraint_violations_[t]
+
+            metrics_dataframe = metrics_dataframe.append(ResultsDataFrame([metrics_results], index=[t]))
+
+        metrics_dataframe.index.name = 'hp__n_iterations'
+        return metrics_dataframe
+
+
+class CqBoostClassifierv2(ColumnGenerationClassifierv2):
+    def __init__(self, mu=0.001, epsilon=1e-08, n_max_iterations=None, estimators_generator=None, save_iteration_as_hyperparameter_each=None):
+        super(CqBoostClassifierv2, self).__init__(epsilon, n_max_iterations, estimators_generator, dual_constraint_rhs=0,
+                                                  save_iteration_as_hyperparameter_each=save_iteration_as_hyperparameter_each)
+        # TODO: Check the value of nu (dual_constraint_rhs) at initialization; it is ignored anyway, since the
+        # main loop cannot be exited with only one voter.
+        self.mu = mu
+
+    def _restricted_master_problem(self, y_kernel_matrix, previous_w=None, previous_alpha=None):
+        n_examples, n_hypotheses = y_kernel_matrix.shape
+
+        m_eye = np.eye(n_examples)
+        m_ones = np.ones((n_examples, 1))
+
+        qp_a = np.vstack((np.hstack((-y_kernel_matrix, m_eye)),
+                          np.hstack((np.ones((1, n_hypotheses)), np.zeros((1, n_examples))))))
+
+        qp_b = np.vstack((np.zeros((n_examples, 1)),
+                          np.array([1.0]).reshape((1, 1))))
+
+        qp_g = np.vstack((np.hstack((-np.eye(n_hypotheses), np.zeros((n_hypotheses, n_examples)))),
+                          np.hstack((np.zeros((1, n_hypotheses)), -1.0 / n_examples * m_ones.T))))
+
+        qp_h = np.vstack((np.zeros((n_hypotheses, 1)),
+                          np.array([-self.mu]).reshape((1, 1))))
+
+        qp = ConvexProgram()
+        qp.quadratic_func = 2.0 / n_examples * np.vstack((np.hstack((np.zeros((n_hypotheses, n_hypotheses)), np.zeros((n_hypotheses, n_examples)))),
+                                                          np.hstack((np.zeros((n_examples, n_hypotheses)), m_eye))))
+
+        qp.add_equality_constraints(qp_a, qp_b)
+        qp.add_inequality_constraints(qp_g, qp_h)
+
+        if previous_w is not None:
+            qp.initial_values = np.append(previous_w, [0])
+
+        try:
+            solver_result = qp.solve(abstol=1e-10, reltol=1e-10, feastol=1e-10, return_all_information=True)
+            w = np.asarray(np.array(solver_result['x']).T[0])[:n_hypotheses]
+
+            # The alphas are the Lagrange multipliers associated with the equality constraints (returned as the y vector in CVXOPT).
+            dual_variables = np.asarray(np.array(solver_result['y']).T[0])
+            alpha = dual_variables[:n_examples]
+
+            # Set the dual constraint right-hand side to be equal to the last Lagrange multiplier (nu).
+            # Hack: do not change nu if the QP didn't fully solve...
+            if solver_result['dual slack'] <= 1e-8:
+                self.dual_constraint_rhs = dual_variables[-1]
+                # logging.info('Updating dual constraint rhs: {}'.format(self.dual_constraint_rhs))
+
+        except Exception:
+            logging.warning('QP Solving failed at iteration {}.'.format(n_hypotheses))
+            if previous_w is not None:
+                w = np.append(previous_w, [0])
+            else:
+                w = np.array([1.0 / n_hypotheses] * n_hypotheses)
+
+            if previous_alpha is not None:
+                alpha = previous_alpha
+            else:
+                alpha = self._initialize_alphas(n_examples)
+
+        return w, alpha
+
+    def _initialize_alphas(self, n_examples):
+        return 1.0 / n_examples * np.ones((n_examples,))
+
+
+class CQBoostv2(CqBoostClassifierv2):
+
+    def __init__(self, random_state, **kwargs):
+        super(CQBoostv2, self).__init__(
+            mu=kwargs['mu'],
+            epsilon=kwargs['epsilon'],
+            n_max_iterations=kwargs['n_max_iterations'],
+        )
+
+    def canProbas(self):
+        """Used to know if the classifier can return label probabilities"""
+        return False
+
+    def paramsToSet(self, nIter=1):
+        """Used for weighted linear early fusion to generate random search sets"""
+        paramsSet = []
+        for _ in range(nIter):
+            paramsSet.append({"mu": 0.001,
+                              "epsilon": 1e-08,
+                              "n_max_iterations": None})
+        return paramsSet
+
+    def getKWARGS(self, args):
+        """Used to format kwargs for the parsed args"""
+        kwargsDict = {}
+        kwargsDict['mu'] = 0.001
+        kwargsDict['epsilon'] = 1e-08
+        kwargsDict['n_max_iterations'] = None
+        return kwargsDict
+
+    def genPipeline(self):
+        return Pipeline([('classifier', CqBoostClassifierv2())])
+
+    def genParamsDict(self, randomState):
+        return {"classifier__mu": [0.001],
+                "classifier__epsilon": [1e-08],
+                "classifier__n_max_iterations": [None]}
+
+    def genBestParams(self, detector):
+        return {"mu": detector.best_params_["classifier__mu"],
+                "epsilon": detector.best_params_["classifier__epsilon"],
+                "n_max_iterations": detector.best_params_["classifier__n_max_iterations"]}
+
+    def genParamsFromDetector(self, detector):
+        nIter = len(detector.cv_results_['param_classifier__mu'])
+        return [("mu", np.array([0.001 for _ in range(nIter)])),
+                ("epsilon", np.array(detector.cv_results_['param_classifier__epsilon'])),
+                ("n_max_iterations", np.array(detector.cv_results_['param_classifier__n_max_iterations']))]
+
+    def getConfig(self, config):
+        if type(config) is not dict:  # Used in late fusion when config is a classifier
+            return "\n\t\t- CQBoostv2 with mu : " + str(config.mu) + ", epsilon : " + str(
+                config.epsilon) + ", n_max_iterations : " + str(config.n_max_iterations)
+        else:
+            return "\n\t\t- CQBoostv2 with mu : " + str(config["mu"]) + ", epsilon : " + str(
+                config["epsilon"]) + ", n_max_iterations : " + str(config["n_max_iterations"])
+
+    def getInterpret(self, classifier, directory):
+        interpretString = ""
+        return interpretString
+
+
+def canProbas():
+    return False
+
+
+def fit(DATASET, CLASS_LABELS, randomState, NB_CORES=1, **kwargs):
+    """Used to fit the monoview classifier with the args stored in kwargs"""
+    classifier = CqBoostClassifierv2(mu=kwargs['mu'],
+                                     epsilon=kwargs['epsilon'],
+                                     n_max_iterations=kwargs["n_max_iterations"],)
+                                     # random_state=randomState)
+    classifier.fit(DATASET, CLASS_LABELS)
+    return classifier
+
+
+def paramsToSet(nIter, randomState):
+    """Used for weighted linear early fusion to generate random search sets"""
+    paramsSet = []
+    for _ in range(nIter):
+        paramsSet.append({"mu": randomState.choice([0.001, 0.002]),
+                          "epsilon": randomState.choice([1e-08, 2e-08]),
+                          "n_max_iterations": None})
+    return paramsSet
+
+
+def getKWARGS(args):
+    """Used to format kwargs for the parsed args"""
+    kwargsDict = {}
+    kwargsDict['mu'] = args.CQB2_mu
+    kwargsDict['epsilon'] = args.CQB2_epsilon
+    kwargsDict['n_max_iterations'] = None
+    return kwargsDict
+
+
+def genPipeline():
+    return Pipeline([('classifier', CqBoostClassifierv2())])
+
+
+def genParamsDict(randomState):
+    return {"classifier__mu": [0.001, 0.002],
+            "classifier__epsilon": [1e-08, 2e-08],
+            "classifier__n_max_iterations": [None]}
+
+
+def genBestParams(detector):
+    return {"mu": detector.best_params_["classifier__mu"],
+            "epsilon": detector.best_params_["classifier__epsilon"],
+            "n_max_iterations": detector.best_params_["classifier__n_max_iterations"]}
+
+
+def genParamsFromDetector(detector):
+    nIter = len(detector.cv_results_['param_classifier__mu'])
+    return [("mu", np.array([0.001 for _ in range(nIter)])),
+            ("epsilon", np.array(detector.cv_results_['param_classifier__epsilon'])),
+            ("n_max_iterations", np.array(detector.cv_results_['param_classifier__n_max_iterations']))]
+
+
+def getConfig(config):
+    if type(config) is not dict:  # Used in late fusion when config is a classifier
+        return "\n\t\t- CQBoostv2 with mu : " + str(config.mu) + ", epsilon : " + str(
+            config.epsilon) + ", n_max_iterations : " + str(config.n_max_iterations)
+    else:
+        return "\n\t\t- CQBoostv2 with mu : " + str(config["mu"]) + ", epsilon : " + str(
+            config["epsilon"]) + ", n_max_iterations : " + str(config["n_max_iterations"])
+
+
+def getInterpret(classifier, directory):
+    dotted = False
+    interpretString = "\t CQBoost v2 performed classification with weights : \n"
+    interpretString += np.array2string(classifier.final_vote_weights, precision=4, separator=',', suppress_small=True)
+    interpretString += "\n \t It used {} iterations to converge".format(len(classifier.final_vote_weights))
+    if len(classifier.final_vote_weights) == classifier.n_max_iterations:
+        interpretString += ", and used all available iterations"
+    else:
+        dotted = True
+        interpretString += "."
+    if len(classifier.final_vote_weights) == classifier.n_total_hypotheses_:
+        interpretString += ", and all the voters have been used."
+    elif not dotted:
+        interpretString += "."
+    interpretString += "\n\t Selected voters : \n"
+    interpretString += str(classifier.chosen_columns_)
+    interpretString += "\n\t and they voted : \n"
+    interpretString += np.array2string(classifier.classification_matrix[:, classifier.chosen_columns_], precision=4,
+                                       separator=',', suppress_small=True)
+    np.savetxt(directory + "voters.csv", classifier.classification_matrix[:, classifier.chosen_columns_], delimiter=',')
+    np.savetxt(directory + "weights.csv", classifier.final_vote_weights, delimiter=',')
+    return interpretString
+
+
+def _as_matrix(element):
+    """ Utility function to convert "anything" to a Numpy matrix.
+    """
+    # If a scalar, return a 1x1 matrix.
+    if len(np.shape(element)) == 0:
+        return np.matrix([[element]], dtype=float)
+
+    # If an nd-array vector, return a column matrix.
+    elif len(np.shape(element)) == 1:
+        matrix = np.matrix(element, dtype=float)
+        if np.shape(matrix)[1] != 1:
+            matrix = matrix.T
+        return matrix
+
+    return np.matrix(element, dtype=float)
+
+
+def _as_column_matrix(array_like):
+    """ Utility function to convert any array to a column Numpy matrix.
+ """ + matrix = _as_matrix(array_like) + if 1 not in np.shape(matrix): + raise ValueError("_as_column_vector: input must be a vector") + + if np.shape(matrix)[0] == 1: + matrix = matrix.T + + return matrix + + +def _as_line_matrix(array_like): + """ Utility function to convert any array to a line Numpy matrix. + """ + matrix = _as_matrix(array_like) + if 1 not in np.shape(matrix): + raise ValueError("_as_column_vector: input must be a vector") + + if np.shape(matrix)[1] == 1: + matrix = matrix.T + + return matrix + + +class ConvexProgram(object): + """ + Encapsulates a quadratic program of the following form: + + minimize (1/2)*x'*P*x + q'*x + subject to G*x <= h + A*x = b. + + + or a linear program of the following form: + + minimize c'*x + subject to G*x <= h + A*x = b + """ + def __init__(self): + self._quadratic_func = None + self._linear_func = None + self._inequality_constraints_matrix = None + self._inequality_constraints_values = None + self._equality_constraints_matrix = None + self._equality_constraints_values = None + self._lower_bound_values = None + self._upper_bound_values = None + self._n_variables = None + + @property + def n_variables(self): + return self._n_variables + + @property + def quadratic_func(self): + return self._quadratic_func + + @quadratic_func.setter + def quadratic_func(self, quad_matrix): + quad_matrix = _as_matrix(quad_matrix) + n_lines, n_columns = np.shape(quad_matrix) + assert(n_lines == n_columns) + + if self._linear_func is not None: + assert(np.shape(quad_matrix)[0] == self._n_variables) + else: + self._n_variables = n_lines + + self._quadratic_func = quad_matrix + + @property + def linear_func(self): + return self._linear_func + + @linear_func.setter + def linear_func(self, lin_vector): + if lin_vector is not None: + lin_vector = _as_column_matrix(lin_vector) + + if self._quadratic_func is not None: + assert(np.shape(lin_vector)[0] == self._n_variables) + + else: + self._n_variables = np.shape(lin_vector)[0] + + self._linear_func = lin_vector + + def add_inequality_constraints(self, inequality_matrix, inequality_values): + if inequality_matrix is None: + logging.info("Empty inequality constraint: ignoring!") + return + + self._assert_objective_function_is_set() + + if 1 in np.shape(inequality_matrix) or len(np.shape(inequality_matrix)) == 1: + inequality_matrix = _as_line_matrix(inequality_matrix) + else: + inequality_matrix = _as_matrix(inequality_matrix) + + inequality_values = _as_column_matrix(inequality_values) + assert np.shape(inequality_matrix)[1] == self._n_variables + assert np.shape(inequality_values)[1] == 1 + + if self._inequality_constraints_matrix is None: + self._inequality_constraints_matrix = inequality_matrix + else: + self._inequality_constraints_matrix = np.append(self._inequality_constraints_matrix, + inequality_matrix, axis=0) + + if self._inequality_constraints_values is None: + self._inequality_constraints_values = inequality_values + else: + self._inequality_constraints_values = np.append(self._inequality_constraints_values, + inequality_values, axis=0) + + def add_equality_constraints(self, equality_matrix, equality_values): + if equality_matrix is None: + logging.info("Empty equality constraint: ignoring!") + return + + self._assert_objective_function_is_set() + + if 1 in np.shape(equality_matrix) or len(np.shape(equality_matrix)) == 1: + equality_matrix = _as_line_matrix(equality_matrix) + else: + equality_matrix = _as_matrix(equality_matrix) + + equality_values = _as_matrix(equality_values) + assert 
np.shape(equality_matrix)[1] == self._n_variables + assert np.shape(equality_values)[1] == 1 + + if self._equality_constraints_matrix is None: + self._equality_constraints_matrix = equality_matrix + else: + self._equality_constraints_matrix = np.append(self._equality_constraints_matrix, + equality_matrix, axis=0) + + if self._equality_constraints_values is None: + self._equality_constraints_values = equality_values + else: + self._equality_constraints_values = np.append(self._equality_constraints_values, + equality_values, axis=0) + + def add_lower_bound(self, lower_bound): + if lower_bound is not None: + self._assert_objective_function_is_set() + self._lower_bound_values = np.array([lower_bound] * self._n_variables) + + def add_upper_bound(self, upper_bound): + if upper_bound is not None: + self._assert_objective_function_is_set() + self._upper_bound_values = np.array([upper_bound] * self._n_variables) + + def _convert_bounds_to_inequality_constraints(self): + self._assert_objective_function_is_set() + + if self._lower_bound_values is not None: + c_matrix = [] + for i in range(self._n_variables): + c_line = [0] * self._n_variables + c_line[i] = -1.0 + c_matrix.append(c_line) + + c_vector = _as_column_matrix(self._lower_bound_values) + self._lower_bound_values = None + self.add_inequality_constraints(np.matrix(c_matrix).T, c_vector) + + if self._upper_bound_values is not None: + c_matrix = [] + for i in range(self._n_variables): + c_line = [0] * self._n_variables + c_line[i] = 1.0 + c_matrix.append(c_line) + + c_vector = _as_column_matrix(self._upper_bound_values) + self._upper_bound_values = None + self.add_inequality_constraints(np.matrix(c_matrix).T, c_vector) + + def _convert_to_cvxopt_matrices(self): + from cvxopt import matrix as cvxopt_matrix + + if self._quadratic_func is not None: + self._quadratic_func = cvxopt_matrix(self._quadratic_func) + + if self._linear_func is not None: + self._linear_func = cvxopt_matrix(self._linear_func) + else: + # CVXOPT needs this vector to be set even if it is not used, so we put zeros in it! + self._linear_func = cvxopt_matrix(np.zeros((self._n_variables, 1))) + + if self._inequality_constraints_matrix is not None: + self._inequality_constraints_matrix = cvxopt_matrix(self._inequality_constraints_matrix) + + if self._inequality_constraints_values is not None: + self._inequality_constraints_values = cvxopt_matrix(self._inequality_constraints_values) + + if self._equality_constraints_matrix is not None: + self._equality_constraints_matrix = cvxopt_matrix(self._equality_constraints_matrix) + + if self._equality_constraints_values is not None: + self._equality_constraints_values = cvxopt_matrix(self._equality_constraints_values) + + def _assert_objective_function_is_set(self): + assert self._n_variables is not None + + def solve(self, solver="cvxopt", feastol=1e-7, abstol=1e-7, reltol=1e-6, return_all_information=False): + + # Some solvers are very verbose, and we don't want them to pollute STDOUT or STDERR. 
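+        # Usage sketch of this class (illustrative values, not taken from the
+        # platform): minimizing 0.5 * x'*I*x under sum(x) == 1 and x >= 0
+        # should return the uniform vector.
+        #
+        #     qp = ConvexProgram()
+        #     qp.quadratic_func = np.eye(3)
+        #     qp.add_equality_constraints(np.ones(3), 1.0)
+        #     qp.add_lower_bound(0.0)
+        #     x = qp.solve(solver="cvxopt")   # ~ array([1/3, 1/3, 1/3])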
+ original_stdout = sys.stdout + original_stderr = sys.stderr + + ret = None + + # TODO: Repair + # if solver == "cvxopt": + # stdout_logger = logging.getLogger('CVXOPT') + # sl = StreamToLogger(stdout_logger, logging.DEBUG) + # sys.stdout = sl + + # stderr_logger = logging.getLogger('CVXOPT') + # sl = StreamToLogger(stderr_logger, logging.WARNING) + # sys.stderr = sl + + try: + if solver == "cvxopt": + from cvxopt.solvers import qp, lp, options + options['feastol'] = feastol + options['abstol'] = abstol + options['reltol'] = reltol + options['show_progress'] = False + + self._convert_bounds_to_inequality_constraints() + self._convert_to_cvxopt_matrices() + + if self._quadratic_func is not None: + ret = qp(self.quadratic_func, self.linear_func, self._inequality_constraints_matrix, + self._inequality_constraints_values, self._equality_constraints_matrix, + self._equality_constraints_values) + + else: + ret = lp(self.linear_func, + G=self._inequality_constraints_matrix, + h=self._inequality_constraints_values, + A=self._equality_constraints_matrix, + b=self._equality_constraints_values) + + #logging.info("Primal objective value = {}".format(ret['primal objective'])) + #logging.info("Dual objective value = {}".format(ret['dual objective'])) + + if not return_all_information: + ret = np.asarray(np.array(ret['x']).T[0]) + + elif solver == "cplex": + import cplex + p = cplex.Cplex() + p.objective.set_sense(p.objective.sense.minimize) + + # This is ugly. CPLEX wants a list of lists of lists. First dimension represents the lines of the QP + # matrix. Second dimension contains a pair of two elements: the indices of the variables in play (all of + # them...), and the values (columns of the QP matrix). + names = [str(x) for x in range(self._n_variables)] + p.variables.add(names=names) + + if self.quadratic_func is not None: + p_matrix = [] + for line in self._quadratic_func: + p_matrix.append([names, line.tolist()[0]]) + + p.objective.set_quadratic(p_matrix) + + if self.linear_func is not None: + p.objective.set_linear(zip(names, + np.asarray(self.linear_func.T).reshape(self.n_variables,).tolist())) + + if self._inequality_constraints_matrix is not None: + inequality_linear = [] + for line in self._inequality_constraints_matrix: + inequality_linear.append([names, line.tolist()[0]]) + p.linear_constraints.add(lin_expr=inequality_linear, + rhs=np.asarray(self._inequality_constraints_values.T).tolist()[0], + senses="L"*len(self._inequality_constraints_values)) + + if self._equality_constraints_matrix is not None: + equality_linear = [] + for line in self._equality_constraints_matrix: + equality_linear.append([names, line.tolist()[0]]) + p.linear_constraints.add(lin_expr=equality_linear, + rhs=np.asarray(self._equality_constraints_values.T).tolist()[0], + senses="E"*len(self._equality_constraints_values)) + + if self._lower_bound_values is not None: + p.variables.set_lower_bounds(zip(names, self._lower_bound_values)) + + if self._upper_bound_values is not None: + p.variables.set_upper_bounds(zip(names, self._upper_bound_values)) + + p.solve() + + logging.info("Solution status = {} : {}".format(p.solution.get_status(), + p.solution.status[p.solution.get_status()])) + logging.info("Solution value = {}".format(p.solution.get_objective_value())) + + if not return_all_information: + ret = np.array(p.solution.get_values()) + else: + ret = {'primal': np.array(p.solution.get_values()), + 'dual': np.array(p.solution.get_dual_values())} + + elif solver == "pycpx": + # This shows how easy it is to use pycpx. 
However, it is much slower (as it is more versatile!). + + import pycpx + model = pycpx.CPlexModel(verbosity=2) + q = model.new(self.n_variables) + + if self._inequality_constraints_matrix is not None: + model.constrain(self._inequality_constraints_matrix * q <= self._inequality_constraints_values) + if self._equality_constraints_matrix is not None: + model.constrain(self._equality_constraints_matrix * q == self._equality_constraints_values) + if self._lower_bound_values is not None: + model.constrain(q >= self._lower_bound_values) + if self._upper_bound_values is not None: + model.constrain(q <= self._upper_bound_values) + + value = model.minimize(0.5 * q.T * self._quadratic_func * q + self.linear_func.T * q) + + logging.info("Solution value = {}".format(value)) + + if not return_all_information: + ret = np.array(model[q]) + else: + ret = model + + except: + raise + + finally: + sys.stdout = original_stdout + sys.stderr = original_stderr + + return ret + + + + + + +class DecisionStumpClassifier(BaseEstimator, ClassifierMixin): + """Generic Attribute Threshold Binary Classifier + + Attributes + ---------- + attribute_index : int + The attribute to consider for the classification. + threshold : float + The threshold value for classification rule. + direction : int, optional + A multiplicative constant (1 or -1) to choose the "direction" of the stump. Defaults to 1. If -1, the stump + will predict the "negative" class (generally -1 or 0), and if 1, the stump will predict the second class (generally 1). + + """ + def __init__(self, attribute_index, threshold, direction=1): + super(DecisionStumpClassifier, self).__init__() + self.attribute_index = attribute_index + self.threshold = threshold + self.direction = direction + + def fit(self, X, y): + # Only verify that we are in the binary classification setting, with support for transductive learning. + if isinstance(y, np.ma.MaskedArray): + self.classes_ = np.unique(y[np.logical_not(y.mask)]) + else: + self.classes_ = np.unique(y) + + # This label encoder is there for the predict function to be able to return any two classes that were used + # when fitting, for example {-1, 1} or {0, 1}. + self.le_ = LabelEncoder() + self.le_.fit(self.classes_) + self.classes_ = self.le_.classes_ + + assert len(self.classes_) == 2, "DecisionStumpsVoter only supports binary classification" + return self + + def predict(self, X): + """Returns the output of the classifier, on a sample X. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Training vectors, where n_samples is the number of samples and + n_features is the number of features. + + Returns + ------- + predictions : array-like, shape = [n_samples] + Predicted class labels. + + """ + check_is_fitted(self, 'classes_') + return self.le_.inverse_transform(np.argmax(self.predict_proba(X), axis=1)) + + def predict_proba(self, X): + """Compute probabilities of possible outcomes for samples in X. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Training vectors, where n_samples is the number of samples and + n_features is the number of features. + + Returns + ------- + avg : array-like, shape = [n_samples, n_classes] + Weighted average probability for each class per sample. 
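+
+        Example (illustrative only; the exact array formatting may vary with
+        the numpy version)::
+
+            >>> stump = DecisionStumpClassifier(0, 0.5).fit(np.array([[0.], [1.]]),
+            ...                                             np.array([0, 1]))
+            >>> stump.predict_proba(np.array([[0.2], [0.8]]))[:, 1]
+            array([ 0.,  1.])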
+ + """ + check_is_fitted(self, 'classes_') + X = np.asarray(X) + probas = np.zeros((X.shape[0], 2)) + positive_class = np.argwhere(X[:, self.attribute_index] > self.threshold) + negative_class = np.setdiff1d(range(X.shape[0]), positive_class) + probas[positive_class, 1] = 1.0 + probas[negative_class, 0] = 1.0 + + if self.direction == -1: + probas = 1 - probas + + return probas + + def reverse_decision(self): + self.direction *= -1 + + +class ClassifiersGenerator(BaseEstimator, TransformerMixin): + """Base class to create a set of voters using training samples, and then transform a set of examples in + the voters' output space. + + Attributes + ---------- + self_complemented : bool, optional + Whether or not a binary complement voter must be generated for each voter. Defaults to False. + voters : ndarray of voter functions + Once fit, contains the voter functions. + + """ + def __init__(self, self_complemented=False): + super(ClassifiersGenerator, self).__init__() + self.self_complemented = self_complemented + + def fit(self, X, y=None): + """Generates the voters using training samples. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Input data on which to base the voters. + y : ndarray of shape (n_labeled_samples,), optional + Input labels, usually determines the decision polarity of each voter. + + Returns + ------- + self + + """ + raise NotImplementedError + + def transform(self, X): + """Transforms the input points in a matrix of classification, using previously learned voters. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Input data to classify. + + Returns + ------- + ndarray of shape (n_samples, n_voters) + The voters' decision on each example. + + """ + check_is_fitted(self, 'estimators_') + return np.array([voter.predict(X) for voter in self.estimators_]).T + +class StumpsClassifiersGenerator(ClassifiersGenerator): + """Decision Stump Voters transformer. + + Parameters + ---------- + n_stumps_per_attribute : int, optional + Determines how many decision stumps will be created for each attribute. Defaults to 10. + No stumps will be created for attributes with only one possible value. + self_complemented : bool, optional + Whether or not a binary complement voter must be generated for each voter. Defaults to False. + + """ + def __init__(self, n_stumps_per_attribute=10, self_complemented=False): + super(StumpsClassifiersGenerator, self).__init__(self_complemented) + self.n_stumps_per_attribute = n_stumps_per_attribute + + def fit(self, X, y): + """Fits Decision Stump voters on a training set. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Input data on which to base the voters. + y : ndarray of shape (n_labeled_samples,), optional + Only used to ensure that we are in the binary classification setting. 
+ + Returns + ------- + self + + """ + minimums = np.min(X, axis=0) + maximums = np.max(X, axis=0) + ranges = (maximums - minimums) / (self.n_stumps_per_attribute + 1) + + self.estimators_ = [DecisionStumpClassifier(i, minimums[i] + ranges[i] * stump_number, 1).fit(X, y) + for i in range(X.shape[1]) for stump_number in range(1, self.n_stumps_per_attribute + 1) + if ranges[i] != 0] + + if self.self_complemented: + self.estimators_ += [DecisionStumpClassifier(i, minimums[i] + ranges[i] * stump_number, -1).fit(X, y) + for i in range(X.shape[1]) for stump_number in range(1, self.n_stumps_per_attribute + 1) + if ranges[i] != 0] + + self.estimators_ = np.asarray(self.estimators_) + return self + +def sign(array): + """Computes the elementwise sign of all elements of an array. The sign function returns -1 if x <=0 and 1 if x > 0. + Note that numpy's sign function can return 0, which is not desirable in most cases in Machine Learning algorithms. + + Parameters + ---------- + array : array-like + Input values. + + Returns + ------- + ndarray + An array with the signs of input elements. + + """ + signs = np.sign(array) + + signs[array == 0] = -1 + return signs + + +def zero_one_loss(y_target, y_estimate, confidences=1): + if len(y_target) == 0: + return 0.0 + return np.mean(y_target != y_estimate) + + +def zero_one_loss_per_example(y_target, y_estimate, confidences=1): + if len(y_target) == 0: + return 0.0 + return (y_target != y_estimate).astype(np.int) + + +class ResultsDataFrame(pd.DataFrame): + """A ResultsDataFrame is a DataFrame with the following information: + + - A 'dataset' column that contains the dataset name + - Hyperparamer columns, named 'hp__HPNAME', where HPNAME is the name of the hyperparameter + - Columns containing informations about that depend on the dataset and hyperparameters, for example the risk. + + """ + @property + def datasets_list(self): + """Returns the sorted list of datasets. + + """ + return sorted(set(self['dataset'])) + + @property + def hyperparameters_list(self): + """Returns a sorted list of hyperparameter names, without the 'hp__' prefix. + + """ + return sorted(column.split('hp__')[1] for column in self.columns if column.startswith('hp__')) + + @property + def hyperparameters_list_with_prefix(self): + return sorted(column for column in self.columns if column.startswith('hp__')) + + @property + def metrics_list(self): + return sorted(column for column in self.columns if not column.startswith('hp__') and column != 'dataset') + + @property + def hyperparameters_with_values(self): + """Returns a dictionary that contains the hyperparameter names (without the 'hp__' prefix), and + associated values that are present in the DataFrame. + + """ + hyperparameters = [column for column in self.columns if column.startswith('hp__')] + + hyperparameters_dict = {} + tmp_dict = self[hyperparameters].to_dict() + + for key, value in iteritems(tmp_dict): + hyperparameters_dict[key.split('hp__')[1]] = list(value.values())[0] if len(value) == 1 else sorted(set(value.values())) + + return hyperparameters_dict + + @property + def hyperparameters_with_values_per_dataset(self): + """Returns a dictionary of dictionaries that contains for each dataset, the hyperparameter names (without the + 'hp__' prefix), and associated values that are present in the DataFrame. 
+ + """ + hyperparameters = [column for column in self.columns if column.startswith('hp__')] + + hyperparameters_dict = {} + for dataset in self.datasets_list: + tmp_dict = self[self.dataset == dataset][hyperparameters].to_dict() + hyperparameters_dict[dataset] = {} + + for key, value in iteritems(tmp_dict): + hyperparameters_dict[dataset][key.split('hp__')[1]] = list(value.values())[0] if len(value) == 1 else sorted(value.values()) + + return hyperparameters_dict + + def results_optimizing_metric(self, metric_to_optimize='cv_mean__valid__zero_one_loss', minimize=True, tie_breaking_functions_ordered_dict=None): + function = min if minimize else max + + # We extract all the rows that have the best value for the metric to optimize. + optimal_results = self[self.groupby('dataset', sort=False)[metric_to_optimize].transform(function) == self[metric_to_optimize]] + + # We tie the breaks by applying the tie breaking functions (in the order of the dictionary). If hyperparameters are missing, we simply + # use the median for each hyperparameter, in a fixed (reproduceable) order. + if tie_breaking_functions_ordered_dict is None: + tie_breaking_functions_ordered_dict = OrderedDict() + else: + # Avoid side effects and ensures that the dictionary is an OrderedDict before we add missing hyperparameters. + tie_breaking_functions_ordered_dict = OrderedDict(tie_breaking_functions_ordered_dict.copy()) + + for hyperparameter in sorted(self.hyperparameters_list): + if hyperparameter not in tie_breaking_functions_ordered_dict.keys(): + tie_breaking_functions_ordered_dict[hyperparameter] = np.median + + for hyperparameter, tie_breaking_function in iteritems(tie_breaking_functions_ordered_dict): + optimal_results = optimal_results[optimal_results.groupby('dataset')['hp__' + hyperparameter].transform(partial(get_optimal_value_in_list, tie_breaking_function)) == optimal_results['hp__' + hyperparameter]] + + return ResultsDataFrame(optimal_results) + + def get_dataframe_with_metrics_as_one_column(self, metrics_to_keep=None): + new_dataframe = ResultsDataFrame() + + if metrics_to_keep is None: + metrics_to_keep = self.metrics_list + + for metric in metrics_to_keep: + columns = self.hyperparameters_list_with_prefix + [metric] + if 'dataset' in self: + columns.append('dataset') + + tmp = self.loc[:, columns] + tmp.columns = [c if c != metric else 'value' for c in tmp.columns] + tmp.loc[:, 'metric'] = metric + new_dataframe = new_dataframe.append(tmp, ignore_index=True) + + return new_dataframe + + +def get_optimal_value_in_list(optimum_function, values_list): + """Given a list of values and an optimal value, returns the value from the list that is the closest to the optimum, + given by optimum_function applied to the same list. 
+ + >>> get_optimal_value_in_list(np.median, [2, 4, 5, 6]) + 4 + + """ + values_list = sorted(list(values_list)) + return values_list[np.argmin(np.array([scipy.spatial.distance.euclidean(value, optimum_function(values_list)) for value in values_list]))] diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CQBoostv21.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CQBoostv21.py new file mode 100644 index 0000000000000000000000000000000000000000..a6f9ab442e3a1f809acf091c6e3f3bcf9340281e --- /dev/null +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CQBoostv21.py @@ -0,0 +1,1073 @@ +import scipy +import logging +from future.utils import iteritems +from copy import deepcopy +import numpy.ma as ma +from collections import defaultdict, OrderedDict +import pandas as pd +import sys +from functools import partial +import numpy as np +from scipy.spatial import distance +from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin +from sklearn.utils.validation import check_is_fitted +from sklearn.preprocessing import LabelEncoder +from sklearn.tree import DecisionTreeClassifier +from sklearn.metrics.pairwise import rbf_kernel, linear_kernel +import numpy as np +from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.pipeline import Pipeline +from sklearn.model_selection import RandomizedSearchCV +from sklearn.tree import DecisionTreeClassifier +from scipy.stats import randint +import numpy as np + + +class ColumnGenerationClassifierv21(BaseEstimator, ClassifierMixin): + def __init__(self, epsilon=1e-06, n_max_iterations=None, estimators_generator=None, dual_constraint_rhs=0, save_iteration_as_hyperparameter_each=None): + self.epsilon = epsilon + self.n_max_iterations = n_max_iterations + self.estimators_generator = estimators_generator + self.dual_constraint_rhs = dual_constraint_rhs + self.save_iteration_as_hyperparameter_each = save_iteration_as_hyperparameter_each + + def fit(self, X, y): + if scipy.sparse.issparse(X): + logging.info('Converting to dense matrix.') + X = np.array(X.todense()) + + if self.estimators_generator is None: + self.estimators_generator = StumpsClassifiersGenerator(n_stumps_per_attribute=10, self_complemented=True) + + self.estimators_generator.fit(X, y) + classification_matrix = self._binary_classification_matrix(X) + + self.chosen_columns_ = [] + self.weights_ = [] + self.infos_per_iteration_ = defaultdict(list) + + m, n = classification_matrix.shape + self.matrix_to_optimize = 1e-08*np.ones((m,1), dtype=float) + self.n_total_hypotheses_ = n + + y_kernel_matrix = np.multiply(y.reshape((len(y), 1)), classification_matrix) + + # Initialization + alpha = self._initialize_alphas(m) + w = np.array([1.0]) + self.collected_weight_vectors_ = {} + self.collected_dual_constraint_violations_ = {} + + for k in range(min(n, self.n_max_iterations if self.n_max_iterations is not None else np.inf)): + + # Find worst weak hypothesis given alpha. + h_values = ma.array(np.squeeze(np.array(alpha.T.dot(y_kernel_matrix).T)), fill_value=-np.inf) + h_values[self.chosen_columns_] = ma.masked + worst_h_index = ma.argmax(h_values) + #logging.info("Adding voter {} to the columns, value = {}".format(worst_h_index, h_values[worst_h_index])) + + # Check for optimal solution. We ensure at least one complete iteration is done as the initialization + # values might provide a degenerate initial solution. 
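+            # In column-generation terms: h_values[j] = sum_i alpha_i * y_i * h_j(x_i)
+            # is the reduced cost of voter j. If even the largest reduced cost does
+            # not exceed the current dual right-hand side (nu) by more than epsilon,
+            # no new column can improve the restricted master problem, so we stop.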
+ if h_values[worst_h_index] <= self.dual_constraint_rhs + self.epsilon and len(self.chosen_columns_) > 0: + break + + # Append the weak hypothesis. + self.chosen_columns_.append(worst_h_index) + + self.matrix_to_optimize = np.concatenate((np.matmul(self.matrix_to_optimize, w).reshape((m,1)), + y_kernel_matrix[:,self.chosen_columns_[-1]].reshape((m,1))), + axis=1) + + # Solve restricted master for new costs. + w, alpha = self._restricted_master_problem(self.matrix_to_optimize, previous_w=w, previous_alpha=alpha) + + + # We collect iteration information for later evaluation. + self.weights_.append(w[1]) + + if self.save_iteration_as_hyperparameter_each is not None: + if (k + 1) % self.save_iteration_as_hyperparameter_each == 0: + self.collected_weight_vectors_[k] = deepcopy(w) + self.collected_dual_constraint_violations_[k] = h_values[worst_h_index] - self.dual_constraint_rhs + + + self.estimators_generator.estimators_ = self.estimators_generator.estimators_[self.chosen_columns_] + + self.learner_info_ = {} + self.learner_info_.update(n_nonzero_weights=np.sum(np.asarray(self.weights_) > 1e-12)) + self.learner_info_.update(n_generated_columns=len(self.chosen_columns_)) + + return self + + def predict(self, X): + check_is_fitted(self, 'weights_') + + if scipy.sparse.issparse(X): + logging.warning('Converting sparse matrix to dense matrix.') + X = np.array(X.todense()) + + classification_matrix = self._binary_classification_matrix(X) + self.weights_ = np.array(self.weights_) + final_vote_weights = np.array(self.weights_) / np.sum(np.array(self.weights_)) + # final_vote_weights = np.array([np.prod(1-self.weights_[t+1:])*self.weights_[t] if t < self.weights_.shape[0]-1 else self.weights_[t] for t in range(self.weights_.shape[0]) ]) + margins = np.squeeze(np.asarray(np.matmul(classification_matrix, final_vote_weights))) + signs_array = np.array([int(x) for x in sign(margins)]) + signs_array[signs_array == -1 ] = 0 + return signs_array + + def _binary_classification_matrix(self, X): + probas = self._collect_probas(X) + predicted_labels = np.argmax(probas, axis=2) + predicted_labels[predicted_labels == 0] = -1 + values = np.max(probas, axis=2) + return (predicted_labels * values).T + + def _collect_probas(self, X): + return np.asarray([clf.predict_proba(X) for clf in self.estimators_generator.estimators_]) + + def _restricted_master_problem(self, y_kernel_matrix): + raise NotImplementedError("Restricted master problem not implemented.") + + def _initialize_alphas(self, n_examples): + raise NotImplementedError("Alpha weights initialization function is not implemented.") + + def evaluate_metrics(self, X, y, metrics_list=None, functions_list=None): + if metrics_list is None: + metrics_list = [zero_one_loss, zero_one_loss_per_example] + + if functions_list is None: + functions_list = [] + + # Predict, evaluate metrics. + classification_matrix = self._binary_classification_matrix(X) + predictions = sign(classification_matrix.dot(self.weights_)) + + if self.save_iteration_as_hyperparameter_each is None: + metrics_results = {} + for metric in metrics_list: + metrics_results[metric.__name__] = metric(y, predictions) + + metrics_dataframe = ResultsDataFrame([metrics_results]) + return metrics_dataframe + + # If we collected iteration informations to add a hyperparameter, we add an index with the hyperparameter name + # and return a ResultsDataFrame containing one row per hyperparameter value. 
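+        # Sketch of what is returned below: one row per saved iteration t,
+        # indexed by 'hp__n_iterations', with a column per metric plus
+        # 'chosen_columns' and 'dual_constraint_violation'.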
+ metrics_dataframe = ResultsDataFrame() + for t, weights in iteritems(self.collected_weight_vectors_): + predictions = sign(classification_matrix[:, :t + 1].dot(weights)) + metrics_results = {metric.__name__: metric(y, predictions) for metric in metrics_list} + for function in functions_list: + metrics_results[function.__name__] = function(classification_matrix[:, :t + 1], y, weights) + + # We add other collected information. + metrics_results['chosen_columns'] = self.chosen_columns_[t] + metrics_results['dual_constraint_violation'] = self.collected_dual_constraint_violations_[t] + + metrics_dataframe = metrics_dataframe.append(ResultsDataFrame([metrics_results], index=[t])) + + metrics_dataframe.index.name = 'hp__n_iterations' + return metrics_dataframe + +class CqBoostClassifierv21(ColumnGenerationClassifierv21): + def __init__(self, mu=0.001, epsilon=1e-08, n_max_iterations=None, estimators_generator=None, save_iteration_as_hyperparameter_each=None): + super(CqBoostClassifierv21, self).__init__(epsilon, n_max_iterations, estimators_generator, dual_constraint_rhs=0, + save_iteration_as_hyperparameter_each=save_iteration_as_hyperparameter_each) + # TODO: Vérifier la valeur de nu (dual_constraint_rhs) à l'initialisation, mais de toute manière ignorée car + # on ne peut pas quitter la boucle principale avec seulement un votant. + self.mu = mu + + def _restricted_master_problem(self, y_kernel_matrix, previous_w=None, previous_alpha=None): + n_examples, n_hypotheses = y_kernel_matrix.shape + + m_eye = np.eye(n_examples) + m_ones = np.ones((n_examples, 1)) + + qp_a = np.vstack((np.hstack((-y_kernel_matrix, m_eye)), + np.hstack((np.ones((1, n_hypotheses)), np.zeros((1, n_examples)))))) + + qp_b = np.vstack((np.zeros((n_examples, 1)), + np.array([1.0]).reshape((1, 1)))) + + qp_g = np.vstack((np.hstack((-np.eye(n_hypotheses), np.zeros((n_hypotheses, n_examples)))), + np.hstack((np.zeros((1, n_hypotheses)), - 1.0 / n_examples * m_ones.T)))) + + qp_h = np.vstack((np.zeros((n_hypotheses, 1)), + np.array([-self.mu]).reshape((1, 1)))) + + qp = ConvexProgram() + qp.quadratic_func = 2.0 / n_examples * np.vstack((np.hstack((np.zeros((n_hypotheses, n_hypotheses)), np.zeros((n_hypotheses, n_examples)))), + np.hstack((np.zeros((n_examples, n_hypotheses)), m_eye)))) + + qp.add_equality_constraints(qp_a, qp_b) + qp.add_inequality_constraints(qp_g, qp_h) + + if previous_w is not None: + qp.initial_values = np.append(previous_w, [0]) + + try: + solver_result = qp.solve(abstol=1e-10, reltol=1e-10, feastol=1e-10, return_all_information=True) + w = np.asarray(np.array(solver_result['x']).T[0])[:n_hypotheses] + + # The alphas are the Lagrange multipliers associated with the equality constraints (returned as the y vector in CVXOPT). + dual_variables = np.asarray(np.array(solver_result['y']).T[0]) + alpha = dual_variables[:n_examples] + + # Set the dual constraint right-hand side to be equal to the last lagrange multiplier (nu). + # Hack: do not change nu if the QP didn't fully solve... 
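+            # ('dual slack' comes from CVXOPT's solution dictionary; to the best
+            # of our knowledge it is the smallest slack in the dual inequalities,
+            # so a near-zero value means the multipliers can be trusted.)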
+            if solver_result['dual slack'] <= 1e-8:
+                self.dual_constraint_rhs = dual_variables[-1]
+                # logging.info('Updating dual constraint rhs: {}'.format(self.dual_constraint_rhs))
+
+        except Exception:
+            logging.warning('QP Solving failed at iteration {}.'.format(n_hypotheses))
+            if previous_w is not None:
+                w = np.append(previous_w, [0])
+            else:
+                w = np.array([1.0 / n_hypotheses] * n_hypotheses)
+
+            if previous_alpha is not None:
+                alpha = previous_alpha
+            else:
+                alpha = self._initialize_alphas(n_examples)
+
+        return w, alpha
+
+    def _initialize_alphas(self, n_examples):
+        return 1.0 / n_examples * np.ones((n_examples,))
+
+
+class CQBoostv21(CqBoostClassifierv21):
+
+    def __init__(self, random_state, **kwargs):
+        super(CQBoostv21, self).__init__(
+            mu=kwargs['mu'],
+            epsilon=kwargs['epsilon'],
+            n_max_iterations=kwargs['n_max_iterations'],
+        )
+
+    def canProbas(self):
+        """Used to know if the classifier can return label probabilities"""
+        return False
+
+    def paramsToSet(self, nIter=1):
+        """Used for weighted linear early fusion to generate random search sets"""
+        paramsSet = []
+        for _ in range(nIter):
+            paramsSet.append({"mu": 0.001,
+                              "epsilon": 1e-08,
+                              "n_max_iterations": None})
+        return paramsSet
+
+    def getKWARGS(self, args):
+        """Used to format kwargs for the parsed args"""
+        kwargsDict = {}
+        kwargsDict['mu'] = 0.001
+        kwargsDict['epsilon'] = 1e-08
+        kwargsDict['n_max_iterations'] = None
+        return kwargsDict
+
+    def genPipeline(self):
+        return Pipeline([('classifier', CqBoostClassifierv21())])
+
+    def genParamsDict(self, randomState):
+        return {"classifier__mu": [0.001],
+                "classifier__epsilon": [1e-08],
+                "classifier__n_max_iterations": [None]}
+
+    def genBestParams(self, detector):
+        return {"mu": detector.best_params_["classifier__mu"],
+                "epsilon": detector.best_params_["classifier__epsilon"],
+                "n_max_iterations": detector.best_params_["classifier__n_max_iterations"]}
+
+    def genParamsFromDetector(self, detector):
+        nIter = len(detector.cv_results_['param_classifier__mu'])
+        return [("mu", np.array([0.001 for _ in range(nIter)])),
+                ("epsilon", np.array(detector.cv_results_['param_classifier__epsilon'])),
+                ("n_max_iterations", np.array(detector.cv_results_['param_classifier__n_max_iterations']))]
+
+    def getConfig(self, config):
+        if type(config) is not dict:  # Used in late fusion when config is a classifier
+            return "\n\t\t- CQBoostv21 with mu : " + str(config.mu) + ", epsilon : " + str(
+                config.epsilon) + ", n_max_iterations : " + str(config.n_max_iterations)
+        else:
+            return "\n\t\t- CQBoostv21 with mu : " + str(config["mu"]) + ", epsilon : " + str(
+                config["epsilon"]) + ", n_max_iterations : " + str(config["n_max_iterations"])
+
+    def getInterpret(self, classifier, directory):
+        interpretString = ""
+        return interpretString
+
+
+def canProbas():
+    return False
+
+
+def fit(DATASET, CLASS_LABELS, randomState, NB_CORES=1, **kwargs):
+    """Used to fit the monoview classifier with the args stored in kwargs"""
+    classifier = CqBoostClassifierv21(mu=kwargs['mu'],
+                                      epsilon=kwargs['epsilon'],
+                                      n_max_iterations=kwargs["n_max_iterations"],)
+    # random_state=randomState)
+    classifier.fit(DATASET, CLASS_LABELS)
+    return classifier
+
+
+def paramsToSet(nIter, randomState):
+    """Used for weighted linear early fusion to generate random search sets"""
+    paramsSet = []
+    for _ in range(nIter):
+        paramsSet.append({"mu": randomState.choice([0.001, 0.002]),
+                          "epsilon": randomState.choice([1e-08, 2e-08]),
+                          "n_max_iterations": None})
+    return paramsSet
+
+
+def getKWARGS(args):
+    """Used to format kwargs for the parsed args"""
+    kwargsDict = {}
+    kwargsDict['mu'] = args.CQB21_mu
+    kwargsDict['epsilon'] = args.CQB21_epsilon
+    kwargsDict['n_max_iterations'] = None
+    return kwargsDict
+
+
+def genPipeline():
+    return Pipeline([('classifier', CqBoostClassifierv21())])
+
+
+def genParamsDict(randomState):
+    return {"classifier__mu": [0.001, 0.002],
+            "classifier__epsilon": [1e-08, 2e-08],
+            "classifier__n_max_iterations": [None]}
+
+
+def genBestParams(detector):
+    return {"mu": detector.best_params_["classifier__mu"],
+            "epsilon": detector.best_params_["classifier__epsilon"],
+            "n_max_iterations": detector.best_params_["classifier__n_max_iterations"]}
+
+
+def genParamsFromDetector(detector):
+    nIter = len(detector.cv_results_['param_classifier__mu'])
+    return [("mu", np.array([0.001 for _ in range(nIter)])),
+            ("epsilon", np.array(detector.cv_results_['param_classifier__epsilon'])),
+            ("n_max_iterations", np.array(detector.cv_results_['param_classifier__n_max_iterations']))]
+
+
+def getConfig(config):
+    if type(config) is not dict:  # Used in late fusion when config is a classifier
+        return "\n\t\t- CQBoostv21 with mu : " + str(config.mu) + ", epsilon : " + str(
+            config.epsilon) + ", n_max_iterations : " + str(config.n_max_iterations)
+    else:
+        return "\n\t\t- CQBoostv21 with mu : " + str(config["mu"]) + ", epsilon : " + str(
+            config["epsilon"]) + ", n_max_iterations : " + str(config["n_max_iterations"])
+
+
+def getInterpret(classifier, directory):
+    interpretString = ""
+    return interpretString
+
+
+
+
+
+def _as_matrix(element):
+    """ Utility function to convert "anything" to a Numpy matrix.
+    """
+    # If a scalar, return a 1x1 matrix.
+    if len(np.shape(element)) == 0:
+        return np.matrix([[element]], dtype=float)
+
+    # If a nd-array vector, return a column matrix.
+    elif len(np.shape(element)) == 1:
+        matrix = np.matrix(element, dtype=float)
+        if np.shape(matrix)[1] != 1:
+            matrix = matrix.T
+        return matrix
+
+    return np.matrix(element, dtype=float)
+
+
+def _as_column_matrix(array_like):
+    """ Utility function to convert any array to a column Numpy matrix.
+    """
+    matrix = _as_matrix(array_like)
+    if 1 not in np.shape(matrix):
+        raise ValueError("_as_column_matrix: input must be a vector")
+
+    if np.shape(matrix)[0] == 1:
+        matrix = matrix.T
+
+    return matrix
+
+
+def _as_line_matrix(array_like):
+    """ Utility function to convert any array to a line Numpy matrix.
+    """
+    matrix = _as_matrix(array_like)
+    if 1 not in np.shape(matrix):
+        raise ValueError("_as_line_matrix: input must be a vector")
+
+    if np.shape(matrix)[1] == 1:
+        matrix = matrix.T
+
+    return matrix
+
+
+class ConvexProgram(object):
+    """
+    Encapsulates a quadratic program of the following form:
+
+    minimize    (1/2)*x'*P*x + q'*x
+    subject to  G*x <= h
+                A*x = b.
+ + + or a linear program of the following form: + + minimize c'*x + subject to G*x <= h + A*x = b + """ + def __init__(self): + self._quadratic_func = None + self._linear_func = None + self._inequality_constraints_matrix = None + self._inequality_constraints_values = None + self._equality_constraints_matrix = None + self._equality_constraints_values = None + self._lower_bound_values = None + self._upper_bound_values = None + self._n_variables = None + + @property + def n_variables(self): + return self._n_variables + + @property + def quadratic_func(self): + return self._quadratic_func + + @quadratic_func.setter + def quadratic_func(self, quad_matrix): + quad_matrix = _as_matrix(quad_matrix) + n_lines, n_columns = np.shape(quad_matrix) + assert(n_lines == n_columns) + + if self._linear_func is not None: + assert(np.shape(quad_matrix)[0] == self._n_variables) + else: + self._n_variables = n_lines + + self._quadratic_func = quad_matrix + + @property + def linear_func(self): + return self._linear_func + + @linear_func.setter + def linear_func(self, lin_vector): + if lin_vector is not None: + lin_vector = _as_column_matrix(lin_vector) + + if self._quadratic_func is not None: + assert(np.shape(lin_vector)[0] == self._n_variables) + + else: + self._n_variables = np.shape(lin_vector)[0] + + self._linear_func = lin_vector + + def add_inequality_constraints(self, inequality_matrix, inequality_values): + if inequality_matrix is None: + logging.info("Empty inequality constraint: ignoring!") + return + + self._assert_objective_function_is_set() + + if 1 in np.shape(inequality_matrix) or len(np.shape(inequality_matrix)) == 1: + inequality_matrix = _as_line_matrix(inequality_matrix) + else: + inequality_matrix = _as_matrix(inequality_matrix) + + inequality_values = _as_column_matrix(inequality_values) + assert np.shape(inequality_matrix)[1] == self._n_variables + assert np.shape(inequality_values)[1] == 1 + + if self._inequality_constraints_matrix is None: + self._inequality_constraints_matrix = inequality_matrix + else: + self._inequality_constraints_matrix = np.append(self._inequality_constraints_matrix, + inequality_matrix, axis=0) + + if self._inequality_constraints_values is None: + self._inequality_constraints_values = inequality_values + else: + self._inequality_constraints_values = np.append(self._inequality_constraints_values, + inequality_values, axis=0) + + def add_equality_constraints(self, equality_matrix, equality_values): + if equality_matrix is None: + logging.info("Empty equality constraint: ignoring!") + return + + self._assert_objective_function_is_set() + + if 1 in np.shape(equality_matrix) or len(np.shape(equality_matrix)) == 1: + equality_matrix = _as_line_matrix(equality_matrix) + else: + equality_matrix = _as_matrix(equality_matrix) + + equality_values = _as_matrix(equality_values) + assert np.shape(equality_matrix)[1] == self._n_variables + assert np.shape(equality_values)[1] == 1 + + if self._equality_constraints_matrix is None: + self._equality_constraints_matrix = equality_matrix + else: + self._equality_constraints_matrix = np.append(self._equality_constraints_matrix, + equality_matrix, axis=0) + + if self._equality_constraints_values is None: + self._equality_constraints_values = equality_values + else: + self._equality_constraints_values = np.append(self._equality_constraints_values, + equality_values, axis=0) + + def add_lower_bound(self, lower_bound): + if lower_bound is not None: + self._assert_objective_function_is_set() + self._lower_bound_values = 
np.array([lower_bound] * self._n_variables) + + def add_upper_bound(self, upper_bound): + if upper_bound is not None: + self._assert_objective_function_is_set() + self._upper_bound_values = np.array([upper_bound] * self._n_variables) + + def _convert_bounds_to_inequality_constraints(self): + self._assert_objective_function_is_set() + + if self._lower_bound_values is not None: + c_matrix = [] + for i in range(self._n_variables): + c_line = [0] * self._n_variables + c_line[i] = -1.0 + c_matrix.append(c_line) + + c_vector = _as_column_matrix(self._lower_bound_values) + self._lower_bound_values = None + self.add_inequality_constraints(np.matrix(c_matrix).T, c_vector) + + if self._upper_bound_values is not None: + c_matrix = [] + for i in range(self._n_variables): + c_line = [0] * self._n_variables + c_line[i] = 1.0 + c_matrix.append(c_line) + + c_vector = _as_column_matrix(self._upper_bound_values) + self._upper_bound_values = None + self.add_inequality_constraints(np.matrix(c_matrix).T, c_vector) + + def _convert_to_cvxopt_matrices(self): + from cvxopt import matrix as cvxopt_matrix + + if self._quadratic_func is not None: + self._quadratic_func = cvxopt_matrix(self._quadratic_func) + + if self._linear_func is not None: + self._linear_func = cvxopt_matrix(self._linear_func) + else: + # CVXOPT needs this vector to be set even if it is not used, so we put zeros in it! + self._linear_func = cvxopt_matrix(np.zeros((self._n_variables, 1))) + + if self._inequality_constraints_matrix is not None: + self._inequality_constraints_matrix = cvxopt_matrix(self._inequality_constraints_matrix) + + if self._inequality_constraints_values is not None: + self._inequality_constraints_values = cvxopt_matrix(self._inequality_constraints_values) + + if self._equality_constraints_matrix is not None: + self._equality_constraints_matrix = cvxopt_matrix(self._equality_constraints_matrix) + + if self._equality_constraints_values is not None: + self._equality_constraints_values = cvxopt_matrix(self._equality_constraints_values) + + def _assert_objective_function_is_set(self): + assert self._n_variables is not None + + def solve(self, solver="cvxopt", feastol=1e-7, abstol=1e-7, reltol=1e-6, return_all_information=False): + + # Some solvers are very verbose, and we don't want them to pollute STDOUT or STDERR. 
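+        # Note on tolerances: feastol/abstol/reltol are forwarded to
+        # cvxopt.solvers.options below; _restricted_master_problem tightens them
+        # to 1e-10, presumably to get dual variables reliable enough for the
+        # nu update.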
+ original_stdout = sys.stdout + original_stderr = sys.stderr + + ret = None + + # TODO: Repair + # if solver == "cvxopt": + # stdout_logger = logging.getLogger('CVXOPT') + # sl = StreamToLogger(stdout_logger, logging.DEBUG) + # sys.stdout = sl + + # stderr_logger = logging.getLogger('CVXOPT') + # sl = StreamToLogger(stderr_logger, logging.WARNING) + # sys.stderr = sl + + try: + if solver == "cvxopt": + from cvxopt.solvers import qp, lp, options + options['feastol'] = feastol + options['abstol'] = abstol + options['reltol'] = reltol + options['show_progress'] = False + + self._convert_bounds_to_inequality_constraints() + self._convert_to_cvxopt_matrices() + + if self._quadratic_func is not None: + ret = qp(self.quadratic_func, self.linear_func, self._inequality_constraints_matrix, + self._inequality_constraints_values, self._equality_constraints_matrix, + self._equality_constraints_values) + + else: + ret = lp(self.linear_func, + G=self._inequality_constraints_matrix, + h=self._inequality_constraints_values, + A=self._equality_constraints_matrix, + b=self._equality_constraints_values) + + #logging.info("Primal objective value = {}".format(ret['primal objective'])) + #logging.info("Dual objective value = {}".format(ret['dual objective'])) + + if not return_all_information: + ret = np.asarray(np.array(ret['x']).T[0]) + + elif solver == "cplex": + import cplex + p = cplex.Cplex() + p.objective.set_sense(p.objective.sense.minimize) + + # This is ugly. CPLEX wants a list of lists of lists. First dimension represents the lines of the QP + # matrix. Second dimension contains a pair of two elements: the indices of the variables in play (all of + # them...), and the values (columns of the QP matrix). + names = [str(x) for x in range(self._n_variables)] + p.variables.add(names=names) + + if self.quadratic_func is not None: + p_matrix = [] + for line in self._quadratic_func: + p_matrix.append([names, line.tolist()[0]]) + + p.objective.set_quadratic(p_matrix) + + if self.linear_func is not None: + p.objective.set_linear(zip(names, + np.asarray(self.linear_func.T).reshape(self.n_variables,).tolist())) + + if self._inequality_constraints_matrix is not None: + inequality_linear = [] + for line in self._inequality_constraints_matrix: + inequality_linear.append([names, line.tolist()[0]]) + p.linear_constraints.add(lin_expr=inequality_linear, + rhs=np.asarray(self._inequality_constraints_values.T).tolist()[0], + senses="L"*len(self._inequality_constraints_values)) + + if self._equality_constraints_matrix is not None: + equality_linear = [] + for line in self._equality_constraints_matrix: + equality_linear.append([names, line.tolist()[0]]) + p.linear_constraints.add(lin_expr=equality_linear, + rhs=np.asarray(self._equality_constraints_values.T).tolist()[0], + senses="E"*len(self._equality_constraints_values)) + + if self._lower_bound_values is not None: + p.variables.set_lower_bounds(zip(names, self._lower_bound_values)) + + if self._upper_bound_values is not None: + p.variables.set_upper_bounds(zip(names, self._upper_bound_values)) + + p.solve() + + logging.info("Solution status = {} : {}".format(p.solution.get_status(), + p.solution.status[p.solution.get_status()])) + logging.info("Solution value = {}".format(p.solution.get_objective_value())) + + if not return_all_information: + ret = np.array(p.solution.get_values()) + else: + ret = {'primal': np.array(p.solution.get_values()), + 'dual': np.array(p.solution.get_dual_values())} + + elif solver == "pycpx": + # This shows how easy it is to use pycpx. 
However, it is much slower (as it is more versatile!). + + import pycpx + model = pycpx.CPlexModel(verbosity=2) + q = model.new(self.n_variables) + + if self._inequality_constraints_matrix is not None: + model.constrain(self._inequality_constraints_matrix * q <= self._inequality_constraints_values) + if self._equality_constraints_matrix is not None: + model.constrain(self._equality_constraints_matrix * q == self._equality_constraints_values) + if self._lower_bound_values is not None: + model.constrain(q >= self._lower_bound_values) + if self._upper_bound_values is not None: + model.constrain(q <= self._upper_bound_values) + + value = model.minimize(0.5 * q.T * self._quadratic_func * q + self.linear_func.T * q) + + logging.info("Solution value = {}".format(value)) + + if not return_all_information: + ret = np.array(model[q]) + else: + ret = model + + except: + raise + + finally: + sys.stdout = original_stdout + sys.stderr = original_stderr + + return ret + + + + + + +class DecisionStumpClassifier(BaseEstimator, ClassifierMixin): + """Generic Attribute Threshold Binary Classifier + + Attributes + ---------- + attribute_index : int + The attribute to consider for the classification. + threshold : float + The threshold value for classification rule. + direction : int, optional + A multiplicative constant (1 or -1) to choose the "direction" of the stump. Defaults to 1. If -1, the stump + will predict the "negative" class (generally -1 or 0), and if 1, the stump will predict the second class (generally 1). + + """ + def __init__(self, attribute_index, threshold, direction=1): + super(DecisionStumpClassifier, self).__init__() + self.attribute_index = attribute_index + self.threshold = threshold + self.direction = direction + + def fit(self, X, y): + # Only verify that we are in the binary classification setting, with support for transductive learning. + if isinstance(y, np.ma.MaskedArray): + self.classes_ = np.unique(y[np.logical_not(y.mask)]) + else: + self.classes_ = np.unique(y) + + # This label encoder is there for the predict function to be able to return any two classes that were used + # when fitting, for example {-1, 1} or {0, 1}. + self.le_ = LabelEncoder() + self.le_.fit(self.classes_) + self.classes_ = self.le_.classes_ + + assert len(self.classes_) == 2, "DecisionStumpsVoter only supports binary classification" + return self + + def predict(self, X): + """Returns the output of the classifier, on a sample X. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Training vectors, where n_samples is the number of samples and + n_features is the number of features. + + Returns + ------- + predictions : array-like, shape = [n_samples] + Predicted class labels. + + """ + check_is_fitted(self, 'classes_') + return self.le_.inverse_transform(np.argmax(self.predict_proba(X), axis=1)) + + def predict_proba(self, X): + """Compute probabilities of possible outcomes for samples in X. + + Parameters + ---------- + X : array-like, shape = [n_samples, n_features] + Training vectors, where n_samples is the number of samples and + n_features is the number of features. + + Returns + ------- + avg : array-like, shape = [n_samples, n_classes] + Weighted average probability for each class per sample. 
+ + """ + check_is_fitted(self, 'classes_') + X = np.asarray(X) + probas = np.zeros((X.shape[0], 2)) + positive_class = np.argwhere(X[:, self.attribute_index] > self.threshold) + negative_class = np.setdiff1d(range(X.shape[0]), positive_class) + probas[positive_class, 1] = 1.0 + probas[negative_class, 0] = 1.0 + + if self.direction == -1: + probas = 1 - probas + + return probas + + def reverse_decision(self): + self.direction *= -1 + + +class ClassifiersGenerator(BaseEstimator, TransformerMixin): + """Base class to create a set of voters using training samples, and then transform a set of examples in + the voters' output space. + + Attributes + ---------- + self_complemented : bool, optional + Whether or not a binary complement voter must be generated for each voter. Defaults to False. + voters : ndarray of voter functions + Once fit, contains the voter functions. + + """ + def __init__(self, self_complemented=False): + super(ClassifiersGenerator, self).__init__() + self.self_complemented = self_complemented + + def fit(self, X, y=None): + """Generates the voters using training samples. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Input data on which to base the voters. + y : ndarray of shape (n_labeled_samples,), optional + Input labels, usually determines the decision polarity of each voter. + + Returns + ------- + self + + """ + raise NotImplementedError + + def transform(self, X): + """Transforms the input points in a matrix of classification, using previously learned voters. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Input data to classify. + + Returns + ------- + ndarray of shape (n_samples, n_voters) + The voters' decision on each example. + + """ + check_is_fitted(self, 'estimators_') + return np.array([voter.predict(X) for voter in self.estimators_]).T + +class StumpsClassifiersGenerator(ClassifiersGenerator): + """Decision Stump Voters transformer. + + Parameters + ---------- + n_stumps_per_attribute : int, optional + Determines how many decision stumps will be created for each attribute. Defaults to 10. + No stumps will be created for attributes with only one possible value. + self_complemented : bool, optional + Whether or not a binary complement voter must be generated for each voter. Defaults to False. + + """ + def __init__(self, n_stumps_per_attribute=10, self_complemented=False): + super(StumpsClassifiersGenerator, self).__init__(self_complemented) + self.n_stumps_per_attribute = n_stumps_per_attribute + + def fit(self, X, y): + """Fits Decision Stump voters on a training set. + + Parameters + ---------- + X : ndarray of shape (n_samples, n_features) + Input data on which to base the voters. + y : ndarray of shape (n_labeled_samples,), optional + Only used to ensure that we are in the binary classification setting. 
+ + Returns + ------- + self + + """ + minimums = np.min(X, axis=0) + maximums = np.max(X, axis=0) + ranges = (maximums - minimums) / (self.n_stumps_per_attribute + 1) + + self.estimators_ = [DecisionStumpClassifier(i, minimums[i] + ranges[i] * stump_number, 1).fit(X, y) + for i in range(X.shape[1]) for stump_number in range(1, self.n_stumps_per_attribute + 1) + if ranges[i] != 0] + + if self.self_complemented: + self.estimators_ += [DecisionStumpClassifier(i, minimums[i] + ranges[i] * stump_number, -1).fit(X, y) + for i in range(X.shape[1]) for stump_number in range(1, self.n_stumps_per_attribute + 1) + if ranges[i] != 0] + + self.estimators_ = np.asarray(self.estimators_) + return self + +def sign(array): + """Computes the elementwise sign of all elements of an array. The sign function returns -1 if x <=0 and 1 if x > 0. + Note that numpy's sign function can return 0, which is not desirable in most cases in Machine Learning algorithms. + + Parameters + ---------- + array : array-like + Input values. + + Returns + ------- + ndarray + An array with the signs of input elements. + + """ + signs = np.sign(array) + + signs[array == 0] = -1 + return signs + + +def zero_one_loss(y_target, y_estimate, confidences=1): + if len(y_target) == 0: + return 0.0 + return np.mean(y_target != y_estimate) + + +def zero_one_loss_per_example(y_target, y_estimate, confidences=1): + if len(y_target) == 0: + return 0.0 + return (y_target != y_estimate).astype(np.int) + + +class ResultsDataFrame(pd.DataFrame): + """A ResultsDataFrame is a DataFrame with the following information: + + - A 'dataset' column that contains the dataset name + - Hyperparamer columns, named 'hp__HPNAME', where HPNAME is the name of the hyperparameter + - Columns containing informations about that depend on the dataset and hyperparameters, for example the risk. + + """ + @property + def datasets_list(self): + """Returns the sorted list of datasets. + + """ + return sorted(set(self['dataset'])) + + @property + def hyperparameters_list(self): + """Returns a sorted list of hyperparameter names, without the 'hp__' prefix. + + """ + return sorted(column.split('hp__')[1] for column in self.columns if column.startswith('hp__')) + + @property + def hyperparameters_list_with_prefix(self): + return sorted(column for column in self.columns if column.startswith('hp__')) + + @property + def metrics_list(self): + return sorted(column for column in self.columns if not column.startswith('hp__') and column != 'dataset') + + @property + def hyperparameters_with_values(self): + """Returns a dictionary that contains the hyperparameter names (without the 'hp__' prefix), and + associated values that are present in the DataFrame. + + """ + hyperparameters = [column for column in self.columns if column.startswith('hp__')] + + hyperparameters_dict = {} + tmp_dict = self[hyperparameters].to_dict() + + for key, value in iteritems(tmp_dict): + hyperparameters_dict[key.split('hp__')[1]] = list(value.values())[0] if len(value) == 1 else sorted(set(value.values())) + + return hyperparameters_dict + + @property + def hyperparameters_with_values_per_dataset(self): + """Returns a dictionary of dictionaries that contains for each dataset, the hyperparameter names (without the + 'hp__' prefix), and associated values that are present in the DataFrame. 
+
+
+def zero_one_loss(y_target, y_estimate, confidences=1):
+    if len(y_target) == 0:
+        return 0.0
+    return np.mean(y_target != y_estimate)
+
+
+def zero_one_loss_per_example(y_target, y_estimate, confidences=1):
+    if len(y_target) == 0:
+        return 0.0
+    return (y_target != y_estimate).astype(int)
+
+
+class ResultsDataFrame(pd.DataFrame):
+    """A ResultsDataFrame is a DataFrame with the following information:
+
+    - A 'dataset' column that contains the dataset name
+    - Hyperparameter columns, named 'hp__HPNAME', where HPNAME is the name of the hyperparameter
+    - Columns containing information that depends on the dataset and the hyperparameters, for example the risk.
+
+    """
+    @property
+    def datasets_list(self):
+        """Returns the sorted list of datasets.
+
+        """
+        return sorted(set(self['dataset']))
+
+    @property
+    def hyperparameters_list(self):
+        """Returns a sorted list of hyperparameter names, without the 'hp__' prefix.
+
+        """
+        return sorted(column.split('hp__')[1] for column in self.columns if column.startswith('hp__'))
+
+    @property
+    def hyperparameters_list_with_prefix(self):
+        """Returns a sorted list of hyperparameter column names, with the 'hp__' prefix.
+
+        """
+        return sorted(column for column in self.columns if column.startswith('hp__'))
+
+    @property
+    def metrics_list(self):
+        """Returns a sorted list of the metric columns (every column except 'dataset' and the hyperparameters).
+
+        """
+        return sorted(column for column in self.columns if not column.startswith('hp__') and column != 'dataset')
+
+    @property
+    def hyperparameters_with_values(self):
+        """Returns a dictionary that contains the hyperparameter names (without the 'hp__' prefix), and the
+        associated values that are present in the DataFrame.
+
+        """
+        hyperparameters = [column for column in self.columns if column.startswith('hp__')]
+
+        hyperparameters_dict = {}
+        tmp_dict = self[hyperparameters].to_dict()
+
+        for key, value in iteritems(tmp_dict):
+            hyperparameters_dict[key.split('hp__')[1]] = list(value.values())[0] if len(value) == 1 else sorted(set(value.values()))
+
+        return hyperparameters_dict
+
+    @property
+    def hyperparameters_with_values_per_dataset(self):
+        """Returns a dictionary of dictionaries that contains, for each dataset, the hyperparameter names (without
+        the 'hp__' prefix) and the associated values that are present in the DataFrame.
+
+        """
+        hyperparameters = [column for column in self.columns if column.startswith('hp__')]
+
+        hyperparameters_dict = {}
+        for dataset in self.datasets_list:
+            tmp_dict = self[self['dataset'] == dataset][hyperparameters].to_dict()
+            hyperparameters_dict[dataset] = {}
+
+            for key, value in iteritems(tmp_dict):
+                hyperparameters_dict[dataset][key.split('hp__')[1]] = list(value.values())[0] if len(value) == 1 else sorted(set(value.values()))
+
+        return hyperparameters_dict
+
+    def results_optimizing_metric(self, metric_to_optimize='cv_mean__valid__zero_one_loss', minimize=True, tie_breaking_functions_ordered_dict=None):
+        function = min if minimize else max
+
+        # We extract all the rows that have the best value for the metric to optimize.
+        optimal_results = self[self.groupby('dataset', sort=False)[metric_to_optimize].transform(function) == self[metric_to_optimize]]
+
+        # We break the ties by applying the tie-breaking functions (in the order of the dictionary). If hyperparameters
+        # are missing, we simply use the median for each hyperparameter, in a fixed (reproducible) order.
+        if tie_breaking_functions_ordered_dict is None:
+            tie_breaking_functions_ordered_dict = OrderedDict()
+        else:
+            # Avoids side effects and ensures that the dictionary is an OrderedDict before we add missing hyperparameters.
+            tie_breaking_functions_ordered_dict = OrderedDict(tie_breaking_functions_ordered_dict.copy())
+
+        for hyperparameter in sorted(self.hyperparameters_list):
+            if hyperparameter not in tie_breaking_functions_ordered_dict.keys():
+                tie_breaking_functions_ordered_dict[hyperparameter] = np.median
+
+        for hyperparameter, tie_breaking_function in iteritems(tie_breaking_functions_ordered_dict):
+            optimal_results = optimal_results[optimal_results.groupby('dataset')['hp__' + hyperparameter].transform(partial(get_optimal_value_in_list, tie_breaking_function)) == optimal_results['hp__' + hyperparameter]]
+
+        return ResultsDataFrame(optimal_results)
+
+    def get_dataframe_with_metrics_as_one_column(self, metrics_to_keep=None):
+        new_dataframe = ResultsDataFrame()
+
+        if metrics_to_keep is None:
+            metrics_to_keep = self.metrics_list
+
+        for metric in metrics_to_keep:
+            columns = self.hyperparameters_list_with_prefix + [metric]
+            if 'dataset' in self:
+                columns.append('dataset')
+
+            tmp = self.loc[:, columns]
+            tmp.columns = [c if c != metric else 'value' for c in tmp.columns]
+            tmp.loc[:, 'metric'] = metric
+            new_dataframe = new_dataframe.append(tmp, ignore_index=True)
+
+        return new_dataframe
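+
+# Usage sketch (illustrative only; the toy values below are made up): a ResultsDataFrame
+# combines a 'dataset' column, 'hp__'-prefixed hyperparameter columns and metric columns.
+# results_optimizing_metric() keeps, for each dataset, the row(s) reaching the best value
+# of the chosen metric, then breaks ties hyperparameter by hyperparameter (median by default).
+#
+#     df = ResultsDataFrame({'dataset': ['a', 'a'], 'hp__mu': [0.001, 0.01],
+#                            'cv_mean__valid__zero_one_loss': [0.2, 0.2]})
+#     best = df.results_optimizing_metric()  # tie on the loss: keeps hp__mu closest to the median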
+
+
+def get_optimal_value_in_list(optimum_function, values_list):
+    """Given a list of values and a function that computes an optimum over them, returns the value from the
+    list that is the closest to the optimum given by optimum_function applied to the same list.
+
+    >>> get_optimal_value_in_list(np.median, [2, 4, 5, 6])
+    4
+
+    """
+    values_list = sorted(list(values_list))
+    return values_list[np.argmin(np.array([scipy.spatial.distance.euclidean(value, optimum_function(values_list)) for value in values_list]))]
diff --git a/multiview_platform/MonoMultiViewClassifiers/utils/execution.py b/multiview_platform/MonoMultiViewClassifiers/utils/execution.py
index 6d6bfdd80b419a45748e226c93b7c9ee6a2023d5..811758e53032155bf8737ff3aa72cda64376d58b 100644
--- a/multiview_platform/MonoMultiViewClassifiers/utils/execution.py
+++ b/multiview_platform/MonoMultiViewClassifiers/utils/execution.py
@@ -146,6 +146,24 @@ def parseTheArgs(arguments):
     groupSCM.add_argument('--SCM_model_type', metavar='STRING', action='store',
                           help='Max number of rules for SCM', default="conjunction")
 
+    groupCQBoost = parser.add_argument_group('CQBoost arguments')
+    groupCQBoost.add_argument('--CQB_mu', metavar='FLOAT', type=float, action='store',
+                              help='Set the mu parameter for CQBoost', default=0.001)
+    groupCQBoost.add_argument('--CQB_epsilon', metavar='FLOAT', type=float, action='store',
+                              help='Set the epsilon parameter for CQBoost', default=1e-08)
+
+    groupCQBoostv2 = parser.add_argument_group('CQBoostv2 arguments')
+    groupCQBoostv2.add_argument('--CQB2_mu', metavar='FLOAT', type=float, action='store',
+                                help='Set the mu parameter for CQBoostv2', default=0.001)
+    groupCQBoostv2.add_argument('--CQB2_epsilon', metavar='FLOAT', type=float, action='store',
+                                help='Set the epsilon parameter for CQBoostv2', default=1e-08)
+
+    groupCQBoostv21 = parser.add_argument_group('CQBoostv21 arguments')
+    groupCQBoostv21.add_argument('--CQB21_mu', metavar='FLOAT', type=float, action='store',
+                                 help='Set the mu parameter for CQBoostv21', default=0.001)
+    groupCQBoostv21.add_argument('--CQB21_epsilon', metavar='FLOAT', type=float, action='store',
+                                 help='Set the epsilon parameter for CQBoostv21', default=1e-08)
+
     groupMumbo = parser.add_argument_group('Mumbo arguments')
     groupMumbo.add_argument('--MU_types', metavar='STRING', action='store', nargs="+",
                             help='Determine which monoview classifier to use with Mumbo',