From 52c842674b3e23864d8895d6909083b69d86f3e9 Mon Sep 17 00:00:00 2001 From: Baptiste Bauvin <baptiste.bauvin@lis-lab.fr> Date: Mon, 1 Apr 2019 23:24:51 -0400 Subject: [PATCH] Added too much stuff to list --- .../Monoview/Additions/BoostUtils.py | 25 ++ .../{QarBoostUtils.py => CGDescUtils.py} | 7 +- .../Monoview/Additions/CQBoostUtils.py | 7 +- .../Monoview/Additions/MinCQUtils.py | 293 ++++++++++++++++ .../Monoview/Additions/PregenUtils.py | 9 +- .../Monoview/ExecClassifMonoView.py | 3 + .../MonoviewClassifiers/AdaboostPregen.py | 3 +- .../MonoviewClassifiers/AdaboostPregen10.py | 34 ++ .../MonoviewClassifiers/AdaboostPregenTree.py | 98 ++++++ .../MonoviewClassifiers/CGDesc.py | 5 +- .../MonoviewClassifiers/CGDesc10.py | 25 ++ .../MonoviewClassifiers/CGDescTree.py | 51 +++ .../MonoviewClassifiers/CGreed.py | 5 +- .../MonoviewClassifiers/CQBoost.py | 11 +- .../MonoviewClassifiers/CQBoostTree.py | 65 ++++ .../MonoviewClassifiers/MinCQ.py | 2 +- .../MonoviewClassifiers/MinCQGraalpy.py | 325 +----------------- .../MonoviewClassifiers/MinCQGraalpyTree.py | 71 ++++ .../MonoviewClassifiers/QarBoost.py | 2 +- .../MonoviewClassifiers/QarBoostNC3.py | 2 +- .../MonoviewClassifiers/QarBoostv2.py | 2 +- .../MonoviewClassifiers/QarBoostv3.py | 2 +- .../MonoviewClassifiers/SCMPregen.py | 46 +-- .../MonoviewClassifiers/SCMPregenTree.py | 83 +++++ .../MonoviewClassifiers/SCMSparsity.py | 61 ++-- .../MonoviewClassifiers/SCMSparsityTree.py | 92 +++++ .../utils/GetMultiviewDb.py | 27 ++ .../utils/execution.py | 120 +++++++ 28 files changed, 1054 insertions(+), 422 deletions(-) rename multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/{QarBoostUtils.py => CGDescUtils.py} (98%) create mode 100644 multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/MinCQUtils.py create mode 100644 multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregen10.py create mode 100644 multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregenTree.py create mode 100644 multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CGDesc10.py create mode 100644 multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CGDescTree.py create mode 100644 multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CQBoostTree.py create mode 100644 multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/MinCQGraalpyTree.py create mode 100644 multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/SCMPregenTree.py create mode 100644 multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/SCMSparsityTree.py diff --git a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/BoostUtils.py b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/BoostUtils.py index 9dc557a3..3f32dbaa 100644 --- a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/BoostUtils.py +++ b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/BoostUtils.py @@ -1,6 +1,7 @@ import numpy as np from sklearn.base import BaseEstimator, ClassifierMixin, TransformerMixin from sklearn.preprocessing import LabelEncoder +from sklearn.tree import DecisionTreeClassifier from sklearn.utils.validation import check_is_fitted import sys import matplotlib.pyplot as plt @@ -188,6 +189,30 @@ class ClassifiersGenerator(BaseEstimator, TransformerMixin): # # def fit(self, X, y=None): +class TreeClassifiersGenerator(ClassifiersGenerator): + + def __init__(self, random_state, max_depth=2, self_complemented=True, criterion="gini", splitter="best", n_trees=100, 
distribution_type="uniform", low=0, high=10): + super(TreeClassifiersGenerator, self).__init__(self_complemented) + self.max_depth=max_depth + self.criterion=criterion + self.splitter=splitter + self.n_trees=n_trees + self.random_state=random_state + self.distribution_type = distribution_type + self.low = low + self.high = high + + def fit(self, X, y=None): + estimators_ = [] + self.distributions = np.zeros((self.n_trees, X.shape[0])) + distrib_method = getattr(self.random_state, self.distribution_type) + for i in range(self.n_trees): + self.distributions[i,:] = distrib_method(self.low, self.high, size=X.shape[0]) + estimators_.append(DecisionTreeClassifier(criterion=self.criterion, splitter=self.splitter, max_depth=self.max_depth).fit(X, y, sample_weight=self.distributions[i,:])) + self.estimators_ = np.asarray(estimators_) + return self + + class StumpsClassifiersGenerator(ClassifiersGenerator): """Decision Stump Voters transformer. diff --git a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/QarBoostUtils.py b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/CGDescUtils.py similarity index 98% rename from multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/QarBoostUtils.py rename to multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/CGDescUtils.py index b2fd773d..9aff7ca3 100644 --- a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/QarBoostUtils.py +++ b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/CGDescUtils.py @@ -10,7 +10,7 @@ import time import matplotlib.pyplot as plt from .BoostUtils import StumpsClassifiersGenerator, sign, BaseBoost, \ - getInterpretBase, get_accuracy_graph + getInterpretBase, get_accuracy_graph, TreeClassifiersGenerator from ..MonoviewUtils import change_label_to_zero, change_label_to_minus from ... import Metrics @@ -328,10 +328,13 @@ class ColumnGenerationClassifierQar(BaseEstimator, ClassifierMixin, BaseBoost): def init_hypotheses(self, X, y): """Inintialization for the hyptotheses used to build the boosted vote""" - if self.estimators_generator is None: + if self.estimators_generator is "Stumps": self.estimators_generator = StumpsClassifiersGenerator( n_stumps_per_attribute=self.n_stumps, self_complemented=self.self_complemented) + if self.estimators_generator is "Trees": + self.estimators_generator = TreeClassifiersGenerator(self.random_state, n_trees=self.n_stumps, max_depth=self.max_depth, + self_complemented=self.self_complemented) self.estimators_generator.fit(X, y) self.classification_matrix = self._binary_classification_matrix(X) self.train_shape = self.classification_matrix.shape diff --git a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/CQBoostUtils.py b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/CQBoostUtils.py index 182c2e16..05521371 100644 --- a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/CQBoostUtils.py +++ b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/CQBoostUtils.py @@ -9,7 +9,7 @@ import numpy as np import time import math -from .BoostUtils import StumpsClassifiersGenerator, ConvexProgram, sign, BaseBoost +from .BoostUtils import StumpsClassifiersGenerator, ConvexProgram, sign, BaseBoost, TreeClassifiersGenerator from ... 
 from ... import Metrics
 
@@ -23,6 +23,7 @@ class ColumnGenerationClassifier(BaseEstimator, ClassifierMixin, BaseBoost):
         self.mu = mu
         self.train_time = 0
         self.plotted_metric = Metrics.zero_one_loss
+        self.random_state = random_state
 
     def fit(self, X, y):
         start = time.time()
@@ -31,8 +32,10 @@ class ColumnGenerationClassifier(BaseEstimator, ClassifierMixin, BaseBoost):
 
         y[y == 0] = -1
 
-        if self.estimators_generator is None:
+        if self.estimators_generator == "Stumps":
             self.estimators_generator = StumpsClassifiersGenerator(n_stumps_per_attribute=self.n_stumps, self_complemented=True)
+        elif self.estimators_generator == "Trees":
+            self.estimators_generator = TreeClassifiersGenerator(self.random_state, max_depth=self.max_depth, n_trees=self.n_stumps, self_complemented=True)
 
         self.estimators_generator.fit(X, y)
         self.classification_matrix = self._binary_classification_matrix(X)
diff --git a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/MinCQUtils.py b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/MinCQUtils.py
new file mode 100644
index 00000000..afa55be5
--- /dev/null
+++ b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/MinCQUtils.py
@@ -0,0 +1,293 @@
+# -*- coding: utf-8 -*-
+"""MinCq algorithm.
+
+Related papers:
+[1] From PAC-Bayes Bounds to Quadratic Programs for Majority Votes (Laviolette et al., 2011)
+[2] Risk Bounds for the Majority Vote: From a PAC-Bayesian Analysis to a Learning Algorithm (Germain et al., 2015)
+
+"""
+from __future__ import print_function, division, absolute_import
+from operator import xor
+
+import numpy as np
+from sklearn.utils.validation import check_X_y
+from sklearn.ensemble import VotingClassifier
+from sklearn.manifold import SpectralEmbedding
+from sklearn.utils.graph import graph_laplacian
+from sklearn.preprocessing import LabelEncoder
+
+
+from .BoostUtils import ConvexProgram, StumpsClassifiersGenerator
+from ..MonoviewUtils import BaseMonoviewClassifier, CustomUniform, change_label_to_zero, change_label_to_minus
+
+
+class MinCqClassifier(VotingClassifier):
+    """
+    Base MinCq algorithm learner. See [1, 2].
+    This version is an attempt at creating a more general version of MinCq that handles multiclass classification.
+    For binary classification, use RegularizedBinaryMinCqClassifier.
+
+    Parameters
+    ----------
+    mu : float
+        The fixed value of the first moment of the margin.
+
+    """
+    def __init__(self, estimators_generator=None, estimators=None, mu=0.001, omega=0.5, use_binary=False, zeta=0, gamma=1, n_neighbors=5):
+        if estimators is None:
+            estimators = []
+
+        super().__init__(estimators=estimators, voting='soft', flatten_transform=False)
+        self.estimators_generator = estimators_generator
+        self.mu = mu
+        self.omega = omega
+        self.use_binary = use_binary
+        self.zeta = zeta
+        self.gamma = gamma
+        self.n_neighbors = n_neighbors
+
+    def fit(self, X, y):
+        """Fit the estimators and learn the weights.
+
+        Parameters
+        ----------
+        X : array-like, shape = [n_samples, n_features]
+            Training vectors, where n_samples is the number of samples and
+            n_features is the number of features.
+        y : array-like, shape = [n_samples]
+            Target values. If y is a masked-array (numpy.ma), the masked values are unlabeled examples.
+
+        Returns
+        -------
+        self : object
+
+        """
+        # Validations
+        assert 0 < self.mu <= 1, "MinCqClassifier: mu parameter must be in (0, 1]"
+        assert xor(bool(self.estimators_generator), bool(self.estimators)), "MinCqClassifier: exactly one of estimators_generator or estimators must be used."
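+        # MinCq works internally on {-1, 1} labels, so the labels are remapped before input validation.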
+        X, y = check_X_y(X, change_label_to_minus(y))
+
+        # Fit the estimators using VotingClassifier's fit method. This will also fit a LabelEncoder that can be
+        # used to "normalize" labels (0, 1, 2, ...). In the case of binary classification, the two classes will be 0 and 1.
+        # First, ensure that the weights are reset to None (as cloning a VotingClassifier keeps the weights);
+        # clean_me tracks whether temporary voters must be discarded after solving.
+        self.weights, self.clean_me = None, False
+        # TODO: Ensure estimators can deal with masked arrays
+
+        # If we use an estimator generator, use the data-dependent estimator generator to generate them, and fit again.
+        if self.estimators:
+            super().fit(X, y)
+
+        else:
+            self.le_ = LabelEncoder()
+            self.le_.fit(y)
+            self.clean_me = True
+
+            if isinstance(y, np.ma.MaskedArray):
+                transformed_y = np.ma.MaskedArray(self.le_.transform(y), y.mask)
+            else:
+                # transformed_y = self.le_.transform(y)
+                transformed_y = y
+
+            self.estimators_generator.fit(X, transformed_y)
+            self.estimators = [('ds{}'.format(i), estimator) for i, estimator in enumerate(self.estimators_generator.estimators_)]
+            super().fit(X, y)
+
+        # Preparation and resolution of the quadratic program
+        # logger.info("Preparing and solving QP...")
+        self.weights = self._solve(X, y)
+        if self.clean_me:
+            self.estimators = []
+        # print(self.weights.shape)
+        # print(np.unique(self.weights)[0:10])
+        # import pdb;pdb.set_trace()
+        self.train_cbound = 1 - (1.0/X.shape[0])*(np.sum(np.multiply(change_label_to_minus(y), np.average(self._binary_classification_matrix(X), axis=1, weights=self.weights)))**2)/(np.sum(np.average(self._binary_classification_matrix(X), axis=1, weights=self.weights)**2))
+        return self
+
+    def _binary_classification_matrix(self, X):
+        probas = self.transform(X)
+        predicted_labels = np.argmax(probas, axis=2)
+        predicted_labels[predicted_labels == 0] = -1
+        values = np.max(probas, axis=2)
+        return (predicted_labels * values).T
+
+    def _multiclass_classification_matrix(self, X, y):
+        probas = self.transform(X).swapaxes(0, 1)
+        matrix = probas[np.arange(probas.shape[0]), :, y]
+
+        return (matrix - self.omega)
+
+    def predict(self, X):
+        if not self.estimators:
+            self.estimators = [('ds{}'.format(i), estimator) for i, estimator in
+                               enumerate(self.estimators_generator.estimators_)]
+            self.clean_me = True
+        pred = super().predict(X)
+        if self.clean_me:
+            self.estimators = []
+        return change_label_to_zero(pred)
+
+    def _solve(self, X, y):
+        y = self.le_.transform(y)
+
+        if self.use_binary:
+            assert len(self.le_.classes_) == 2
+
+            # TODO: Review the number of labeled examples when adding back the support for transductive learning.
+            classification_matrix = self._binary_classification_matrix(X)
+
+            # We use {-1, 1} labels.
+            binary_labels = np.copy(y)
+            binary_labels[y == 0] = -1
+
+            multi_matrix = binary_labels.reshape((len(binary_labels), 1)) * classification_matrix
+
+        else:
+            multi_matrix = self._multiclass_classification_matrix(X, y)
+
+        n_examples, n_voters = np.shape(multi_matrix)
+        ftf = 1.0 / n_examples * multi_matrix.T.dot(multi_matrix)
+        yf = np.mean(multi_matrix, axis=0)
+
+        # Objective function.
+        objective_matrix = 2 * ftf
+        objective_vector = None
+
+        # Equality constraints (first moment of the margin equal to mu, Q sums to one)
+        equality_matrix = np.vstack((yf.reshape((1, n_voters)), np.ones((1, n_voters))))
+        equality_vector = np.array([self.mu, 1.0])
+
+        # Lower and upper bounds, no quasi-uniformity.
+        lower_bound = 0.0
+        # TODO: In the case of binary classification, no upper bound will give
Using 1/n works, as it brings back the l_infinity + # regularization normally given by the quasi-uniformity constraint. + # upper_bound = 2.0/n_voters + upper_bound = None + + weights = self._solve_qp(objective_matrix, objective_vector, equality_matrix, equality_vector, lower_bound, upper_bound) + + # Keep learning information for further use. + self.learner_info_ = {} + + # We count the number of non-zero weights, including the implicit voters. + # TODO: Verify how we define non-zero weights here, could be if the weight is near 1/2n. + n_nonzero_weights = np.sum(np.asarray(weights) > 1e-12) + n_nonzero_weights += np.sum(np.asarray(weights) < 1.0 / len(self.estimators_) - 1e-12) + self.learner_info_.update(n_nonzero_weights=n_nonzero_weights) + + return weights + + def _solve_qp(self, objective_matrix, objective_vector, equality_matrix, equality_vector, lower_bound, upper_bound): + try: + qp = ConvexProgram() + qp.quadratic_func, qp.linear_func = objective_matrix, objective_vector + qp.add_equality_constraints(equality_matrix, equality_vector) + qp.add_lower_bound(lower_bound) + qp.add_upper_bound(upper_bound) + return qp.solve() + + except Exception: + # logger.warning("Error while solving the quadratic program.") + raise + + +class RegularizedBinaryMinCqClassifier(MinCqClassifier): + """MinCq, version published in [1] and [2], where the regularization comes from the enforced quasi-uniformity + of the posterior distributino on the symmetric hypothesis space. This version only works with {-1, 1} labels. + + [1] From PAC-Bayes Bounds to Quadratic Programs for Majority Votes (Laviolette et al., 2011) + [2] Risk Bounds for the Majority Vote: From a PAC-Bayesian Analysis to a Learning Algorithm (Germain et al., 2015) + + """ + def fit(self, X, y): + import time + beg = time.time() + # We first fit and learn the weights. + super().fit(X, y) + + # Validations + if isinstance(y, np.ma.MaskedArray): + assert len(self.classes_[np.where(np.logical_not(self.classes_.mask))]) == 2, "RegularizedBinaryMinCqClassifier: only supports binary classification." + else: + assert len(self.classes_), "RegularizedBinaryMinCqClassifier: only supports binary classification." + + # Then we "reverse" the negative weights and their associated voter's output. + for i, weight in enumerate(self.weights): + if weight < 0: + # logger.debug("Reversing decision of a binary voter") + self.weights[i] *= -1 + self.estimators_[i].reverse_decision() + end=time.time() + self.train_time = end-beg + return self + + def _solve(self, X, y): + if isinstance(y, np.ma.MaskedArray): + y = np.ma.MaskedArray(self.le_.transform(y), y.mask) + else: + y = self.le_.transform(y) + + classification_matrix = self._binary_classification_matrix(X) + n_examples, n_voters = np.shape(classification_matrix) + + if self.zeta == 0: + np.transpose(classification_matrix) + ftf = np.dot(np.transpose(classification_matrix),classification_matrix) + else: + I = np.eye(n_examples) + L = build_laplacian(X, n_neighbors=self.n_neighbors) + ftf = classification_matrix.T.dot(I + (self.zeta / n_examples) * L).dot(classification_matrix) + + # We use {-1, 1} labels. + binary_labels = np.ma.copy(y) + binary_labels[np.ma.where(y == 0)] = -1 + + # Objective function. + ftf_mean = np.mean(ftf, axis=1) + objective_matrix = 2.0 / n_examples * ftf + objective_vector = -1.0 / n_examples * ftf_mean.T + + # Equality constraint: first moment of the margin fixed to mu, only using labeled examples. 
+        if isinstance(y, np.ma.MaskedArray):
+            labeled = np.where(np.logical_not(y.mask))[0]
+            binary_labels = binary_labels[labeled]
+        else:
+            labeled = range(len(y))
+
+        yf = binary_labels.T.dot(classification_matrix[labeled])
+        yf_mean = np.mean(yf)
+        equality_matrix = 2.0 / len(labeled) * yf
+        equality_vector = self.mu + 1.0 / len(labeled) * yf_mean
+
+        # Lower and upper bounds (quasi-uniformity constraints)
+        lower_bound = 0.0
+        upper_bound = 1.0 / n_voters
+
+        try:
+            weights = self._solve_qp(objective_matrix, objective_vector, equality_matrix, equality_vector, lower_bound, upper_bound)
+        except ValueError as e:
+            if "domain error" not in e.args:
+                raise
+            weights = np.ones(len(self.estimators_))
+
+        # Keep learning information for further use.
+        self.learner_info_ = {}
+        # print(np.unique(weights))
+
+        # We count the number of non-zero weights, including the implicit voters.
+        # TODO: Verify how we define non-zero weights here, could be if the weight is near 1/2n.
+        n_nonzero_weights = np.sum(np.asarray(weights) > 1e-12)
+        n_nonzero_weights += np.sum(np.asarray(weights) < 1.0 / len(self.estimators_) - 1e-12)
+        self.learner_info_.update(n_nonzero_weights=n_nonzero_weights)
+
+        # Conversion of the weights of the n first voters to weights on the implicit 2n voters.
+        # See Section 7.1 of [2] for an explanation.
+        # return np.array([2 * q - 1.0 / len(self.estimators_) for q in weights])
+        return np.array(weights)
+
+def build_laplacian(X, n_neighbors=None):
+    clf = SpectralEmbedding(n_neighbors=n_neighbors)
+    clf.fit(X)
+    w = clf.affinity_matrix_
+    laplacian = graph_laplacian(w, normed=True)
+    return laplacian
diff --git a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/PregenUtils.py b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/PregenUtils.py
index 196c97ec..7bcf4281 100644
--- a/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/PregenUtils.py
+++ b/multiview_platform/MonoMultiViewClassifiers/Monoview/Additions/PregenUtils.py
@@ -1,16 +1,19 @@
 from ..MonoviewUtils import change_label_to_minus
-from .BoostUtils import StumpsClassifiersGenerator, BaseBoost
+from .BoostUtils import StumpsClassifiersGenerator, BaseBoost, TreeClassifiersGenerator
+import numpy as np
 
 
 class PregenClassifier(BaseBoost):
-    def pregen_voters(self, X, y=None):
+    def pregen_voters(self, X, y=None, generator="Stumps"):
         if y is not None:
             neg_y = change_label_to_minus(y)
-            if self.estimators_generator is None:
+            if generator == "Stumps":
                 self.estimators_generator = StumpsClassifiersGenerator(
                     n_stumps_per_attribute=self.n_stumps,
                     self_complemented=self.self_complemented)
+            elif generator == "Trees":
+                self.estimators_generator = TreeClassifiersGenerator(self.random_state, n_trees=self.n_stumps, max_depth=self.max_depth)
             self.estimators_generator.fit(X, neg_y)
         else:
             neg_y=None
diff --git a/multiview_platform/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py b/multiview_platform/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py
index 8a311c5d..9156bf76 100644
--- a/multiview_platform/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py
+++ b/multiview_platform/MonoMultiViewClassifiers/Monoview/ExecClassifMonoView.py
@@ -17,6 +17,7 @@ from .. import MonoviewClassifiers
 from .analyzeResult import execute
 from ..utils.Dataset import getValue, extractSubset
 from . 
import MonoviewUtils +from ..utils.GetMultiviewDb import TanhNormalizer # Author-Info __author__ = "Nikolas Huelsmann, Baptiste BAUVIN" @@ -74,6 +75,7 @@ def ExecMonoview(directory, X, Y, name, labelsNames, classificationIndices, KFol logging.debug("Start:\t Training") classifier = getattr(classifierModule, CL_type)(randomState, **clKWARGS) + classifier.fit(X_train, y_train) # NB_CORES=nbCores, logging.debug("Done:\t Training") @@ -197,6 +199,7 @@ def saveResults(stringAnalysis, outputFileName, full_labels_pred, y_train_pred, + if __name__ == '__main__': """The goal of this part of the module is to be able to execute a monoview experimentation on a node of a cluster independently. diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregen.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregen.py index 0eaa134d..e378fd05 100644 --- a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregen.py +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregen.py @@ -32,7 +32,7 @@ class AdaboostPregen(AdaBoostClassifier, BaseMonoviewClassifier, PregenClassifie self.plotted_metric = Metrics.zero_one_loss self.plotted_metric_name = "zero_one_loss" self.step_predictions = None - self.estimators_generator = None + self.estimators_generator = "Stumps" self.n_stumps=n_stumps self.self_complemented=self_complemeted @@ -94,7 +94,6 @@ class AdaboostPregen(AdaBoostClassifier, BaseMonoviewClassifier, PregenClassifie # else: # neg_y=None # classification_matrix = self._binary_classification_matrix(X) - return classification_matrix, neg_y def formatCmdArgs(args): """Used to format kwargs for the parsed args""" diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregen10.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregen10.py new file mode 100644 index 00000000..389dc323 --- /dev/null +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregen10.py @@ -0,0 +1,34 @@ +from sklearn.tree import DecisionTreeClassifier +from .AdaboostPregen import AdaboostPregen + +# Author-Info +__author__ = "Baptiste Bauvin" +__status__ = "Prototype" # Production, Development, Prototype + + +class AdaboostPregen10(AdaboostPregen): + + def __init__(self, random_state=None, n_estimators=50, + base_estimator=None, n_stumps=1, self_complemeted=True , **kwargs): + super(AdaboostPregen10, self).__init__( + random_state=random_state, + n_estimators=n_estimators, + base_estimator=base_estimator, + n_stumps=10, + self_complemeted=self_complemeted + ) +def formatCmdArgs(args): + """Used to format kwargs for the parsed args""" + kwargsDict = {'n_estimators': args.AdP_n_est, + 'base_estimator': DecisionTreeClassifier(max_depth=1), + } + return kwargsDict + + +def paramsToSet(nIter, random_state): + """Used for weighted linear early fusion to generate random search sets""" + paramsSet = [] + for _ in range(nIter): + paramsSet.append({"n_estimators": random_state.randint(1, 500), + "base_estimator": None}) + return paramsSet diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregenTree.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregenTree.py new file mode 100644 index 00000000..16378a77 --- /dev/null +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/AdaboostPregenTree.py @@ -0,0 +1,98 @@ +from sklearn.ensemble import AdaBoostClassifier +from sklearn.tree import 
DecisionTreeClassifier
+import numpy as np
+import time
+from sklearn.metrics import accuracy_score
+
+from ..Monoview.MonoviewUtils import CustomRandint, BaseMonoviewClassifier, change_label_to_minus, change_label_to_zero
+from ..Monoview.Additions.BoostUtils import get_accuracy_graph
+from .. import Metrics
+from ..Monoview.Additions.BoostUtils import StumpsClassifiersGenerator, BaseBoost
+from ..Monoview.Additions.PregenUtils import PregenClassifier
+
+# Author-Info
+__author__ = "Baptiste Bauvin"
+__status__ = "Prototype"  # Production, Development, Prototype
+
+
+class AdaboostPregenTree(AdaBoostClassifier, BaseMonoviewClassifier, PregenClassifier):
+
+    def __init__(self, random_state=None, n_estimators=50,
+                 base_estimator=None, n_stumps=1, self_complemeted=True, max_depth=2, **kwargs):
+        super(AdaboostPregenTree, self).__init__(
+            random_state=random_state,
+            n_estimators=n_estimators,
+            base_estimator=base_estimator,
+            algorithm="SAMME"
+        )
+        self.param_names = ["n_estimators", "base_estimator", "n_stumps", "random_state", "max_depth"]
+        self.classed_params = ["base_estimator"]
+        self.distribs = [CustomRandint(low=1, high=500), [DecisionTreeClassifier(max_depth=1)], [n_stumps], [random_state], [max_depth]]
+        self.weird_strings = {"base_estimator": "class_name"}
+        self.plotted_metric = Metrics.zero_one_loss
+        self.plotted_metric_name = "zero_one_loss"
+        self.step_predictions = None
+        self.estimators_generator = "Trees"
+        self.n_stumps = n_stumps
+        self.max_depth = max_depth
+        self.self_complemented = self_complemeted
+        self.random_state = random_state
+
+    def fit(self, X, y, sample_weight=None):
+        begin = time.time()
+        pregen_X, pregen_y = self.pregen_voters(X, y, generator="Trees")
+        super(AdaboostPregenTree, self).fit(pregen_X, pregen_y, sample_weight=sample_weight)
+        end = time.time()
+        self.train_time = end - begin
+        self.train_shape = pregen_X.shape
+        self.base_predictions = np.array([change_label_to_zero(estim.predict(pregen_X)) for estim in self.estimators_])
+        self.metrics = np.array([self.plotted_metric.score(change_label_to_zero(pred), y) for pred in self.staged_predict(pregen_X)])
+        self.bounds = np.array([np.prod(np.sqrt(1-4*np.square(0.5-self.estimator_errors_[:i+1]))) for i in range(self.estimator_errors_.shape[0])])
+        return self
+
+    def canProbas(self):
+        """Used to know if the classifier can return label probabilities"""
+        return True
+
+    def predict(self, X):
+        begin = time.time()
+        pregen_X, _ = self.pregen_voters(X, generator="Trees")
+        pred = super(AdaboostPregenTree, self).predict(pregen_X)
+        end = time.time()
+        self.pred_time = end - begin
+        if pregen_X.shape != self.train_shape:
+            self.step_predictions = np.array([change_label_to_zero(step_pred) for step_pred in self.staged_predict(pregen_X)])
+        return change_label_to_zero(pred)
+
+
+    def getInterpret(self, directory, y_test):
+        interpretString = ""
+        interpretString += self.getFeatureImportance(directory)
+        interpretString += "\n\n Estimator error | Estimator weight\n"
+        interpretString += "\n".join([str(error) + " | " + str(weight/sum(self.estimator_weights_)) for error, weight in zip(self.estimator_errors_, self.estimator_weights_)])
+        step_test_metrics = np.array([self.plotted_metric.score(y_test, step_pred) for step_pred in self.step_predictions])
+        get_accuracy_graph(step_test_metrics, "AdaboostPregenTree", directory + "test_metrics.png",
+                           self.plotted_metric_name, set="test")
+        get_accuracy_graph(self.metrics, "AdaboostPregenTree", directory+"metrics.png", self.plotted_metric_name, bounds=list(self.bounds), bound_name="boosting bound")
+
np.savetxt(directory + "test_metrics.csv", step_test_metrics, delimiter=',') + np.savetxt(directory + "train_metrics.csv", self.metrics, delimiter=',') + np.savetxt(directory + "times.csv", np.array([self.train_time, self.pred_time]), delimiter=',') + return interpretString + +def formatCmdArgs(args): + """Used to format kwargs for the parsed args""" + kwargsDict = {'n_estimators': args.AdPT_n_est, + 'base_estimator': DecisionTreeClassifier(max_depth=1), + 'n_stumps':args.AdPT_trees, + "max_depth":args.AdPT_max_depth} + return kwargsDict + + +def paramsToSet(nIter, random_state): + """Used for weighted linear early fusion to generate random search sets""" + paramsSet = [] + for _ in range(nIter): + paramsSet.append({"n_estimators": random_state.randint(1, 500), + "base_estimator": None}) + return paramsSet diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CGDesc.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CGDesc.py index e22ea27a..76ba5dae 100644 --- a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CGDesc.py +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CGDesc.py @@ -1,6 +1,6 @@ from ..Monoview.MonoviewUtils import BaseMonoviewClassifier, CustomRandint from ..Monoview.Additions.BoostUtils import getInterpretBase -from ..Monoview.Additions.QarBoostUtils import ColumnGenerationClassifierQar +from ..Monoview.Additions.CGDescUtils import ColumnGenerationClassifierQar class CGDesc(ColumnGenerationClassifierQar, BaseMonoviewClassifier): @@ -14,7 +14,8 @@ class CGDesc(ColumnGenerationClassifierQar, BaseMonoviewClassifier): random_start=False, n_stumps=n_stumps, use_r=True, - c_bound_sol=True + c_bound_sol=True, + estimators_generator="Stumps" ) self.param_names = ["n_max_iterations", "n_stumps", "random_state"] self.distribs = [CustomRandint(low=2, high=1000), [n_stumps], diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CGDesc10.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CGDesc10.py new file mode 100644 index 00000000..04d61981 --- /dev/null +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CGDesc10.py @@ -0,0 +1,25 @@ +from ..Monoview.MonoviewUtils import BaseMonoviewClassifier, CustomRandint +from ..Monoview.Additions.BoostUtils import getInterpretBase +from ..Monoview.Additions.CGDescUtils import ColumnGenerationClassifierQar +from .CGDesc import CGDesc + +class CGDesc10(CGDesc): + + def __init__(self, random_state=None, n_max_iterations=500, n_stumps=1, **kwargs): + super(CGDesc10, self).__init__(n_max_iterations=n_max_iterations, + random_state=random_state, + n_stumps=10,) + +def formatCmdArgs(args): + """Used to format kwargs for the parsed args""" + kwargsDict = {"n_stumps":args.CGD_stumps, + "n_max_iterations":args.CGD_n_iter} + return kwargsDict + + +def paramsToSet(nIter, randomState): + """Used for weighted linear early fusion to generate random search sets""" + paramsSet = [] + for _ in range(nIter): + paramsSet.append({}) + return paramsSet \ No newline at end of file diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CGDescTree.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CGDescTree.py new file mode 100644 index 00000000..41344ede --- /dev/null +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CGDescTree.py @@ -0,0 +1,51 @@ +from ..Monoview.MonoviewUtils import BaseMonoviewClassifier, CustomRandint +from ..Monoview.Additions.BoostUtils import 
getInterpretBase +from ..Monoview.Additions.CGDescUtils import ColumnGenerationClassifierQar + + +class CGDescTree(ColumnGenerationClassifierQar, BaseMonoviewClassifier): + + def __init__(self, random_state=None, n_max_iterations=500, n_stumps=1, max_depth=2, **kwargs): + super(CGDescTree, self).__init__(n_max_iterations=n_max_iterations, + random_state=random_state, + self_complemented=True, + twice_the_same=True, + c_bound_choice=True, + random_start=False, + n_stumps=n_stumps, + use_r=True, + c_bound_sol=True, + estimators_generator="Trees" + ) + self.max_depth = max_depth + self.param_names = ["n_max_iterations", "n_stumps", "random_state", "max_depth"] + self.distribs = [CustomRandint(low=2, high=1000), [n_stumps], + [random_state], [max_depth]] + self.classed_params = [] + self.weird_strings = {} + + def canProbas(self): + """Used to know if the classifier can return label probabilities""" + return True + + def getInterpret(self, directory, y_test): + return self.getInterpretQar(directory, y_test) + + def get_name_for_fusion(self): + return "CGDT" + + +def formatCmdArgs(args): + """Used to format kwargs for the parsed args""" + kwargsDict = {"n_stumps":args.CGDT_trees, + "n_max_iterations":args.CGDT_n_iter, + "max_depth": args.CGDT_max_depth} + return kwargsDict + + +def paramsToSet(nIter, randomState): + """Used for weighted linear early fusion to generate random search sets""" + paramsSet = [] + for _ in range(nIter): + paramsSet.append({}) + return paramsSet \ No newline at end of file diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CGreed.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CGreed.py index e0e0916f..9e25e40b 100644 --- a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CGreed.py +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CGreed.py @@ -1,6 +1,6 @@ from ..Monoview.MonoviewUtils import BaseMonoviewClassifier, CustomRandint from ..Monoview.Additions.BoostUtils import getInterpretBase -from ..Monoview.Additions.QarBoostUtils import ColumnGenerationClassifierQar +from ..Monoview.Additions.CGDescUtils import ColumnGenerationClassifierQar class CGreed(ColumnGenerationClassifierQar, BaseMonoviewClassifier): @@ -14,7 +14,8 @@ class CGreed(ColumnGenerationClassifierQar, BaseMonoviewClassifier): random_start=False, n_stumps=n_stumps, use_r=True, - c_bound_sol=True + c_bound_sol=True, + estimators_generator="Stumps" ) self.param_names = ["n_max_iterations", "n_stumps", "random_state"] diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CQBoost.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CQBoost.py index f52385d1..7d58e82f 100644 --- a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CQBoost.py +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CQBoost.py @@ -11,7 +11,8 @@ class CQBoost(ColumnGenerationClassifier, BaseMonoviewClassifier): super(CQBoost, self).__init__( random_state=random_state, mu=mu, - epsilon=epsilon + epsilon=epsilon, + estimators_generator="Stumps" ) self.param_names = ["mu", "epsilon", "n_stumps", "random_state"] self.distribs = [CustomUniform(loc=0.5, state=1.0, multiplier="e-"), @@ -24,14 +25,6 @@ class CQBoost(ColumnGenerationClassifier, BaseMonoviewClassifier): else: self.nbCores = kwargs["nbCores"] - def fit(self, X, y): - if self.nbCores == 1: - pass - super(CQBoost, self).fit(X,y) - if self.nbCores == 1: - # os.environ['OMP_NUM_THREADS'] = num_threads - pass - def 
canProbas(self): """Used to know if the classifier can return label probabilities""" diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CQBoostTree.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CQBoostTree.py new file mode 100644 index 00000000..585e7d59 --- /dev/null +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/CQBoostTree.py @@ -0,0 +1,65 @@ +from ..Monoview.MonoviewUtils import CustomUniform, CustomRandint, BaseMonoviewClassifier +from ..Monoview.Additions.CQBoostUtils import ColumnGenerationClassifier +from ..Monoview.Additions.BoostUtils import getInterpretBase + +import numpy as np +import os + +class CQBoostTree(ColumnGenerationClassifier, BaseMonoviewClassifier): + + def __init__(self, random_state=None, mu=0.01, epsilon=1e-06, n_stumps=1, max_depth=2, **kwargs): + super(CQBoostTree, self).__init__( + random_state=random_state, + mu=mu, + epsilon=epsilon, + estimators_generator="Trees" + ) + self.param_names = ["mu", "epsilon", "n_stumps", "random_state", "max_depth"] + self.distribs = [CustomUniform(loc=0.5, state=1.0, multiplier="e-"), + CustomRandint(low=1, high=15, multiplier="e-"), [n_stumps], [random_state], [max_depth]] + self.classed_params = [] + self.weird_strings = {} + self.n_stumps = n_stumps + self.max_depth = max_depth + if "nbCores" not in kwargs: + self.nbCores = 1 + else: + self.nbCores = kwargs["nbCores"] + + def canProbas(self): + """Used to know if the classifier can return label probabilities""" + return True + + def getInterpret(self, directory, y_test): + np.savetxt(directory + "train_metrics.csv", self.train_metrics, delimiter=',') + np.savetxt(directory + "c_bounds.csv", self.c_bounds, + delimiter=',') + np.savetxt(directory + "y_test_step.csv", self.step_decisions, + delimiter=',') + step_metrics = [] + for step_index in range(self.step_decisions.shape[1] - 1): + step_metrics.append(self.plotted_metric.score(y_test, + self.step_decisions[:, + step_index])) + step_metrics = np.array(step_metrics) + np.savetxt(directory + "step_test_metrics.csv", step_metrics, + delimiter=',') + return getInterpretBase(self, directory, "CQBoost", self.weights_, y_test) + + +def formatCmdArgs(args): + """Used to format kwargs for the parsed args""" + kwargsDict = {"mu": args.CQBT_mu, + "epsilon": args.CQBT_epsilon, + "n_stumps":args.CQBT_trees, + "max_depth":args.CQBT_max_depth} + return kwargsDict + + +def paramsToSet(nIter, randomState): + """Used for weighted linear early fusion to generate random search sets""" + paramsSet = [] + for _ in range(nIter): + paramsSet.append({"mu": 10**-randomState.uniform(0.5, 1.5), + "epsilon": 10**-randomState.randint(1, 15)}) + return paramsSet diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/MinCQ.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/MinCQ.py index d8c6efe8..6f3e3f8f 100644 --- a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/MinCQ.py +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/MinCQ.py @@ -1,6 +1,6 @@ from ..Monoview.MonoviewUtils import CustomUniform, CustomRandint, BaseMonoviewClassifier from ..Monoview.Additions.BoostUtils import getInterpretBase -from ..Monoview.Additions.QarBoostUtils import ColumnGenerationClassifierQar +from ..Monoview.Additions.CGDescUtils import ColumnGenerationClassifierQar #### Algorithm code #### diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/MinCQGraalpy.py 
b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/MinCQGraalpy.py index 5cc965a2..f7f7fbfb 100644 --- a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/MinCQGraalpy.py +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/MinCQGraalpy.py @@ -1,302 +1,9 @@ -# -*- coding: utf-8 -*- -"""MinCq algorithm. - -Related papers: -[1] From PAC-Bayes Bounds to Quadratic Programs for Majority Votes (Laviolette et al., 2011) -[2] Risk Bounds for the Majority Vote: From a PAC-Bayesian Analysis to a Learning Algorithm (Germain et al., 2015) - -""" -from __future__ import print_function, division, absolute_import -import logging -from operator import xor - import numpy as np -from scipy.linalg import sqrtm -from scipy.spatial.distance import pdist, squareform -from sklearn.metrics.pairwise import rbf_kernel -from sklearn.utils.validation import check_X_y -from sklearn.ensemble import VotingClassifier -from sklearn.manifold import SpectralEmbedding -from sklearn.utils.graph import graph_laplacian -from sklearn.preprocessing import LabelEncoder - - -from ..Monoview.Additions.BoostUtils import ConvexProgram, StumpsClassifiersGenerator -from ..Monoview.MonoviewUtils import BaseMonoviewClassifier, CustomUniform, change_label_to_zero, change_label_to_minus -from ..Metrics import zero_one_loss - -# logger = logging.getLogger('MinCq') - -class MinCqClassifier(VotingClassifier): - """ - Base MinCq algorithm learner. See [1, 2]. - This version is an attempt of creating a more general version of MinCq, that handles multiclass classfication. - For binary classification, use RegularizedMinCqClassifer. - - Parameters - ---------- - mu : float - The fixed value of the first moment of the margin. - - """ - def __init__(self, estimators_generator=None, estimators=None, mu=0.001, omega=0.5, use_binary=False, zeta=0, gamma=1, n_neighbors=5): - if estimators is None: - estimators = [] - - super().__init__(estimators=estimators, voting='soft', flatten_transform=False) - self.estimators_generator = estimators_generator - self.mu = mu - self.omega = omega - self.use_binary = use_binary - self.zeta = zeta - self.gamma = gamma - self.n_neighbors = n_neighbors - - def fit(self, X, y): - """Fit the estimators and learn the weights. - - Parameters - ---------- - X : array-like, shape = [n_samples, n_features] - Training vectors, where n_samples is the number of samples and - n_features is the number of features. - y : array-like, shape = [n_samples] - Target values. If y is a masked-array (numpy.ma), the masked values are unlabeled examples. - - Returns - ------- - self : object - - """ - # Validations - assert 0 < self.mu <= 1, "MinCqClassifier: mu parameter must be in (0, 1]" - assert xor(bool(self.estimators_generator), bool(self.estimators)), "MinCqClassifier: exactly one of estimator_generator or estimators must be used." - X, y = check_X_y(X, change_label_to_minus(y)) - - # Fit the estimators using VotingClassifier's fit method. This will also fit a LabelEncoder that can be - # used to "normalize" labels (0, 1, 2, ...). In the case of binary classification, the two classes will be 0 and 1. - # First, ensure that the weights are reset to None (as cloning a VotingClassifier keeps the weights) - self.weights = None - # TODO: Ensure estimators can deal with masked arrays - - # If we use an estimator generator, use the data-dependant estimator generator to generate them, and fit again. 
- if self.estimators: - super().fit(X, y) - - else: - self.le_ = LabelEncoder() - self.le_.fit(y) - self.clean_me =True - - if isinstance(y, np.ma.MaskedArray): - transformed_y = np.ma.MaskedArray(self.le_.transform(y), y.mask) - else: - # transformed_y = self.le_.transform(y) - transformed_y = y - - self.estimators_generator.fit(X, transformed_y) - self.estimators = [('ds{}'.format(i), estimator) for i, estimator in enumerate(self.estimators_generator.estimators_)] - super().fit(X, y) - - # Preparation and resolution of the quadratic program - # logger.info("Preparing and solving QP...") - self.weights = self._solve(X, y) - if self.clean_me: - self.estimators = [] - # print(self.weights.shape) - # print(np.unique(self.weights)[0:10]) - # import pdb;pdb.set_trace() - self.train_cbound = 1 - (1.0/X.shape[0])*(np.sum(np.multiply(change_label_to_minus(y), np.average(self._binary_classification_matrix(X), axis=1, weights=self.weights)))**2)/(np.sum(np.average(self._binary_classification_matrix(X), axis=1, weights=self.weights)**2)) - return self - - def _binary_classification_matrix(self, X): - probas = self.transform(X) - predicted_labels = np.argmax(probas, axis=2) - predicted_labels[predicted_labels == 0] = -1 - values = np.max(probas, axis=2) - return (predicted_labels * values).T - - def _multiclass_classification_matrix(self, X, y): - probas = self.transform(X).swapaxes(0, 1) - matrix = probas[np.arange(probas.shape[0]), :, y] - - return (matrix - self.omega) - - def predict(self, X): - if not self.estimators: - self.estimators = [('ds{}'.format(i), estimator) for i, estimator in - enumerate(self.estimators_generator.estimators_)] - self.clean_me = True - pred = super().predict(X) - if self.clean_me: - self.estimators = [] - return change_label_to_zero(pred) - - def _solve(self, X, y): - y = self.le_.transform(y) - - if self.use_binary: - assert len(self.le_.classes_) == 2 - - # TODO: Review the number of labeled examples when adding back the support for transductive learning. - classification_matrix = self._binary_classification_matrix(X) - - # We use {-1, 1} labels. - binary_labels = np.copy(y) - binary_labels[y == 0] = -1 - - multi_matrix = binary_labels.reshape((len(binary_labels), 1)) * classification_matrix - - else: - multi_matrix = self._multiclass_classification_matrix(X, y) - - n_examples, n_voters = np.shape(multi_matrix) - ftf = 1.0 / n_examples * multi_matrix.T.dot(multi_matrix) - yf = np.mean(multi_matrix, axis=0) - # Objective function. - objective_matrix = 2 * ftf - objective_vector = None +from ..Monoview.Additions.MinCQUtils import RegularizedBinaryMinCqClassifier +from ..Monoview.Additions.BoostUtils import StumpsClassifiersGenerator +from ..Monoview.MonoviewUtils import BaseMonoviewClassifier, CustomUniform - # Equality constraints (first moment of the margin equal to mu, Q sums to one) - equality_matrix = np.vstack((yf.reshape((1, n_voters)), np.ones((1, n_voters)))) - equality_vector = np.array([self.mu, 1.0]) - - # Lower and upper bounds, no quasi-uniformity. - lower_bound = 0.0 - # TODO: In the case of binary classification, no upper bound will give - # bad results. Using 1/n works, as it brings back the l_infinity - # regularization normally given by the quasi-uniformity constraint. - # upper_bound = 2.0/n_voters - upper_bound = None - - weights = self._solve_qp(objective_matrix, objective_vector, equality_matrix, equality_vector, lower_bound, upper_bound) - - # Keep learning information for further use. 
- self.learner_info_ = {} - - # We count the number of non-zero weights, including the implicit voters. - # TODO: Verify how we define non-zero weights here, could be if the weight is near 1/2n. - n_nonzero_weights = np.sum(np.asarray(weights) > 1e-12) - n_nonzero_weights += np.sum(np.asarray(weights) < 1.0 / len(self.estimators_) - 1e-12) - self.learner_info_.update(n_nonzero_weights=n_nonzero_weights) - - return weights - - def _solve_qp(self, objective_matrix, objective_vector, equality_matrix, equality_vector, lower_bound, upper_bound): - try: - qp = ConvexProgram() - qp.quadratic_func, qp.linear_func = objective_matrix, objective_vector - qp.add_equality_constraints(equality_matrix, equality_vector) - qp.add_lower_bound(lower_bound) - qp.add_upper_bound(upper_bound) - return qp.solve() - - except Exception: - # logger.warning("Error while solving the quadratic program.") - raise - - -class RegularizedBinaryMinCqClassifier(MinCqClassifier): - """MinCq, version published in [1] and [2], where the regularization comes from the enforced quasi-uniformity - of the posterior distributino on the symmetric hypothesis space. This version only works with {-1, 1} labels. - - [1] From PAC-Bayes Bounds to Quadratic Programs for Majority Votes (Laviolette et al., 2011) - [2] Risk Bounds for the Majority Vote: From a PAC-Bayesian Analysis to a Learning Algorithm (Germain et al., 2015) - - """ - def fit(self, X, y): - import time - beg = time.time() - # We first fit and learn the weights. - super().fit(X, y) - - # Validations - if isinstance(y, np.ma.MaskedArray): - assert len(self.classes_[np.where(np.logical_not(self.classes_.mask))]) == 2, "RegularizedBinaryMinCqClassifier: only supports binary classification." - else: - assert len(self.classes_), "RegularizedBinaryMinCqClassifier: only supports binary classification." - - # Then we "reverse" the negative weights and their associated voter's output. - for i, weight in enumerate(self.weights): - if weight < 0: - # logger.debug("Reversing decision of a binary voter") - self.weights[i] *= -1 - self.estimators_[i].reverse_decision() - end=time.time() - self.train_time = end-beg - return self - - def _solve(self, X, y): - if isinstance(y, np.ma.MaskedArray): - y = np.ma.MaskedArray(self.le_.transform(y), y.mask) - else: - y = self.le_.transform(y) - - classification_matrix = self._binary_classification_matrix(X) - n_examples, n_voters = np.shape(classification_matrix) - - if self.zeta == 0: - np.transpose(classification_matrix) - ftf = np.dot(np.transpose(classification_matrix),classification_matrix) - else: - I = np.eye(n_examples) - L = build_laplacian(X, n_neighbors=self.n_neighbors) - ftf = classification_matrix.T.dot(I + (self.zeta / n_examples) * L).dot(classification_matrix) - - # We use {-1, 1} labels. - binary_labels = np.ma.copy(y) - binary_labels[np.ma.where(y == 0)] = -1 - - # Objective function. - ftf_mean = np.mean(ftf, axis=1) - objective_matrix = 2.0 / n_examples * ftf - objective_vector = -1.0 / n_examples * ftf_mean.T - - # Equality constraint: first moment of the margin fixed to mu, only using labeled examples. 
- if isinstance(y, np.ma.MaskedArray): - labeled = np.where(np.logical_not(y.mask))[0] - binary_labels = binary_labels[labeled] - else: - labeled = range(len(y)) - - yf = binary_labels.T.dot(classification_matrix[labeled]) - yf_mean = np.mean(yf) - equality_matrix = 2.0 / len(labeled) * yf - equality_vector = self.mu + 1.0 / len(labeled) * yf_mean - - # Lower and upper bounds (quasi-uniformity constraints) - lower_bound = 0.0 - upper_bound = 1.0 / n_voters - - try: - weights = self._solve_qp(objective_matrix, objective_vector, equality_matrix, equality_vector, lower_bound, upper_bound) - except ValueError as e: - if "domain error" in e.args: - weights = np.ones(len(self.estimators_)) - - - # Keep learning information for further use. - self.learner_info_ = {} - print(np.unique(weights)) - - # We count the number of non-zero weights, including the implicit voters. - # TODO: Verify how we define non-zero weights here, could be if the weight is near 1/2n. - n_nonzero_weights = np.sum(np.asarray(weights) > 1e-12) - n_nonzero_weights += np.sum(np.asarray(weights) < 1.0 / len(self.estimators_) - 1e-12) - self.learner_info_.update(n_nonzero_weights=n_nonzero_weights) - - # Conversion of the weights of the n first voters to weights on the implicit 2n voters. - # See Section 7.1 of [2] for an explanation. - # return np.array([2 * q - 1.0 / len(self.estimators_) for q in weights]) - return np.array(weights) - -def build_laplacian(X, n_neighbors=None): - clf = SpectralEmbedding(n_neighbors=n_neighbors) - clf.fit(X) - w = clf.affinity_matrix_ - laplacian = graph_laplacian(w, normed=True) - return laplacian class MinCQGraalpy(RegularizedBinaryMinCqClassifier, BaseMonoviewClassifier): @@ -355,28 +62,4 @@ def paramsToSet(nIter, randomState): paramsSet = [] for _ in range(nIter): paramsSet.append({}) - return paramsSet - - -# if __name__ == '__main__': -# # Example usage. -# from sklearn.datasets import load_iris -# from sklearn.cross_validation import train_test_split -# from graalpy.utils.majority_vote import StumpsClassifiersGenerator -# -# # Load data, change {0, 1, 2} labels to {-1, 1} -# iris = load_iris() -# iris.target[np.where(iris.target == 0)] = -1 -# iris.target[np.where(iris.target == 2)] = 1 -# x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=42) -# -# # Fit MinCq -# clf = RegularizedBinaryMinCqClassifier(estimators_generator=StumpsClassifiersGenerator()) -# clf.fit(x_train, y_train) -# -# # Compare the best score of individual classifiers versus the score of the learned majority vote. 
-# print("Best training risk of individual voters: {:.4f}".format(1 - max([e.score(x_train, y_train) for e in clf.estimators_]))) -# print("Training risk of the majority vote outputted by MinCq: {:.4f}".format(1 - clf.score(x_train, y_train))) -# print() -# print("Best testing risk of individual voters: {:.4f}".format(1 - max([e.score(x_test, y_test) for e in clf.estimators_]))) -# print("Testing risk of the majority vote outputted by MinCq: {:.4f}".format(1 - clf.score(x_test, y_test))) + return paramsSet \ No newline at end of file diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/MinCQGraalpyTree.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/MinCQGraalpyTree.py new file mode 100644 index 00000000..4376f73b --- /dev/null +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/MinCQGraalpyTree.py @@ -0,0 +1,71 @@ +import numpy as np + +from ..Monoview.Additions.MinCQUtils import RegularizedBinaryMinCqClassifier +from ..Monoview.Additions.BoostUtils import TreeClassifiersGenerator +from ..Monoview.MonoviewUtils import BaseMonoviewClassifier, CustomUniform + + + +class MinCQGraalpyTree(RegularizedBinaryMinCqClassifier, BaseMonoviewClassifier): + + def __init__(self, random_state=None, mu=0.01, self_complemented=True, n_stumps_per_attribute=1, max_depth=2, **kwargs): + super(MinCQGraalpyTree, self).__init__(mu=mu, + estimators_generator=TreeClassifiersGenerator(random_state=random_state, + n_trees=n_stumps_per_attribute, + max_depth=max_depth, + self_complemented=self_complemented), + ) + self.param_names = ["mu", "n_stumps_per_attribute", "random_state", "max_depth"] + self.distribs = [CustomUniform(loc=0.05, state=2.0, multiplier="e-"), + [n_stumps_per_attribute], [random_state], [max_depth]] + self.n_stumps_per_attribute = n_stumps_per_attribute + self.classed_params = [] + self.weird_strings = {} + self.max_depth = max_depth + self.random_state = random_state + if "nbCores" not in kwargs: + self.nbCores = 1 + else: + self.nbCores = kwargs["nbCores"] + + def canProbas(self): + """Used to know if the classifier can return label probabilities""" + return True + + def set_params(self, **params): + self.mu = params["mu"] + self.random_state = params["random_state"] + self.n_stumps_per_attribute = params["n_stumps_per_attribute"] + self.max_depth = params["max_depth"] + return self + + def get_params(self, deep=True): + return {"random_state":self.random_state, "mu":self.mu, "n_stumps_per_attribute":self.n_stumps_per_attribute, "max_depth":self.max_depth} + + def getInterpret(self, directory, y_test): + interpret_string = "Cbound on train :"+str(self.train_cbound) + np.savetxt(directory+"times.csv", np.array([self.train_time, 0])) + # interpret_string += "Train C_bound value : "+str(self.cbound_train) + # y_rework = np.copy(y_test) + # y_rework[np.where(y_rework==0)] = -1 + # interpret_string += "\n Test c_bound value : "+str(self.majority_vote.cbound_value(self.x_test, y_rework)) + return interpret_string + + def get_name_for_fusion(self): + return "MCG" + + +def formatCmdArgs(args): + """Used to format kwargs for the parsed args""" + kwargsDict = {"mu":args.MCGT_mu, + "n_stumps_per_attribute":args.MCGT_trees, + "max_depth":args.MCGT_max_depth} + return kwargsDict + + +def paramsToSet(nIter, randomState): + """Used for weighted linear early fusion to generate random search sets""" + paramsSet = [] + for _ in range(nIter): + paramsSet.append({}) + return paramsSet \ No newline at end of file diff --git 
a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/QarBoost.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/QarBoost.py index 51ac1180..039d237f 100644 --- a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/QarBoost.py +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/QarBoost.py @@ -1,6 +1,6 @@ from ..Monoview.MonoviewUtils import BaseMonoviewClassifier from ..Monoview.Additions.BoostUtils import getInterpretBase -from ..Monoview.Additions.QarBoostUtils import ColumnGenerationClassifierQar +from ..Monoview.Additions.CGDescUtils import ColumnGenerationClassifierQar class QarBoost(ColumnGenerationClassifierQar, BaseMonoviewClassifier): diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/QarBoostNC3.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/QarBoostNC3.py index d407d12e..63de1129 100644 --- a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/QarBoostNC3.py +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/QarBoostNC3.py @@ -1,6 +1,6 @@ from ..Monoview.MonoviewUtils import BaseMonoviewClassifier from ..Monoview.Additions.BoostUtils import getInterpretBase -from ..Monoview.Additions.QarBoostUtils import ColumnGenerationClassifierQar +from ..Monoview.Additions.CGDescUtils import ColumnGenerationClassifierQar class QarBoostNC3(ColumnGenerationClassifierQar, BaseMonoviewClassifier): diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/QarBoostv2.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/QarBoostv2.py index 1829f8c4..01cd5910 100644 --- a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/QarBoostv2.py +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/QarBoostv2.py @@ -1,6 +1,6 @@ from ..Monoview.MonoviewUtils import BaseMonoviewClassifier from ..Monoview.Additions.BoostUtils import getInterpretBase -from ..Monoview.Additions.QarBoostUtils import ColumnGenerationClassifierQar +from ..Monoview.Additions.CGDescUtils import ColumnGenerationClassifierQar class QarBoostv2(ColumnGenerationClassifierQar, BaseMonoviewClassifier): diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/QarBoostv3.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/QarBoostv3.py index 8b99a7a5..0b5b4182 100644 --- a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/QarBoostv3.py +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/QarBoostv3.py @@ -1,6 +1,6 @@ from ..Monoview.MonoviewUtils import BaseMonoviewClassifier from ..Monoview.Additions.BoostUtils import getInterpretBase -from ..Monoview.Additions.QarBoostUtils import ColumnGenerationClassifierQar +from ..Monoview.Additions.CGDescUtils import ColumnGenerationClassifierQar class QarBoostv3(ColumnGenerationClassifierQar, BaseMonoviewClassifier): diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/SCMPregen.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/SCMPregen.py index c6c6d196..f35c0d52 100644 --- a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/SCMPregen.py +++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/SCMPregen.py @@ -2,6 +2,7 @@ from sklearn.externals.six import iteritems from pyscm.scm import SetCoveringMachineClassifier as scm from sklearn.base import BaseEstimator, ClassifierMixin import numpy as np +import os from ..Monoview.MonoviewUtils import CustomRandint, 
diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/SCMPregen.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/SCMPregen.py
index c6c6d196..f35c0d52 100644
--- a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/SCMPregen.py
+++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/SCMPregen.py
@@ -2,6 +2,7 @@ from sklearn.externals.six import iteritems
 from pyscm.scm import SetCoveringMachineClassifier as scm
 from sklearn.base import BaseEstimator, ClassifierMixin
 import numpy as np
+import os
 
 from ..Monoview.MonoviewUtils import CustomRandint, CustomUniform, BaseMonoviewClassifier, change_label_to_minus, change_label_to_zero
 from ..Monoview.Additions.BoostUtils import StumpsClassifiersGenerator, BaseBoost
@@ -28,51 +29,28 @@ class SCMPregen(scm, BaseMonoviewClassifier, PregenClassifier):
         self.weird_strings = {}
         self.self_complemented = self_complemented
         self.n_stumps = n_stumps
-        self.estimators_generator = None
+        self.estimators_generator = "Stumps"
 
     def fit(self, X, y, tiebreaker=None, iteration_callback=None, **fit_params):
-        pregen_X, pregen_y = self.pregen_voters(X, y)
-        super(SCMPregen, self).fit(pregen_X, pregen_y)
+        pregen_X, _ = self.pregen_voters(X, y)
+        np.savetxt("pregen_x.csv", pregen_X, delimiter=',')
+        place_holder = np.genfromtxt("pregen_x.csv", delimiter=',')
+        os.remove("pregen_x.csv")
+        super(SCMPregen, self).fit(place_holder, y, tiebreaker=tiebreaker,
+                                   iteration_callback=iteration_callback,
+                                   **fit_params)
         return self
 
     def predict(self, X):
-        pregen_h, _ = self.pregen_voters(X)
-        from time import sleep;sleep(1)
-        return self.classes_[self.model_.predict(X)]
+        pregen_X, _ = self.pregen_voters(X)
+        np.savetxt("pregen_x.csv", pregen_X, delimiter=',')
+        place_holder = np.genfromtxt("pregen_x.csv", delimiter=',')
+        os.remove("pregen_x.csv")
+        return self.classes_[self.model_.predict(place_holder)]
 
     def get_params(self, deep=True):
         return {"p": self.p, "model_type": self.model_type,
                 "max_rules": self.max_rules,
                 "random_state": self.random_state, "n_stumps": self.n_stumps}
 
-    # def pregen_voters(self, X, y=None):
-    #     if y is not None:
-    #         if self.estimators_generator is None:
-    #             self.estimators_generator = StumpsClassifiersGenerator(
-    #                 n_stumps_per_attribute=self.n_stumps,
-    #                 self_complemented=self.self_complemented)
-    #         self.estimators_generator.fit(X, y)
-    #     else:
-    #         neg_y = None
-    #     classification_matrix = self._binary_classification_matrix_t(X)
-    #     return classification_matrix, y
-    #
-    # def _collect_probas_t(self, X):
-    #     print('jb')
-    #     for est in self.estimators_generator.estimators_:
-    #         print(type(est))
-    #         print(est.predict_proba_t(X))
-    #     print('ha')
-    #     return np.asarray([clf.predict_proba(X) for clf in self.estimators_generator.estimators_])
-    #
-    # def _binary_classification_matrix_t(self, X):
-    #     probas = self._collect_probas_t(X)
-    #     predicted_labels = np.argmax(probas, axis=2)
-    #     predicted_labels[predicted_labels == 0] = -1
-    #     values = np.max(probas, axis=2)
-    #     return (predicted_labels * values).T
-
-
     def canProbas(self):
         """Used to know if the classifier can return label probabilities"""
         return False
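
The save-to-CSV/reload round-trip in fit() and predict() above is easy to misread; the standalone sketch below shows what it actually produces, a plain, contiguous float64 ndarray, with the temporary file removed right away. The presumption that this sidesteps an input-type issue in pyscm, and the in-memory equivalent at the end, are assumptions, not something the patch states.

    import os
    import numpy as np

    pregen_X = np.random.RandomState(0).randint(0, 2, size=(10, 4))
    np.savetxt("pregen_x.csv", pregen_X, delimiter=',')
    place_holder = np.genfromtxt("pregen_x.csv", delimiter=',')
    os.remove("pregen_x.csv")
    assert place_holder.dtype == np.float64
    assert np.allclose(place_holder, pregen_X)
    # If dtype/layout is all the round-trip buys, this would do it in memory:
    same = np.ascontiguousarray(pregen_X, dtype=np.float64)

Note that the fixed file name also means two fits running concurrently in the same working directory would race on pregen_x.csv.
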
diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/SCMPregenTree.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/SCMPregenTree.py
new file mode 100644
index 00000000..ad735836
--- /dev/null
+++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/SCMPregenTree.py
@@ -0,0 +1,83 @@
+from sklearn.externals.six import iteritems
+from pyscm.scm import SetCoveringMachineClassifier as scm
+from sklearn.base import BaseEstimator, ClassifierMixin
+import numpy as np
+import os
+
+from ..Monoview.MonoviewUtils import CustomRandint, CustomUniform, BaseMonoviewClassifier, change_label_to_minus, change_label_to_zero
+from ..Monoview.Additions.BoostUtils import StumpsClassifiersGenerator, BaseBoost
+from ..Monoview.Additions.PregenUtils import PregenClassifier
+
+# Author-Info
+__author__ = "Baptiste Bauvin"
+__status__ = "Prototype"  # Production, Development, Prototype
+
+
+class SCMPregenTree(scm, BaseMonoviewClassifier, PregenClassifier):
+
+    def __init__(self, random_state=None, model_type="conjunction",
+                 max_rules=10, p=0.1, n_stumps=10, self_complemented=True,
+                 max_depth=2, **kwargs):
+        super(SCMPregenTree, self).__init__(
+            random_state=random_state,
+            model_type=model_type,
+            max_rules=max_rules,
+            p=p)
+        self.param_names = ["model_type", "max_rules", "p", "n_stumps",
+                            "random_state", "max_depth"]
+        self.distribs = [["conjunction", "disjunction"],
+                         CustomRandint(low=1, high=15),
+                         CustomUniform(loc=0, state=1), [n_stumps],
+                         [random_state], [max_depth]]
+        self.classed_params = []
+        self.weird_strings = {}
+        self.max_depth = max_depth
+        self.self_complemented = self_complemented
+        self.random_state = random_state
+        self.n_stumps = n_stumps
+        self.estimators_generator = "Stumps"
+
+    def fit(self, X, y, tiebreaker=None, iteration_callback=None, **fit_params):
+        pregen_X, _ = self.pregen_voters(X, y, generator="Trees")
+        np.savetxt("pregen_x.csv", pregen_X, delimiter=',')
+        place_holder = np.genfromtxt("pregen_x.csv", delimiter=',')
+        os.remove("pregen_x.csv")
+        super(SCMPregenTree, self).fit(place_holder, y, tiebreaker=tiebreaker,
+                                       iteration_callback=iteration_callback,
+                                       **fit_params)
+        return self
+
+    def predict(self, X):
+        pregen_X, _ = self.pregen_voters(X, generator="Trees")
+        np.savetxt("pregen_x.csv", pregen_X, delimiter=',')
+        place_holder = np.genfromtxt("pregen_x.csv", delimiter=',')
+        os.remove("pregen_x.csv")
+        return self.classes_[self.model_.predict(place_holder)]
+
+    def get_params(self, deep=True):
+        return {"p": self.p, "model_type": self.model_type,
+                "max_rules": self.max_rules,
+                "random_state": self.random_state, "n_stumps": self.n_stumps,
+                "max_depth": self.max_depth}
+
+    def canProbas(self):
+        """Used to know if the classifier can return label probabilities"""
+        return False
+
+    def getInterpret(self, directory, y_test):
+        interpretString = "Model used : " + str(self.model_)
+        return interpretString
+
+
+def formatCmdArgs(args):
+    """Used to format kwargs for the parsed args"""
+    kwargsDict = {"model_type": args.SCPT_model_type,
+                  "p": args.SCPT_p,
+                  "max_rules": args.SCPT_max_rules,
+                  "n_stumps": args.SCPT_trees,
+                  "max_depth": args.SCPT_max_depth}
+    return kwargsDict
+
+
+def paramsToSet(nIter, randomState):
+    paramsSet = []
+    for _ in range(nIter):
+        paramsSet.append(
+            {"model_type": randomState.choice(["conjunction", "disjunction"]),
+             "max_rules": randomState.randint(1, 15),
+             "p": randomState.random_sample()})
+    return paramsSet
+
+
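
A side note on the hand-rolled get_params() above: scikit-learn's clone() rebuilds an estimator from get_params() alone, so an __init__ argument missing from the returned dict (here, for instance, self_complemented) silently reverts to its default whenever the estimator is cloned. A toy demonstration:

    # Toy demonstration: clone() only preserves what get_params() reports.
    from sklearn.base import BaseEstimator, clone

    class Toy(BaseEstimator):
        def __init__(self, p=0.1, self_complemented=True):
            self.p = p
            self.self_complemented = self_complemented

        def get_params(self, deep=True):
            return {"p": self.p}  # self_complemented deliberately omitted

    toy = Toy(p=0.5, self_complemented=False)
    print(clone(toy).self_complemented)  # True: the non-default value was lost
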
diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/SCMSparsity.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/SCMSparsity.py
index 5a945453..31951785 100644
--- a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/SCMSparsity.py
+++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/SCMSparsity.py
@@ -3,51 +3,21 @@ from pyscm.scm import SetCoveringMachineClassifier as scm
 from sklearn.base import BaseEstimator, ClassifierMixin
 import numpy as np
 import time
+import os
 
 from ..Monoview.MonoviewUtils import CustomRandint, CustomUniform, BaseMonoviewClassifier
+from ..Monoview.Additions.PregenUtils import PregenClassifier
 from ..Metrics import zero_one_loss
 
 # Author-Info
 __author__ = "Baptiste Bauvin"
 __status__ = "Prototype"  # Production, Development, Prototype
 
-# class DecisionStumpSCMNew(scm, BaseEstimator, ClassifierMixin):
-#     """docstring for SCM
-#     A hands-on class of SCM using decision stumps, built in the sklearn format in order to use sklearn functions on SCM like
-#     CV, grid search, and so on ..."""
-#
-#     def __init__(self, model_type='conjunction', p=0.1, max_rules=10, random_state=42):
-#         super(DecisionStumpSCMNew, self).__init__(model_type=model_type, max_rules=max_rules, p=p, random_state=random_state)
-#         # self.model_type = model_type
-#         # self.p = p
-#         # self.max_rules = max_rules
-#         # self.random_state = random_state
-#         # self.clf = scm(model_type=self.model_type, max_rules=self.max_rules, p=self.p, random_state=self.random_state)
-#
-#     # def fit(self, X, y):
-#     #     print(self.clf.model_type)
-#     #     self.clf.fit(X=X, y=y)
-#     #
-#     # def predict(self, X):
-#     #     return self.clf.predict(X)
-#     #
-#     # def set_params(self, **params):
-#     #     for key, value in iteritems(params):
-#     #         if key == 'p':
-#     #             self.p = value
-#     #         if key == 'model_type':
-#     #             self.model_type = value
-#     #         if key == 'max_rules':
-#     #             self.max_rules = value
-#
-#     # def get_stats(self):
-#     #     return {"Binary_attributes": self.clf.model_.rules}
-
-class SCMSparsity(BaseMonoviewClassifier):
+class SCMSparsity(BaseMonoviewClassifier, PregenClassifier):
 
     def __init__(self, random_state=None, model_type="conjunction",
-                 max_rules=10, p=0.1, **kwargs):
+                 max_rules=10, p=0.1, n_stumps=1, self_complemented=True,
+                 **kwargs):
         self.scm_estimators = [scm(
             random_state=random_state,
             model_type=model_type,
@@ -55,30 +25,40 @@ class SCMSparsity(BaseMonoviewClassifier):
             p=p
         ) for max_rule in range(max_rules)]
         self.model_type = model_type
+        self.self_complemented = self_complemented
+        self.n_stumps = n_stumps
         self.p = p
         self.random_state = random_state
         self.max_rules = max_rules
-        self.param_names = ["model_type", "max_rules", "p", "random_state"]
+        self.param_names = ["model_type", "max_rules", "p", "random_state",
+                            "n_stumps"]
         self.distribs = [["conjunction", "disjunction"],
                          CustomRandint(low=1, high=15),
-                         CustomUniform(loc=0, state=1), [random_state]]
+                         CustomUniform(loc=0, state=1), [random_state],
+                         [n_stumps]]
         self.classed_params = []
         self.weird_strings = {}
 
     def get_params(self):
-        return {"model_type": self.model_type, "p": self.p, "max_rules": self.max_rules, "random_state": self.random_state}
+        return {"model_type": self.model_type, "p": self.p,
+                "max_rules": self.max_rules,
+                "random_state": self.random_state, "n_stumps": self.n_stumps}
 
     def fit(self, X, y, tiebreaker=None, iteration_callback=None, **fit_params):
+        pregen_X, _ = self.pregen_voters(X, y)
+        np.savetxt("pregen_x.csv", pregen_X, delimiter=',')
+        place_holder = np.genfromtxt("pregen_x.csv", delimiter=',')
+        os.remove("pregen_x.csv")
         for scm_estimator in self.scm_estimators:
             beg = time.time()
-            scm_estimator.fit(X, y, tiebreaker=None, iteration_callback=None, **fit_params)
+            scm_estimator.fit(place_holder, y, tiebreaker=None,
+                              iteration_callback=None, **fit_params)
             end = time.time()
         self.times = np.array([end - beg, 0])
-        self.train_metrics = [zero_one_loss.score(y, scm_estimator.predict(X)) for scm_estimator in self.scm_estimators]
+        self.train_metrics = [zero_one_loss.score(y, scm_estimator.predict(place_holder))
+                              for scm_estimator in self.scm_estimators]
         return self.scm_estimators[-1]
 
     def predict(self, X):
-        self.preds = [scm_estimator.predict(X) for scm_estimator in self.scm_estimators]
+        pregen_X, _ = self.pregen_voters(X)
+        np.savetxt("pregen_x.csv", pregen_X, delimiter=',')
+        place_holder = np.genfromtxt("pregen_x.csv", delimiter=',')
+        os.remove("pregen_x.csv")
+        self.preds = [scm_estimator.predict(place_holder)
+                      for scm_estimator in self.scm_estimators]
         return self.preds[-1]
 
     def canProbas(self):
@@ -97,7 +77,8 @@ def formatCmdArgs(args):
     """Used to format kwargs for the parsed args"""
     kwargsDict = {"model_type": args.SCS_model_type,
                   "p": args.SCS_p,
-                  "max_rules": args.SCS_max_rules}
+                  "max_rules": args.SCS_max_rules,
+                  "n_stumps": args.SCS_stumps}
     return kwargsDict
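
SCMSparsity's pattern, one SCM per rule budget so the sparsity/accuracy trade-off can be read off afterwards, can be reproduced in isolation. The sketch below assumes only the pyscm API already used elsewhere in this patch and binary 0/1 labels:

    # Isolated sketch of the one-model-per-rule-budget pattern of SCMSparsity.
    import numpy as np
    from pyscm.scm import SetCoveringMachineClassifier as scm
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=200, n_features=20, random_state=1)
    # One estimator per rule budget, mirroring SCMSparsity.__init__.
    estimators = [scm(model_type="conjunction", max_rules=k + 1, p=0.1,
                      random_state=1) for k in range(5)]
    train_errors = []
    for est in estimators:
        est.fit(X, y)
        train_errors.append(np.mean(est.predict(X) != y))
    # Training error typically shrinks as the rule budget grows.
    print(train_errors)
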
diff --git a/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/SCMSparsityTree.py b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/SCMSparsityTree.py
new file mode 100644
index 00000000..4e717c38
--- /dev/null
+++ b/multiview_platform/MonoMultiViewClassifiers/MonoviewClassifiers/SCMSparsityTree.py
@@ -0,0 +1,92 @@
+from sklearn.externals.six import iteritems
+from pyscm.scm import SetCoveringMachineClassifier as scm
+from sklearn.base import BaseEstimator, ClassifierMixin
+import numpy as np
+import time
+import os
+
+from ..Monoview.MonoviewUtils import CustomRandint, CustomUniform, BaseMonoviewClassifier
+from ..Monoview.Additions.PregenUtils import PregenClassifier
+from ..Metrics import zero_one_loss
+
+# Author-Info
+__author__ = "Baptiste Bauvin"
+__status__ = "Prototype"  # Production, Development, Prototype
+
+
+class SCMSparsityTree(BaseMonoviewClassifier, PregenClassifier):
+
+    def __init__(self, random_state=None, model_type="conjunction",
+                 max_rules=10, p=0.1, n_stumps=1, max_depth=2, **kwargs):
+        self.scm_estimators = [scm(
+            random_state=random_state,
+            model_type=model_type,
+            max_rules=max_rule + 1,
+            p=p
+        ) for max_rule in range(max_rules)]
+        self.model_type = model_type
+        self.max_depth = max_depth
+        self.p = p
+        self.n_stumps = n_stumps
+        self.random_state = random_state
+        self.max_rules = max_rules
+        self.param_names = ["model_type", "max_rules", "p", "random_state",
+                            "max_depth"]
+        self.distribs = [["conjunction", "disjunction"],
+                         CustomRandint(low=1, high=15),
+                         CustomUniform(loc=0, state=1), [random_state],
+                         [max_depth]]
+        self.classed_params = []
+        self.weird_strings = {}
+
+    def get_params(self):
+        return {"model_type": self.model_type, "p": self.p,
+                "max_rules": self.max_rules,
+                "random_state": self.random_state,
+                "max_depth": self.max_depth, "n_stumps": self.n_stumps}
+
+    def fit(self, X, y, tiebreaker=None, iteration_callback=None, **fit_params):
+        pregen_X, _ = self.pregen_voters(X, y, generator="Trees")
+        np.savetxt("pregen_x.csv", pregen_X, delimiter=',')
+        place_holder = np.genfromtxt("pregen_x.csv", delimiter=',')
+        os.remove("pregen_x.csv")
+        for scm_estimator in self.scm_estimators:
+            beg = time.time()
+            scm_estimator.fit(place_holder, y, tiebreaker=None,
+                              iteration_callback=None, **fit_params)
+            end = time.time()
+        self.times = np.array([end - beg, 0])
+        self.train_metrics = [zero_one_loss.score(y, scm_estimator.predict(place_holder))
+                              for scm_estimator in self.scm_estimators]
+        return self.scm_estimators[-1]
+
+    def predict(self, X):
+        pregen_X, _ = self.pregen_voters(X, generator="Trees")
+        np.savetxt("pregen_x.csv", pregen_X, delimiter=',')
+        place_holder = np.genfromtxt("pregen_x.csv", delimiter=',')
+        os.remove("pregen_x.csv")
+        self.preds = [scm_estimator.predict(place_holder)
+                      for scm_estimator in self.scm_estimators]
+        return self.preds[-1]
+
+    def canProbas(self):
+        """Used to know if the classifier can return label probabilities"""
+        return True
+
+    def getInterpret(self, directory, y_test):
+        interpretString = ""
+        np.savetxt(directory + "test_metrics.csv",
+                   np.array([zero_one_loss.score(y_test, pred)
+                             for pred in self.preds]))
+        np.savetxt(directory + "times.csv", self.times)
+        np.savetxt(directory + "train_metrics.csv", self.train_metrics)
+        return interpretString
+
+
+def formatCmdArgs(args):
+    """Used to format kwargs for the parsed args"""
+    kwargsDict = {"model_type": args.SCST_model_type,
+                  "p": args.SCST_p,
+                  "max_rules": args.SCST_max_rules,
+                  "n_stumps": args.SCST_trees,
+                  "max_depth": args.SCST_max_depth}
+    return kwargsDict
+
+
+def paramsToSet(nIter, randomState):
+    paramsSet = []
+    for _ in range(nIter):
+        paramsSet.append(
+            {"model_type": randomState.choice(["conjunction", "disjunction"]),
+             "max_rules": randomState.randint(1, 15),
+             "p": randomState.random_sample()})
+    return paramsSet
diff --git a/multiview_platform/MonoMultiViewClassifiers/utils/GetMultiviewDb.py b/multiview_platform/MonoMultiViewClassifiers/utils/GetMultiviewDb.py
index dafb99d0..6484589a 100644
--- a/multiview_platform/MonoMultiViewClassifiers/utils/GetMultiviewDb.py
+++ b/multiview_platform/MonoMultiViewClassifiers/utils/GetMultiviewDb.py
@@ -7,6 +7,8 @@ import h5py
 import operator
 import errno
 import csv
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.utils.validation import check_array
 
 # Author-Info
 __author__ = "Baptiste Bauvin"
@@ -22,6 +24,31 @@ def copyHDF5(pathF, name, nbCores):
         datasetFile.copy("/" + dataset, newDataSet["/"])
     newDataSet.close()
 
+class TanhNormalizer(BaseEstimator, TransformerMixin):
+    """Normalize data using a tanh function. This is the normalizer used in
+    the so-called "Never-ending paper". It remains here for reproducibility
+    purposes, but you should use scikit-learn normalizers instead!
+    """
+
+    def __init__(self):
+        self.mean = None
+        self.std = None
+
+    def fit(self, X, y=None, **fit_params):
+        X = check_array(X)
+        self.mean = X.mean(0)
+        self.mean.shape = (1, len(self.mean))
+        self.std = X.std(0)
+        self.std[self.std == 0] = 1
+        self.std.shape = (1, len(self.std))
+        return self
+
+    def transform(self, X):
+        return np.tanh((X - self.mean) / self.std)
+
+    def fit_transform(self, X, y=None, **fit_params):
+        self.fit(X, **fit_params)
+        return self.transform(X)
+
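
To make the TanhNormalizer above concrete: it standardizes each feature and squashes the result through tanh, so every value lands strictly inside (-1, 1). A bare-numpy rendition of the same transform:

    # Numeric illustration of TanhNormalizer: tanh((X - mean) / std).
    import numpy as np

    X = np.array([[0., 10.], [1., 20.], [2., 30.]])
    mean = X.mean(0, keepdims=True)
    std = X.std(0, keepdims=True)
    std[std == 0] = 1  # same guard against constant features as in fit()
    X_tanh = np.tanh((X - mean) / std)
    print(X_tanh)  # every entry lies strictly inside (-1, 1)
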
diff --git a/multiview_platform/MonoMultiViewClassifiers/utils/execution.py b/multiview_platform/MonoMultiViewClassifiers/utils/execution.py
index 3dce0a8f..da803b97 100644
--- a/multiview_platform/MonoMultiViewClassifiers/utils/execution.py
+++ b/multiview_platform/MonoMultiViewClassifiers/utils/execution.py
@@ -137,6 +137,8 @@ def parseTheArgs(arguments):
                                  help='Number of stumps in the pregenerated dataset',
                                  default=1)
 
+
+
     groupAdaboostGraalpy = parser.add_argument_group('AdaboostGraalpy arguments')
     groupAdaboostGraalpy.add_argument('--AdG_n_iter', metavar='INT', type=int,
                                       action='store',
@@ -199,6 +201,9 @@ def parseTheArgs(arguments):
     groupSCMSparsity.add_argument('--SCS_max_rules', metavar='INT', type=int,
                                   action='store',
                                   help='Max number of rules for SCM', default=1)
+    groupSCMSparsity.add_argument('--SCS_stumps', metavar='INT', type=int,
+                                  action='store',
+                                  help='Number of stumps', default=1)
     groupSCMSparsity.add_argument('--SCS_p', metavar='FLOAT', type=float,
                                   action='store',
                                   help='Set the p parameter for SCM', default=1.0)
@@ -217,6 +222,8 @@ def parseTheArgs(arguments):
                               help='Set the number of stumps for CQBoost',
                               default=1)
 
+
+
     groupCQBoostv2 = parser.add_argument_group('CQBoostv2 arguments')
     groupCQBoostv2.add_argument('--CQB2_mu', metavar='FLOAT', type=float, action='store',
                                 help='Set the mu parameter for CQBoostv2', default=0.002)
@@ -251,6 +258,117 @@ def parseTheArgs(arguments):
                              help='Set the n_max_iterations parameter for CGreed',
                              default=100)
 
+    groupCGDescTree = parser.add_argument_group('CGDescTree arguments')
+    groupCGDescTree.add_argument('--CGDT_trees', metavar='INT', type=int,
+                                 action='store',
+                                 help='Set the number of trees for CGDescTree',
+                                 default=100)
+    groupCGDescTree.add_argument('--CGDT_n_iter', metavar='INT', type=int,
+                                 action='store',
+                                 help='Set the n_max_iterations parameter for CGDescTree',
+                                 default=100)
+    groupCGDescTree.add_argument('--CGDT_max_depth', metavar='INT', type=int,
+                                 action='store',
+                                 help='Set the max depth of the trees for CGDescTree',
+                                 default=2)
+
+    groupMinCQGraalpyTree = parser.add_argument_group(
+        'MinCQGraalpyTree arguments')
+    groupMinCQGraalpyTree.add_argument('--MCGT_mu', metavar='FLOAT', type=float,
+                                       action='store',
+                                       help='Set the mu parameter for MinCQGraalpyTree',
+                                       default=0.05)
+    groupMinCQGraalpyTree.add_argument('--MCGT_trees', metavar='INT', type=int,
+                                       action='store',
+                                       help='Set the number of trees per attribute for MinCQGraalpyTree',
+                                       default=100)
+    groupMinCQGraalpyTree.add_argument('--MCGT_max_depth', metavar='INT',
+                                       type=int,
+                                       action='store',
+                                       help='Set the max depth of the trees for MinCQGraalpyTree',
+                                       default=2)
+
+    groupCQBoostTree = parser.add_argument_group('CQBoostTree arguments')
+    groupCQBoostTree.add_argument('--CQBT_mu', metavar='FLOAT', type=float,
+                                  action='store',
+                                  help='Set the mu parameter for CQBoostTree',
+                                  default=0.001)
+    groupCQBoostTree.add_argument('--CQBT_epsilon', metavar='FLOAT', type=float,
+                                  action='store',
+                                  help='Set the epsilon parameter for CQBoostTree',
+                                  default=1e-06)
+    groupCQBoostTree.add_argument('--CQBT_trees', metavar='INT', type=int,
+                                  action='store',
+                                  help='Set the number of trees for CQBoostTree',
+                                  default=100)
+    groupCQBoostTree.add_argument('--CQBT_max_depth', metavar='INT', type=int,
+                                  action='store',
+                                  help='Set the max depth of the trees for CQBoostTree',
+                                  default=2)
+    groupSCMPregenTree = parser.add_argument_group('SCMPregenTree arguments')
+    groupSCMPregenTree.add_argument('--SCPT_max_rules', metavar='INT', type=int,
+                                    action='store',
+                                    help='Max number of rules for SCM', default=1)
+    groupSCMPregenTree.add_argument('--SCPT_p', metavar='FLOAT', type=float,
+                                    action='store',
+                                    help='Set the p parameter for SCM', default=1.0)
+    groupSCMPregenTree.add_argument('--SCPT_model_type', metavar='STRING',
+                                    action='store',
+                                    help='Set the model type for SCM (conjunction or disjunction)',
+                                    default="conjunction")
+    groupSCMPregenTree.add_argument('--SCPT_trees', metavar='INT', type=int,
+                                    action='store',
+                                    help='Number of trees per attribute',
+                                    default=100)
+    groupSCMPregenTree.add_argument('--SCPT_max_depth', metavar='INT', type=int,
+                                    action='store',
+                                    help='Max depth of the trees',
+                                    default=1)
+
+    groupSCMSparsityTree = parser.add_argument_group('SCMSparsityTree arguments')
+    groupSCMSparsityTree.add_argument('--SCST_max_rules', metavar='INT', type=int,
+                                      action='store',
+                                      help='Max number of rules for SCM',
+                                      default=1)
+    groupSCMSparsityTree.add_argument('--SCST_p', metavar='FLOAT', type=float,
+                                      action='store',
+                                      help='Set the p parameter for SCM',
+                                      default=1.0)
+    groupSCMSparsityTree.add_argument('--SCST_model_type', metavar='STRING',
+                                      action='store',
+                                      help='Set the model type for SCM (conjunction or disjunction)',
+                                      default="conjunction")
+    groupSCMSparsityTree.add_argument('--SCST_trees', metavar='INT', type=int,
+                                      action='store',
+                                      help='Number of trees per attribute',
+                                      default=100)
+    groupSCMSparsityTree.add_argument('--SCST_max_depth', metavar='INT', type=int,
+                                      action='store',
+                                      help='Max depth of the trees',
+                                      default=1)
+
+    groupAdaboostPregenTree = parser.add_argument_group(
+        'AdaboostPregenTree arguments')
+    groupAdaboostPregenTree.add_argument('--AdPT_n_est', metavar='INT',
+                                         type=int,
+                                         action='store',
+                                         help='Number of estimators',
+                                         default=100)
+    groupAdaboostPregenTree.add_argument('--AdPT_b_est', metavar='STRING',
+                                         action='store',
+                                         help='Base estimator',
+                                         default='DecisionTreeClassifier')
+    groupAdaboostPregenTree.add_argument('--AdPT_trees', metavar='INT',
+                                         type=int,
+                                         action='store',
+                                         help='Number of trees in the pregenerated dataset',
+                                         default=100)
+    groupAdaboostPregenTree.add_argument('--AdPT_max_depth', metavar='INT',
+                                         type=int,
+                                         action='store',
+                                         help='Max depth of the trees in the pregenerated dataset',
+                                         default=2)
+
     groupLasso = parser.add_argument_group('Lasso arguments')
     groupLasso.add_argument('--LA_n_iter', metavar='INT', type=int,
                             action='store',
@@ -287,6 +405,8 @@ def parseTheArgs(arguments):
                                  help='Set the n_stumps_per_attribute parameter for MinCQGraalpy',
                                  default=1)
 
+
+
     groupQarBoostv3 = parser.add_argument_group('QarBoostv3 arguments')
     groupQarBoostv3.add_argument('--QarB3_mu', metavar='FLOAT', type=float,
                                  action='store',
                                  help='Set the mu parameter for QarBoostv3',
                                  default=0.001)
-- 
GitLab
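
The argument-group pattern used throughout parseTheArgs pairs each classifier with a prefixed flag namespace that the matching formatCmdArgs() later unpacks. A self-contained sketch with one illustrative group (flag names as in the patch):

    # Self-contained sketch of the per-classifier argument-group pattern.
    import argparse

    parser = argparse.ArgumentParser()
    group = parser.add_argument_group('SCMSparsityTree arguments')
    group.add_argument('--SCST_max_rules', metavar='INT', type=int,
                       action='store', help='Max number of rules for SCM',
                       default=1)
    group.add_argument('--SCST_max_depth', metavar='INT', type=int,
                       action='store', help='Max depth of the trees', default=1)

    args = parser.parse_args(['--SCST_max_rules', '5'])
    # formatCmdArgs-style unpacking of the prefixed namespace:
    kwargs = {"max_rules": args.SCST_max_rules, "max_depth": args.SCST_max_depth}
    print(kwargs)  # {'max_rules': 5, 'max_depth': 1}
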